/src/openssl/crypto/evp/enc_b64_avx2.c
Line | Count | Source |
1 | | #include <openssl/evp.h> |
2 | | #include "enc_b64_scalar.h" |
3 | | #include "enc_b64_avx2.h" |
4 | | #include "internal/cryptlib.h" |
5 | | #include "crypto/evp.h" |
6 | | #include "evp_local.h" |
7 | | |
8 | | #if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) |
9 | | #if !defined(_M_ARM64EC) |
10 | | #if defined(HAVE_AVX2_INTRINSICS) |
11 | | #define STRINGIFY_IMPLEMENTATION_(a) #a |
12 | | #define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) |
13 | | |
14 | | #ifdef __clang__ |
15 | | /* |
16 | | * Clang does not support GCC's push_options/pop_options pragmas.
17 | | * Note: "clang attribute push" cannot be used within a namespace in Clang
18 | | * versions prior to 8.0, so OPENSSL_TARGET_REGION and
19 | | * OPENSSL_UNTARGET_REGION must be used outside of a namespace.
20 | | */ |
21 | | #define OPENSSL_TARGET_REGION(T) \ |
22 | | _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \ |
23 | | apply_to = function))) |
24 | | #define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop") |
25 | | #elif defined(__GNUC__) |
26 | | #define OPENSSL_TARGET_REGION(T) \ |
27 | | _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) |
28 | | #define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options") |
29 | | #endif /* clang then gcc */ |
30 | | |
31 | | /* Default target region macros don't do anything. */ |
32 | | #ifndef OPENSSL_TARGET_REGION |
33 | | #define OPENSSL_TARGET_REGION(T) |
34 | | #define OPENSSL_UNTARGET_REGION |
35 | | #endif |
36 | | |
37 | | #define OPENSSL_TARGET_AVX2 \ |
38 | | OPENSSL_TARGET_REGION("avx2") |
39 | | #define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION |
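| | /*
| |  * For example, with GCC the pair expands to
| |  * _Pragma("GCC push_options") _Pragma("GCC target(\"avx2\")") and
| |  * _Pragma("GCC pop_options"), scoping the AVX2 target to the functions
| |  * between them.
| |  */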
40 | | |
41 | | /* |
42 | | * Ensure this whole block is compiled with AVX2 enabled on GCC and Clang.
43 | | * Compilers with neither mechanism (e.g. MSVC) get the no-op defaults.
44 | | */ |
45 | | |
46 | | #include <string.h> |
47 | | #include <immintrin.h> |
48 | | #include <stddef.h> |
49 | | #include <stdint.h> |
50 | | |
51 | | OPENSSL_TARGET_AVX2 |
52 | | static __m256i lookup_pshufb_std(__m256i input) |
53 | 0 | { |
54 | 0 | __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); |
55 | 0 | const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); |
56 | |
57 | 0 | result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); |
58 | 0 | __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
59 | 0 | '0' - 52, '0' - 52, |
60 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, |
61 | 0 | '/' - 63, 'A', 0, 0, |
62 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
63 | 0 | '0' - 52, '0' - 52, |
64 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, |
65 | 0 | '/' - 63, 'A', 0, 0); |
66 | |
67 | 0 | result = _mm256_shuffle_epi8(shift_LUT, result); |
68 | 0 | return _mm256_add_epi8(result, input); |
69 | 0 | } |
70 | | OPENSSL_UNTARGET_AVX2 |
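| | /*
| |  * Reference only: a scalar sketch of the mapping above (illustrative
| |  * helper, not part of the build). For each 6-bit value v the vector code
| |  * derives a LUT index: saturating v - 51 (0 for 'a'..'z', 1..10 for the
| |  * digits, 11 for '+', 12 for '/'), overridden to 13 when v < 26, then
| |  * adds shift_LUT[index] to v to obtain the ASCII character.
| |  */
| | static unsigned char b64_std_scalar_sketch(unsigned char v)
| | {
| |     static const signed char offsets[14] = {
| |         'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
| |         '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
| |         '/' - 63, 'A'
| |     };
| |     unsigned char idx = v < 26 ? 13 : (v < 52 ? 0 : v - 51);
| |
| |     return (unsigned char)(v + offsets[idx]);
| | }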
71 | | |
72 | | OPENSSL_TARGET_AVX2 |
73 | | static ossl_inline __m256i lookup_pshufb_srp(__m256i input) |
74 | 0 | { |
75 | 0 | const __m256i zero = _mm256_setzero_si256(); |
76 | 0 | const __m256i hi = _mm256_set1_epi8((char)0x80); |
77 | 0 | __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input), |
78 | 0 | _mm256_cmpgt_epi8(input, |
79 | 0 | _mm256_set1_epi8(63))); |
80 | 0 | __m256i idx = _mm256_setzero_si256(); |
81 | |
82 | 0 | idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9))); |
83 | 0 | idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35))); |
84 | 0 | idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3), |
85 | 0 | _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62))); |
86 | 0 | idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4), |
87 | 0 | _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63))); |
88 | | |
89 | | /* Zero the shift for invalid lanes via PSHUFB's high-bit mechanism, so out-of-range bytes pass through unshifted */
90 | 0 | idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi)); |
91 | |
92 | 0 | const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, |
93 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, |
94 | 0 | '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, |
95 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0); |
96 | |
97 | 0 | __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx); |
98 | 0 | __m256i ascii = _mm256_add_epi8(shift, input); |
99 | 0 | return ascii; |
100 | 0 | } |
101 | | OPENSSL_UNTARGET_AVX2 |
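| | /*
| |  * Reference only: a scalar sketch of the SRP-alphabet mapping above
| |  * (illustrative helper, not part of the build). The alphabet is
| |  * '0'-'9', 'A'-'Z', 'a'-'z', '.', '/'; an out-of-range byte receives a
| |  * zero shift and passes through unchanged.
| |  */
| | static unsigned char b64_srp_scalar_sketch(unsigned char v)
| | {
| |     if (v > 63)
| |         return v;  /* invalid lane: PSHUFB yields a zero shift */
| |     if (v <= 9)
| |         return (unsigned char)('0' + v);
| |     if (v <= 35)
| |         return (unsigned char)('A' + v - 10);
| |     if (v <= 61)
| |         return (unsigned char)('a' + v - 36);
| |     return v == 62 ? '.' : '/';
| | }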
102 | | |
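| | /*
| |  * The byte-shift helpers below dispatch a runtime count through a switch
| |  * because _mm256_srli_si256/_mm256_slli_si256 require a compile-time
| |  * immediate. Note that both intrinsics shift each 128-bit lane
| |  * independently; the callers patch up bytes displaced across the lane
| |  * boundary separately.
| |  */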
103 | | OPENSSL_TARGET_AVX2 |
104 | | static ossl_inline __m256i shift_right_zeros(__m256i v, int n) |
105 | 0 | { |
106 | 0 | switch (n) { |
107 | 0 | case 0: |
108 | 0 | return v; |
109 | 0 | case 1: |
110 | 0 | return _mm256_srli_si256(v, 1); |
111 | 0 | case 2: |
112 | 0 | return _mm256_srli_si256(v, 2); |
113 | 0 | case 3: |
114 | 0 | return _mm256_srli_si256(v, 3); |
115 | 0 | case 4: |
116 | 0 | return _mm256_srli_si256(v, 4); |
117 | 0 | case 5: |
118 | 0 | return _mm256_srli_si256(v, 5); |
119 | 0 | case 6: |
120 | 0 | return _mm256_srli_si256(v, 6); |
121 | 0 | case 7: |
122 | 0 | return _mm256_srli_si256(v, 7); |
123 | 0 | case 8: |
124 | 0 | return _mm256_srli_si256(v, 8); |
125 | 0 | case 9: |
126 | 0 | return _mm256_srli_si256(v, 9); |
127 | 0 | case 10: |
128 | 0 | return _mm256_srli_si256(v, 10); |
129 | 0 | case 11: |
130 | 0 | return _mm256_srli_si256(v, 11); |
131 | 0 | case 12: |
132 | 0 | return _mm256_srli_si256(v, 12); |
133 | 0 | case 13: |
134 | 0 | return _mm256_srli_si256(v, 13); |
135 | 0 | case 14: |
136 | 0 | return _mm256_srli_si256(v, 14); |
137 | 0 | case 15: |
138 | 0 | return _mm256_srli_si256(v, 15); |
139 | 0 | default: |
140 | 0 | return _mm256_setzero_si256(); |
141 | 0 | } |
142 | 0 | } |
143 | | OPENSSL_UNTARGET_AVX2 |
144 | | |
145 | | OPENSSL_TARGET_AVX2 |
146 | | static ossl_inline __m256i shift_left_zeros(__m256i v, int n) |
147 | 0 | { |
148 | 0 | switch (n) { |
149 | 0 | case 0: |
150 | 0 | return v; |
151 | 0 | case 1: |
152 | 0 | return _mm256_slli_si256(v, 1); |
153 | 0 | case 2: |
154 | 0 | return _mm256_slli_si256(v, 2); |
155 | 0 | case 3: |
156 | 0 | return _mm256_slli_si256(v, 3); |
157 | 0 | case 4: |
158 | 0 | return _mm256_slli_si256(v, 4); |
159 | 0 | case 5: |
160 | 0 | return _mm256_slli_si256(v, 5); |
161 | 0 | case 6: |
162 | 0 | return _mm256_slli_si256(v, 6); |
163 | 0 | case 7: |
164 | 0 | return _mm256_slli_si256(v, 7); |
165 | 0 | case 8: |
166 | 0 | return _mm256_slli_si256(v, 8); |
167 | 0 | case 9: |
168 | 0 | return _mm256_slli_si256(v, 9); |
169 | 0 | case 10: |
170 | 0 | return _mm256_slli_si256(v, 10); |
171 | 0 | case 11: |
172 | 0 | return _mm256_slli_si256(v, 11); |
173 | 0 | case 12: |
174 | 0 | return _mm256_slli_si256(v, 12); |
175 | 0 | case 13: |
176 | 0 | return _mm256_slli_si256(v, 13); |
177 | 0 | case 14: |
178 | 0 | return _mm256_slli_si256(v, 14); |
179 | 0 | case 15: |
180 | 0 | return _mm256_slli_si256(v, 15); |
181 | 0 | case 16: |
182 | 0 | return _mm256_setzero_si256(); |
183 | 0 | default: |
184 | 0 | return _mm256_setzero_si256(); |
185 | 0 | } |
186 | 0 | } |
187 | | OPENSSL_UNTARGET_AVX2 |
188 | | |
189 | | static const uint8_t shuffle_masks[16][16] = { |
190 | | { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
191 | | { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
192 | | { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
193 | | { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
194 | | { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
195 | | { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
196 | | { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
197 | | { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 }, |
198 | | { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 }, |
199 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 }, |
200 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 }, |
201 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 }, |
202 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 }, |
203 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 }, |
204 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 }, |
205 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 } |
206 | | }; |
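| | /*
| |  * Mask k keeps bytes 0..k-1 in place, emits index 0x80 at position k
| |  * (PSHUFB writes zero for any index with the high bit set) and shifts
| |  * the remaining bytes up by one; insert_line_feed32() then blends '\n'
| |  * into the zeroed slot.
| |  */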
207 | | |
208 | | /** |
209 | | * Insert a line feed character into the 32-byte input at index K in [0,32).
210 | | */ |
211 | | OPENSSL_TARGET_AVX2 |
212 | | static ossl_inline __m256i insert_line_feed32(__m256i input, int K) |
213 | 0 | { |
214 | 0 | __m256i line_feed_vector = _mm256_set1_epi8('\n'); |
215 | 0 | __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
216 | |
217 | 0 | if (K >= 16) { |
218 | 0 | __m128i maskhi = _mm_loadu_si128((const __m128i *)shuffle_masks[K - 16]);
219 | 0 | __m256i mask = _mm256_set_m128i(maskhi, identity); |
220 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
221 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
222 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
223 | |
224 | 0 | return result; |
225 | 0 | } |
226 | | /* Shift input right by 1 byte */ |
227 | 0 | __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21), |
228 | 0 | 15); |
229 | 0 | input = _mm256_blend_epi32(input, shift, 0xF0); |
230 | 0 | __m128i masklo = _mm_loadu_si128((const __m128i *)shuffle_masks[K]);
231 | 0 | __m256i mask = _mm256_set_m128i(identity, masklo); |
232 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
233 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
234 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
235 | 0 | return result; |
236 | 0 | } |
237 | | OPENSSL_UNTARGET_AVX2 |
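| | /*
| |  * Example: K = 3 leaves bytes 0..2 in place, writes '\n' at offset 3 and
| |  * moves former bytes 3..30 to offsets 4..31; the displaced byte 31 is
| |  * re-emitted by the caller (see ins_nl_gt32() below).
| |  */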
238 | | |
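| | /*
| |  * Store one 32-byte vector for line lengths (stride) of 32 or more,
| |  * inserting at most one '\n' where the current line fills up. Returns
| |  * the number of bytes written: 32 if no break falls inside this block,
| |  * 33 otherwise.
| |  */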
239 | | OPENSSL_TARGET_AVX2 |
240 | | static ossl_inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride, |
241 | | int *wrap_cnt) |
242 | 0 | { |
243 | 0 | const int until_nl = stride - *wrap_cnt; |
244 | |
245 | 0 | if (until_nl > 32) { |
246 | 0 | _mm256_storeu_si256((__m256i *)out, v); |
247 | |
248 | 0 | *wrap_cnt += 32; |
249 | 0 | return 32; |
250 | 0 | } |
251 | | |
252 | 0 | if (until_nl == 32) { |
253 | 0 | _mm256_storeu_si256((__m256i *)out, v); |
254 | |
255 | 0 | out[32] = '\n'; |
256 | 0 | *wrap_cnt = 0; |
257 | 0 | return 33; |
258 | 0 | } |
259 | | |
260 | 0 | const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31); |
261 | 0 | const __m256i with_lf = insert_line_feed32(v, until_nl); |
262 | 0 | _mm256_storeu_si256((__m256i *)out, with_lf); |
263 | 0 | out[32] = last; |
264 | |
265 | 0 | *wrap_cnt = 32 - until_nl; |
266 | 0 | return 33; |
267 | 0 | } |
268 | | OPENSSL_UNTARGET_AVX2 |
269 | | |
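| | /*
| |  * Store one 32-byte vector for line lengths below 32 (the stride 16..31
| |  * path, also reused by the stride-12 dispatch): one or two breaks can
| |  * fall inside the block. surplus_0/surplus_1 record whether a break
| |  * lands in the low/high 16-byte lane; the displaced tail bytes are then
| |  * patched in with scalar stores.
| |  */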
270 | | OPENSSL_TARGET_AVX2 |
271 | | static ossl_inline size_t insert_nl_gt16(const __m256i v0, |
272 | | uint8_t *output, |
273 | | int wrap_max, int *wrap_cnt) |
274 | 0 | { |
275 | 0 | uint8_t *out = output; |
276 | 0 | int wrap_rem = wrap_max - *wrap_cnt; |
277 | 0 | _mm256_storeu_si256((__m256i *)(output), v0); |
278 | |
279 | 0 | if (wrap_rem > 32) { |
280 | 0 | *wrap_cnt += 32; |
281 | 0 | return 32; |
282 | 0 | } |
283 | | |
284 | 0 | __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF); |
285 | |
286 | 0 | __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, |
287 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, |
288 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
289 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
290 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
291 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF); |
292 | |
293 | 0 | __m256i blended_0L = v0; |
294 | 0 | int surplus_0 = wrap_rem < 16 ? 1 : 0; |
295 | 0 | if (surplus_0 == 1) { |
296 | 0 | __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem), |
297 | 0 | wrap_rem + surplus_0); |
298 | 0 | __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0); |
299 | 0 | __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane); |
300 | 0 | __m256i shifted_1_L = shift_left_zeros(v0, 1); |
301 | 0 | __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask); |
302 | |
303 | 0 | blended_0L = _mm256_blendv_epi8(v0, shifted, mask); |
304 | 0 | _mm256_storeu_si256((__m256i *)(output), blended_0L); |
305 | 0 | wrap_rem += wrap_max; |
306 | 0 | } |
307 | |
308 | 0 | int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0; |
309 | 0 | int last_of_1L = _mm256_extract_epi8(v0, 31); |
310 | |
311 | 0 | if (surplus_1 == 1) { |
312 | 0 | uint8_t sec_last_of_1L = (uint8_t)_mm256_extract_epi8(v0, 30);
313 | 0 | int wrap_rem_1 = wrap_rem - 16; |
314 | 0 | __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1), |
315 | 0 | wrap_rem_1 + surplus_0 + surplus_1); |
316 | 0 | __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1); |
317 | 0 | __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L); |
318 | 0 | __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask); |
319 | 0 | _mm256_storeu_si256((__m256i *)(output), blended_1L); |
320 | |
321 | 0 | output[wrap_rem + surplus_0] = '\n'; |
322 | 0 | output[31 + surplus_0] = (uint8_t)sec_last_of_1L; |
323 | 0 | output[31 + surplus_0 + surplus_1] = last_of_1L; |
324 | 0 | } |
325 | |
326 | 0 | if (surplus_0 == 1) { |
327 | 0 | output[wrap_rem - wrap_max] = '\n'; |
328 | 0 | output[16] = _mm256_extract_epi8(v0, 15); |
329 | 0 | output[31 + surplus_0 + surplus_1] = last_of_1L; |
330 | 0 | } |
331 | |
332 | 0 | *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem; |
333 | |
334 | 0 | int nl_at_end = 0; |
335 | 0 | if (*wrap_cnt == wrap_max || *wrap_cnt == 0) { |
336 | 0 | *wrap_cnt = 0; |
337 | 0 | output[32 + surplus_0 + surplus_1] = '\n'; |
338 | 0 | nl_at_end = 1; |
339 | 0 | } |
340 | |
341 | 0 | out += 32 + surplus_0 + surplus_1 + nl_at_end; |
342 | 0 | size_t written = (size_t)(out - output); |
343 | |
344 | 0 | return written; |
345 | 0 | } |
346 | | OPENSSL_UNTARGET_AVX2 |
347 | | |
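| | /*
| |  * Specialized layout for stride 12, used for the 32-char block in which
| |  * three line breaks fall: they land at output offsets 4, 17 and 30,
| |  * 35 bytes are written and *wrap_cnt is left at 4. The dummy_stride
| |  * parameter only keeps the signature uniform with insert_nl_gt16().
| |  */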
348 | | OPENSSL_TARGET_AVX2 |
349 | | static ossl_inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0, |
350 | | uint8_t *output, |
351 | | int dummy_stride, |
352 | | int *wrap_cnt) |
353 | 0 | { |
354 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, |
355 | 0 | (char)0xFF, |
356 | 0 | (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF, |
357 | 0 | 12); |
358 | 0 | __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask); |
359 | |
360 | 0 | _mm256_storeu_si256((__m256i *)(output + 0), shuffled); |
361 | |
362 | 0 | int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7); |
363 | 0 | int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29); |
364 | 0 | int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15); |
365 | |
366 | 0 | uint8_t *out = output; |
367 | 0 | out[4] = '\n'; |
368 | 0 | memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext)); |
369 | 0 | out[16 + 1] = '\n'; |
370 | 0 | memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1)); |
371 | 0 | out[16 + 14] = '\n'; |
372 | 0 | memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2)); |
373 | |
374 | 0 | out += 32 + 3; |
375 | 0 | *wrap_cnt = 4; |
376 | |
377 | 0 | size_t written = (out - output); |
378 | 0 | return written; |
379 | 0 | } |
380 | | OPENSSL_UNTARGET_AVX2 |
381 | | |
382 | | OPENSSL_TARGET_AVX2 |
383 | | static ossl_inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask) |
384 | 0 | { |
385 | 0 | __m256i newline = _mm256_set1_epi8('\n'); |
386 | |
387 | 0 | return _mm256_or_si256(_mm256_and_si256(mask, newline), |
388 | 0 | _mm256_andnot_si256(mask, data)); |
389 | 0 | } |
390 | | OPENSSL_UNTARGET_AVX2 |
391 | | |
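| | /*
| |  * Stride 4: a line break after every fourth character, so one 32-char
| |  * vector expands to 40 output bytes.
| |  */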
392 | | OPENSSL_TARGET_AVX2 |
393 | | static ossl_inline size_t insert_nl_str4(const __m256i v0, uint8_t *output) |
394 | 0 | { |
395 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, |
396 | 0 | 7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12, |
397 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3, |
398 | 0 | (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9); |
399 | 0 | __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
400 | 0 | 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
401 | 0 | 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
402 | 0 | 0, 0); |
403 | 0 | __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); |
404 | 0 | __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes); |
405 | |
406 | 0 | _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl); |
407 | | |
408 | | /* Handle the bytes displaced across the 128-bit lane boundary */
409 | | /* The shift counts are macros because _mm256_srli_si256 requires an 8-bit compile-time immediate */
410 | 0 | #define B_LANE 16 /* Bytes per lane */ |
411 | 0 | #define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */ |
412 | 0 | #define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */ |
413 | | |
414 | | /* Bytes that were shifted out of lane 0 */ |
415 | 0 | __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L); |
416 | | |
417 | | /* Bytes that were shifted out of lane 1 */ |
418 | 0 | __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L), |
419 | 0 | B_LANE - N_RET_1_L), |
420 | 0 | B_LANE - 2); |
421 | | |
422 | | /* isolate the bytes that were shifted out of lane 1 */ |
423 | 0 | __m256i rem_2_L_P2 = _mm256_slli_si256( |
424 | 0 | _mm256_srli_si256(v0, |
425 | 0 | B_LANE - N_RET_2_L + N_RET_1_L), |
426 | 0 | N_RET_1_L); |
427 | |
428 | 0 | __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2); |
429 | |
430 | 0 | int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0); |
431 | 0 | int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2); |
432 | |
433 | 0 | uint8_t *out = output + 16; |
434 | 0 | memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext)); |
435 | 0 | out += 3; |
436 | 0 | *out++ = '\n'; |
437 | |
438 | 0 | out = output + 32; |
439 | 0 | memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext)); |
440 | 0 | out += 2; |
441 | 0 | *out++ = '\n'; |
442 | 0 | out += 4; |
443 | 0 | *out++ = '\n'; |
444 | |
445 | 0 | size_t written = (out - output); |
446 | 0 | return written; |
447 | 0 | } |
448 | | OPENSSL_UNTARGET_AVX2 |
449 | | |
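| | /*
| |  * Stride 8: a line break after every eighth character; one 32-char
| |  * vector expands to 36 output bytes, with breaks at offsets 8, 17, 26
| |  * and 35.
| |  */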
450 | | OPENSSL_TARGET_AVX2 |
451 | | static ossl_inline size_t insert_nl_str8(const __m256i v0, uint8_t *output) |
452 | 0 | { |
453 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF, |
454 | 0 | 8, 9, 10, 11, 12, 13, 14, |
455 | 0 | (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, |
456 | 0 | 7, (char)0xFF, 8, 9, 10, 11, 12); |
457 | 0 | __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); |
458 | 0 | _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes); |
459 | 0 | int8_t rem_1_L = _mm256_extract_epi8(v0, 15); |
460 | 0 | int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29); |
461 | 0 | int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15); |
462 | 0 | uint8_t *out = output; |
463 | |
464 | 0 | memcpy(out + 16, &rem_1_L, sizeof(rem_1_L)); |
465 | 0 | memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1)); |
466 | 0 | memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2)); |
467 | |
468 | 0 | output[8] = '\n'; |
469 | 0 | output[17] = '\n'; |
470 | 0 | output[26] = '\n'; |
471 | 0 | output[35] = '\n'; |
472 | |
473 | 0 | out += 32 + 4; |
474 | |
475 | 0 | size_t written = (out - output); |
476 | 0 | return written; |
477 | 0 | } |
478 | | OPENSSL_UNTARGET_AVX2 |
479 | | |
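| | /*
| |  * Encode whole 96-byte input blocks with AVX2, inserting line breaks
| |  * every `stride` output characters (stride = ctx_length / 3 * 4, or 0
| |  * for no wrapping) in either the standard or the SRP alphabet. The
| |  * remaining tail is handed to the scalar evp_encodeblock_int(); the
| |  * return value is the total number of bytes written to dst.
| |  */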
480 | | OPENSSL_TARGET_AVX2 |
481 | | size_t encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst, |
482 | | const unsigned char *src, int srclen, int ctx_length, |
483 | | int *final_wrap_cnt) |
484 | 0 | { |
485 | 0 | const uint8_t *input = (const uint8_t *)src; |
486 | 0 | uint8_t *out = (uint8_t *)dst; |
487 | 0 | int i = 0; |
488 | 0 | int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4; |
489 | 0 | int wrap_cnt = 0; |
490 | 0 | const int use_srp = (ctx != NULL |
491 | 0 | && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0); |
492 | 0 | const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, |
493 | 0 | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); |
494 | 0 | int base = 0; |
495 | | |
496 | | /* Process 96 input bytes (128 output characters) per iteration; the bound is i + 100 because the last 16-byte load starts at offset i + 84 */
497 | 0 | for (; i + 100 <= srclen; i += 96) { |
498 | 0 | _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0); |
499 | | /* |
500 | | * Interleaved for each vector: load, shuffle, bit-split, lookup |
501 | | * before starting the next, giving the OoO engine independent work chains |
502 | | * across execution ports. |
503 | | */ |
504 | 0 | const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0)); |
505 | 0 | const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1)); |
506 | 0 | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); |
507 | 0 | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); |
508 | 0 | const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); |
509 | 0 | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); |
510 | 0 | const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); |
511 | 0 | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); |
512 | |
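| | /*
| |  * Bit layout of the split above (the same applies to input1..input3):
| |  * after the shuffle each 32-bit word holds three source bytes as
| |  *   in = [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
| |  * where aaaaaa..dddddd are the four 6-bit output fields. Then
| |  *   t0 = [0000cccc|cc000000|aaaaaa00|00000000]  and   0x0fc0fc00
| |  *   t1 = [00000000|00cccccc|00000000|00aaaaaa]  mulhi 0x04000040
| |  *   t2 = [00000000|00dddddd|000000bb|bbbb0000]  and   0x003f03f0
| |  *   t3 = [00dddddd|00000000|00bbbbbb|00000000]  mullo 0x01000010
| |  * so t1 | t3 leaves one 6-bit index in each byte.
| |  */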
513 | 0 | const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2)); |
514 | 0 | const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3)); |
515 | 0 | __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); |
516 | 0 | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); |
517 | 0 | const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); |
518 | 0 | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); |
519 | 0 | const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); |
520 | 0 | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); |
521 | |
522 | 0 | const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4)); |
523 | 0 | const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5)); |
524 | 0 | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); |
525 | 0 | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); |
526 | 0 | const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); |
527 | 0 | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); |
528 | 0 | const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); |
529 | 0 | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); |
530 | |
531 | 0 | const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6)); |
532 | 0 | const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7)); |
533 | 0 | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); |
534 | 0 | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); |
535 | 0 | const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); |
536 | 0 | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); |
537 | 0 | const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); |
538 | 0 | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); |
539 | |
540 | 0 | __m256i vec0; |
541 | 0 | __m256i vec1; |
542 | 0 | __m256i vec2; |
543 | 0 | __m256i vec3; |
544 | |
545 | 0 | if (use_srp) { |
546 | 0 | vec0 = lookup_pshufb_srp(input0); |
547 | 0 | vec1 = lookup_pshufb_srp(input1); |
548 | 0 | vec2 = lookup_pshufb_srp(input2); |
549 | 0 | vec3 = lookup_pshufb_srp(input3); |
550 | |
551 | 0 | } else { |
552 | 0 | vec0 = lookup_pshufb_std(input0); |
553 | 0 | vec1 = lookup_pshufb_std(input1); |
554 | 0 | vec2 = lookup_pshufb_std(input2); |
555 | 0 | vec3 = lookup_pshufb_std(input3); |
556 | 0 | } |
557 | |
558 | 0 | if (stride == 0) { |
559 | 0 | _mm256_storeu_si256((__m256i *)out, vec0); |
560 | |
561 | 0 | out += 32; |
562 | 0 | _mm256_storeu_si256((__m256i *)out, vec1); |
563 | |
564 | 0 | out += 32; |
565 | 0 | _mm256_storeu_si256((__m256i *)out, vec2); |
566 | |
567 | 0 | out += 32; |
568 | 0 | _mm256_storeu_si256((__m256i *)out, vec3); |
569 | |
570 | 0 | out += 32; |
571 | 0 | } else if (stride == 64) { |
572 | 0 | _mm256_storeu_si256((__m256i *)out, vec0); |
573 | |
574 | 0 | out += 32; |
575 | 0 | _mm256_storeu_si256((__m256i *)out, vec1); |
576 | |
577 | 0 | out += 32; |
578 | 0 | *(out++) = '\n'; |
579 | |
580 | 0 | _mm256_storeu_si256((__m256i *)out, vec2); |
581 | 0 | out += 32; |
582 | |
583 | 0 | _mm256_storeu_si256((__m256i *)out, vec3); |
584 | 0 | out += 32; |
585 | |
586 | 0 | *(out++) = '\n'; |
587 | 0 | } else if (stride == 4) { |
588 | 0 | int out_idx = 0; |
589 | |
590 | 0 | out_idx += (int)insert_nl_str4(vec0, out + out_idx); |
591 | 0 | out_idx += (int)insert_nl_str4(vec1, out + out_idx); |
592 | 0 | out_idx += (int)insert_nl_str4(vec2, out + out_idx); |
593 | 0 | out_idx += (int)insert_nl_str4(vec3, out + out_idx); |
594 | |
595 | 0 | out += out_idx; |
596 | 0 | } else if (stride == 8) { |
597 | |
598 | 0 | out += insert_nl_str8(vec0, out); |
599 | 0 | out += insert_nl_str8(vec1, out); |
600 | 0 | out += insert_nl_str8(vec2, out); |
601 | 0 | out += insert_nl_str8(vec3, out); |
602 | |
603 | 0 | } else if (stride == 12) { |
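| | /*
| |  * With stride 12 the break pattern repeats every 96 output characters
| |  * (three 32-char vectors), while each iteration emits four vectors, so
| |  * the phase advances by one vector per iteration; `base` tracks that
| |  * phase and routes vectors containing three interior breaks to the
| |  * specialized stride-12 layout.
| |  */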
604 | 0 | switch (base) { |
605 | 0 | case 0: |
606 | |
607 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
608 | 0 | out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt); |
609 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
610 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
611 | 0 | break; |
612 | 0 | case 1: |
613 | 0 | out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt); |
614 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
615 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
616 | 0 | out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt); |
617 | 0 | break; |
618 | 0 | default: /* base == 2 */ |
619 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
620 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
621 | 0 | out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt); |
622 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
623 | 0 | break; |
624 | 0 | } |
625 | | |
626 | 0 | if (++base == 3) |
627 | 0 | base = 0; |
628 | 0 | } else if (stride >= 32) { |
629 | 0 | out += ins_nl_gt32(vec0, out, stride, &wrap_cnt); |
630 | 0 | out += ins_nl_gt32(vec1, out, stride, &wrap_cnt); |
631 | 0 | out += ins_nl_gt32(vec2, out, stride, &wrap_cnt); |
632 | 0 | out += ins_nl_gt32(vec3, out, stride, &wrap_cnt); |
633 | 0 | } else if (stride >= 16) { |
634 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
635 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
636 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
637 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
638 | 0 | } |
639 | 0 | } |
640 | | |
641 | 0 | if (stride == 0) { |
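| | /* Tail for the unwrapped case: 24 input bytes (32 output characters) per iteration; the bound is i + 28 because the second 16-byte load starts at offset i + 12 */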
642 | 0 | for (; i + 28 <= srclen; i += 24) { |
643 | | /* lo = [xxxx|DDDC|CCBB|BAAA] */ |
644 | | /* hi = [xxxx|HHHG|GGFF|FEEE] */ |
645 | 0 | const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i)); |
646 | 0 | const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3)); |
647 | | /* |
648 | | * bytes from groups A, B and C are needed in separate 32-bit lanes |
649 | | * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
650 | | */ |
651 | 0 | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); |
652 | 0 | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); |
653 | 0 | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); |
654 | 0 | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); |
655 | 0 | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); |
656 | 0 | const __m256i indices = _mm256_or_si256(t1, t3); |
657 | 0 | _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices)); |
658 | |
659 | 0 | out += 32; |
660 | 0 | } |
661 | 0 | } |
662 | 0 | if (stride >= 32 && wrap_cnt == stride) {
663 | 0 | wrap_cnt = 0;
664 | 0 | *out++ = '\n';
665 | 0 | }
666 | |
667 | 0 | *final_wrap_cnt = wrap_cnt;
668 | |
669 | 0 | return (size_t)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt); |
670 | 0 | } |
671 | | OPENSSL_UNTARGET_AVX2 |
672 | | #endif /* defined(HAVE_AVX2_INTRINSICS) */ |
673 | | #endif /* !defined(_M_ARM64EC) */ |
674 | | #endif /* x86_64 */