/src/openssl/crypto/evp/enc_b64_avx2.c
Line | Count | Source |
1 | | #include <openssl/evp.h> |
2 | | #include "enc_b64_scalar.h" |
3 | | #include "enc_b64_avx2.h" |
4 | | #include "internal/cryptlib.h" |
5 | | #include "crypto/evp.h" |
6 | | #include "evp_local.h" |
7 | | |
#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#if !defined(_M_ARM64EC)
/* Two-level stringify so macro arguments are expanded before being quoted. */
#define STRINGIFY_IMPLEMENTATION_(a) #a
#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)

#ifdef __clang__
/*
 * clang does not have GCC push pop.
 * warning: clang attribute push can't be used within a namespace in clang up
 * til 8.0 so OPENSSL_TARGET_REGION and OPENSSL_UNTARGET_REGION must be
 * outside* of a namespace.
 */
#define OPENSSL_TARGET_REGION(T) \
    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
                                           apply_to = function)))
#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
#define OPENSSL_TARGET_REGION(T) \
    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options")
#endif /* clang then gcc */

/*
 * Default target region macros don't do anything (e.g. MSVC, where /arch
 * controls instruction selection instead of per-function attributes).
 */
#ifndef OPENSSL_TARGET_REGION
#define OPENSSL_TARGET_REGION(T)
#define OPENSSL_UNTARGET_REGION
#endif

/* Convenience wrappers: compile the bracketed functions with AVX2 enabled. */
#define OPENSSL_TARGET_AVX2 \
    OPENSSL_TARGET_REGION("avx2")
#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION

#include <string.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
49 | | |
50 | | OPENSSL_TARGET_AVX2 |
/*
 * Translate 32 six-bit values (each byte in [0,63]) into the standard
 * base64 alphabet 'A'-'Z', 'a'-'z', '0'-'9', '+', '/'.
 *
 * Technique: reduce each index to a small class id, look up a per-class
 * ASCII offset with PSHUFB, and add the offset to the index itself.
 * Class ids produced below:
 *   0..25  -> 13 ('A' offset), 26..51 -> 0 ('a'-26), 52..61 -> 1..10
 *   ('0'-52), 62 -> 11 ('+'-62), 63 -> 12 ('/'-63).
 */
static __m256i lookup_pshufb_std(__m256i input)
{
    /* Saturating subtract: 0 for indices <= 51, 1..12 for 52..63. */
    __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
    /* 0xFF in lanes where input < 26, i.e. the 'A'-'Z' class. */
    const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);

    /* Force class 13 for indices 0..25; other classes are unaffected. */
    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
    /* Per-class ASCII offsets, repeated for both 128-bit lanes of PSHUFB. */
    __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                                         '0' - 52, '0' - 52,
                                         '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
                                         '/' - 63, 'A', 0, 0,
                                         'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                                         '0' - 52, '0' - 52,
                                         '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
                                         '/' - 63, 'A', 0, 0);

    /* Fetch each lane's offset and add it to the 6-bit index. */
    result = _mm256_shuffle_epi8(shift_LUT, result);
    return _mm256_add_epi8(result, input);
}
69 | | OPENSSL_UNTARGET_AVX2 |
70 | | |
71 | | OPENSSL_TARGET_AVX2 |
/*
 * Translate 32 six-bit values into the SRP base64 alphabet:
 *   0..9 -> '0'-'9', 10..35 -> 'A'-'Z', 36..61 -> 'a'-'z',
 *   62 -> '.', 63 -> '/'.
 *
 * Same offset-LUT technique as lookup_pshufb_std, with the class id built
 * by counting range thresholds instead of a saturating subtract.
 */
static inline __m256i lookup_pshufb_srp(__m256i input)
{
    const __m256i zero = _mm256_setzero_si256();
    const __m256i hi = _mm256_set1_epi8((char)0x80);
    /* 0xFF where the byte is outside [0,63] (treated as signed). */
    __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input),
                                      _mm256_cmpgt_epi8(input,
                                                        _mm256_set1_epi8(63)));
    __m256i idx = _mm256_setzero_si256();

    /* cmpgt yields -1 (all ones) when true, so subtracting it increments. */
    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9)));
    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3),
                             _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4),
                             _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63)));

    /*
     * Zero-out invalid lanes via PSHUFB's high-bit mechanism: a set high
     * bit in the index makes PSHUFB emit 0, so invalid lanes get a zero
     * offset and the input byte passes through unchanged.
     */
    idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi));

    /* Per-class ASCII offsets, repeated for both 128-bit lanes. */
    const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
                                               0, 0, 0, 0, 0, 0, 0, 0, 0,
                                               '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
                                               0, 0, 0, 0, 0, 0, 0, 0, 0);

    __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx);
    __m256i ascii = _mm256_add_epi8(shift, input);
    return ascii;
}
100 | | OPENSSL_UNTARGET_AVX2 |
101 | | |
102 | | OPENSSL_TARGET_AVX2 |
/*
 * Byte-shift right by a runtime count n, shifting in zeros.
 *
 * NOTE: _mm256_srli_si256 shifts each 128-bit lane INDEPENDENTLY, so this
 * is a per-lane shift, not a full 256-bit shift; callers rely on that
 * lane-local behavior. The intrinsic requires a compile-time immediate,
 * hence the switch over every legal count. n outside [0,15] yields zero.
 */
static inline __m256i shift_right_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_srli_si256(v, 1);
    case 2:
        return _mm256_srli_si256(v, 2);
    case 3:
        return _mm256_srli_si256(v, 3);
    case 4:
        return _mm256_srli_si256(v, 4);
    case 5:
        return _mm256_srli_si256(v, 5);
    case 6:
        return _mm256_srli_si256(v, 6);
    case 7:
        return _mm256_srli_si256(v, 7);
    case 8:
        return _mm256_srli_si256(v, 8);
    case 9:
        return _mm256_srli_si256(v, 9);
    case 10:
        return _mm256_srli_si256(v, 10);
    case 11:
        return _mm256_srli_si256(v, 11);
    case 12:
        return _mm256_srli_si256(v, 12);
    case 13:
        return _mm256_srli_si256(v, 13);
    case 14:
        return _mm256_srli_si256(v, 14);
    case 15:
        return _mm256_srli_si256(v, 15);
    default:
        return _mm256_setzero_si256();
    }
}
142 | | OPENSSL_UNTARGET_AVX2 |
143 | | |
144 | | OPENSSL_TARGET_AVX2 |
/*
 * Byte-shift left by a runtime count n, shifting in zeros.
 *
 * NOTE: as with shift_right_zeros, _mm256_slli_si256 shifts each 128-bit
 * lane independently; bytes do not cross the lane boundary. The switch
 * exists because the intrinsic needs an immediate shift count. Any n
 * outside [0,15] yields an all-zero vector.
 */
static inline __m256i shift_left_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_slli_si256(v, 1);
    case 2:
        return _mm256_slli_si256(v, 2);
    case 3:
        return _mm256_slli_si256(v, 3);
    case 4:
        return _mm256_slli_si256(v, 4);
    case 5:
        return _mm256_slli_si256(v, 5);
    case 6:
        return _mm256_slli_si256(v, 6);
    case 7:
        return _mm256_slli_si256(v, 7);
    case 8:
        return _mm256_slli_si256(v, 8);
    case 9:
        return _mm256_slli_si256(v, 9);
    case 10:
        return _mm256_slli_si256(v, 10);
    case 11:
        return _mm256_slli_si256(v, 11);
    case 12:
        return _mm256_slli_si256(v, 12);
    case 13:
        return _mm256_slli_si256(v, 13);
    case 14:
        return _mm256_slli_si256(v, 14);
    case 15:
        return _mm256_slli_si256(v, 15);
    case 16:
        return _mm256_setzero_si256();
    default:
        return _mm256_setzero_si256();
    }
}
186 | | OPENSSL_UNTARGET_AVX2 |
187 | | |
/*
 * PSHUFB control masks for inserting one byte at position K in a 16-byte
 * lane. Row K is the identity permutation with 0x80 (PSHUFB's "emit zero"
 * marker) placed at index K and the remaining source indices shifted up
 * by one; the last source byte (index 15) is shifted out. The 0x80 slot
 * is later located with cmpeq and blended with the byte being inserted.
 */
static const uint8_t shuffle_masks[16][16] = {
    { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 }
};
206 | | |
207 | | /** |
208 | | * Insert a line feed character in the 64-byte input at index K in [0,32). |
209 | | */ |
210 | | OPENSSL_TARGET_AVX2 |
211 | | static inline __m256i insert_line_feed32(__m256i input, int K) |
212 | 0 | { |
213 | 0 | __m256i line_feed_vector = _mm256_set1_epi8('\n'); |
214 | 0 | __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
215 | |
|
216 | 0 | if (K >= 16) { |
217 | 0 | __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]); |
218 | 0 | __m256i mask = _mm256_set_m128i(maskhi, identity); |
219 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
220 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
221 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
222 | |
|
223 | 0 | return result; |
224 | 0 | } |
225 | | /* Shift input right by 1 byte */ |
226 | 0 | __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21), |
227 | 0 | 15); |
228 | 0 | input = _mm256_blend_epi32(input, shift, 0xF0); |
229 | 0 | __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]); |
230 | 0 | __m256i mask = _mm256_set_m128i(identity, masklo); |
231 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
232 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
233 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
234 | 0 | return result; |
235 | 0 | } |
236 | | OPENSSL_UNTARGET_AVX2 |
237 | | |
238 | | OPENSSL_TARGET_AVX2 |
/*
 * Store one 32-byte encoded vector to `out`, inserting a '\n' wherever the
 * running line length reaches `stride` (stride > 32 assumed by the caller).
 *
 * `wrap_cnt` tracks how many characters of the current line have already
 * been emitted; it is updated in place. Returns the number of bytes
 * written (32, or 33 when a newline was inserted).
 */
static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride,
                                 int *wrap_cnt)
{
    /* Characters still to emit before the next newline is due. */
    const int until_nl = stride - *wrap_cnt;

    if (until_nl > 32) {
        /* Newline falls beyond this vector: plain store. */
        _mm256_storeu_si256((__m256i *)out, v);

        *wrap_cnt += 32;
        return 32;
    }

    if (until_nl == 32) {
        /* Newline falls exactly after this vector. */
        _mm256_storeu_si256((__m256i *)out, v);

        out[32] = '\n';
        *wrap_cnt = 0;
        return 33;
    }

    /*
     * Newline falls inside this vector: insert_line_feed32 shifts byte 31
     * out, so save it first and append it after the store.
     */
    const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31);
    const __m256i with_lf = insert_line_feed32(v, until_nl);
    _mm256_storeu_si256((__m256i *)out, with_lf);
    out[32] = last;

    /* Characters of the new line already emitted after the '\n'. */
    *wrap_cnt = 32 - until_nl;
    return 33;
}
267 | | OPENSSL_UNTARGET_AVX2 |
268 | | |
269 | | OPENSSL_TARGET_AVX2 |
/*
 * Store one 32-byte encoded vector to `output`, inserting up to two '\n'
 * characters for line widths wrap_max in [16, 32] (also reused by the
 * stride-12 path for its non-special iterations).
 *
 * `wrap_cnt` is the number of characters already emitted on the current
 * line; it is updated in place. Returns the number of bytes written
 * (32 plus one byte per inserted newline).
 *
 * Strategy: store the raw vector first, then rebuild and re-store a
 * version with bytes shifted up around each newline position, and finally
 * patch the displaced bytes and the '\n' characters with scalar writes.
 * The statement order is load-bearing: later scalar stores overwrite
 * bytes of the earlier vector stores.
 */
static inline size_t insert_nl_gt16(const __m256i v0,
                                    uint8_t *output,
                                    int wrap_max, int *wrap_cnt)
{
    uint8_t *out = output;
    /* Offset (from output[0]) where the first newline is due. */
    int wrap_rem = wrap_max - *wrap_cnt;
    _mm256_storeu_si256((__m256i *)(output), v0);

    if (wrap_rem > 32) {
        /* No newline falls inside this vector. */
        *wrap_cnt += 32;
        return 32;
    }

    __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF);

    /* 0xFF in the upper 128-bit lane, zero in the lower. */
    __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                                0, 0, 0, 0, 0, 0, 0, 0,
                                                (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
                                                (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
                                                (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
                                                (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF);

    __m256i blended_0L = v0;
    /* surplus_0 == 1: the first newline lands in the lower lane. */
    int surplus_0 = wrap_rem < 16 ? 1 : 0;
    if (surplus_0 == 1) {
        /* Open a 1-byte gap at wrap_rem within the lower lane. */
        __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem),
                                               wrap_rem + surplus_0);
        __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0);
        __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane);
        /* Upper lane shifted by one to absorb the displaced byte. */
        __m256i shifted_1_L = shift_left_zeros(v0, 1);
        __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask);

        blended_0L = _mm256_blendv_epi8(v0, shifted, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_0L);
        /* Next newline is one full line later. */
        wrap_rem += wrap_max;
    }

    /* surplus_1 == 1: a (second) newline lands in the upper lane. */
    int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0;
    int last_of_1L = _mm256_extract_epi8(v0, 31);

    if (surplus_1 == 1) {
        uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30);
        int wrap_rem_1 = wrap_rem - 16;
        /* Open a gap in the upper lane, accounting for any earlier gap. */
        __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1),
                                               wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L);
        __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_1L);

        /* Patch the newline and the bytes displaced past index 31. */
        output[wrap_rem + surplus_0] = '\n';
        output[31 + surplus_0] = (uint8_t)sec_last_of_1L;
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    if (surplus_0 == 1) {
        /* Patch the lower-lane newline and the bytes pushed across lanes. */
        output[wrap_rem - wrap_max] = '\n';
        output[16] = _mm256_extract_epi8(v0, 15);
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    /* Characters emitted so far on the (possibly new) current line. */
    *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem;

    /* A line may end exactly at the vector boundary: append its '\n'. */
    int nl_at_end = 0;
    if (*wrap_cnt == wrap_max || *wrap_cnt == 0) {
        *wrap_cnt = 0;
        output[32 + surplus_0 + surplus_1] = '\n';
        nl_at_end = 1;
    }

    out += 32 + surplus_0 + surplus_1 + nl_at_end;
    size_t written = (size_t)(out - output);

    return written;
}
345 | | OPENSSL_UNTARGET_AVX2 |
346 | | |
347 | | OPENSSL_TARGET_AVX2 |
/*
 * Specialized newline inserter for the stride-12 phase in which the
 * 32-byte vector starts 8 characters into a line: newlines are due after
 * 4, 16 and 28 output characters. The bulk of the permutation is done
 * with one PSHUFB (0xFF marks gap slots); bytes that cross the 128-bit
 * lane boundary are extracted and patched in with scalar writes.
 *
 * `dummy_stride` is ignored; it exists only so the signature matches
 * insert_nl_gt16 at the call sites. Always writes 35 bytes and leaves
 * *wrap_cnt == 4 (characters already emitted on the next line).
 */
static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0,
                                                 uint8_t *output,
                                                 int dummy_stride,
                                                 int *wrap_cnt)
{
    /* Per-lane shuffle opening gaps; 0xFF slots are patched below. */
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                              (char)0xFF,
                                              (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF,
                                              12);
    __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask);

    _mm256_storeu_si256((__m256i *)(output + 0), shuffled);

    /* Bytes displaced across the lane boundary / past index 31. */
    int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7);
    int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15);

    /* Patch newlines and the displaced bytes over the vector store. */
    uint8_t *out = output;
    out[4] = '\n';
    memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out[16 + 1] = '\n';
    memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1));
    out[16 + 14] = '\n';
    memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2));

    out += 32 + 3;
    *wrap_cnt = 4;

    size_t written = (out - output);
    return written;
}
379 | | OPENSSL_UNTARGET_AVX2 |
380 | | |
381 | | OPENSSL_TARGET_AVX2 |
/*
 * Overwrite the masked byte positions of `data` with '\n'.
 * A mask byte of all-ones selects the newline; all-zeros keeps the data
 * byte (classic bitwise select: (mask & lf) | (~mask & data)).
 */
static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask)
{
    const __m256i lf = _mm256_set1_epi8('\n');
    const __m256i kept = _mm256_andnot_si256(mask, data);
    const __m256i inserted = _mm256_and_si256(lf, mask);

    return _mm256_or_si256(kept, inserted);
}
389 | | OPENSSL_UNTARGET_AVX2 |
390 | | |
391 | | OPENSSL_TARGET_AVX2 |
/*
 * Store one 32-byte encoded vector with a newline after every 4
 * characters (stride 4). The in-lane part is done with one PSHUFB plus a
 * newline blend; the bytes displaced across lane boundaries are
 * reassembled with lane shifts and patched in with scalar writes.
 * Always writes 40 bytes (32 characters + 8 newlines).
 */
static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output)
{
    /* Per-lane shuffle opening one gap per 4 source bytes. */
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6,
                                              7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12,
                                              (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3,
                                              (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9);
    /* Newline goes into every gap slot marked here. */
    __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
                                            0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
                                            0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
                                            0, 0);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes);

    _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl);

    /* Handle cross-lane remainder logic */
    /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate */
#define B_LANE 16 /* Bytes per lane */
#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */
#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */

    /* Bytes that were shifted out of lane 0 */
    __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L);

    /* Bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L),
                                                             B_LANE - N_RET_1_L),
                                           B_LANE - 2);

    /* isolate the bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P2 = _mm256_slli_si256(
        _mm256_srli_si256(v0,
                          B_LANE - N_RET_2_L + N_RET_1_L),
        N_RET_1_L);

    __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2);

    int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0);
    int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2);

    /* Patch the displaced bytes and their trailing newlines. */
    uint8_t *out = output + 16;
    memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out += 3;
    *out++ = '\n';

    out = output + 32;
    memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext));
    out += 2;
    *out++ = '\n';
    out += 4;
    *out++ = '\n';

    size_t written = (out - output);
    return written;
}
447 | | OPENSSL_UNTARGET_AVX2 |
448 | | |
449 | | OPENSSL_TARGET_AVX2 |
/*
 * Store one 32-byte encoded vector with a newline after every 8
 * characters (stride 8). One PSHUFB opens the in-lane gaps; the three
 * bytes displaced across lane/vector boundaries are extracted and patched
 * back with scalar writes, then the four newlines are written over the
 * gap slots. Always writes 36 bytes (32 characters + 4 newlines).
 */
static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output)
{
    /* Per-lane shuffle opening one gap per 8 source bytes. */
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF,
                                              8, 9, 10, 11, 12, 13, 14,
                                              (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6,
                                              7, (char)0xFF, 8, 9, 10, 11, 12);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes);
    /* Bytes displaced out of lane 0 and lane 1 respectively. */
    int8_t rem_1_L = _mm256_extract_epi8(v0, 15);
    int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15);
    uint8_t *out = output;

    memcpy(out + 16, &rem_1_L, sizeof(rem_1_L));
    memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1));
    memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2));

    /* Newlines land every 9th output byte. */
    output[8] = '\n';
    output[17] = '\n';
    output[26] = '\n';
    output[35] = '\n';

    out += 32 + 4;

    size_t written = (out - output);
    return written;
}
477 | | OPENSSL_UNTARGET_AVX2 |
478 | | |
479 | | OPENSSL_TARGET_AVX2 |
/*
 * AVX2 base64 encoder. Encodes `src[0..srclen)` into `dst`, optionally
 * inserting '\n' every `stride` output characters, where
 * stride = ctx_length / 3 * 4 (0 when ctx == NULL, meaning no wrapping).
 * Since ctx_length scales in 3-byte units, stride is always a multiple of
 * 4, so the dispatch below (0 / 64 / 4 / 8 / 12 / >=32 / >=16) covers all
 * reachable values.
 *
 * The main loop consumes 96 input bytes (128 output characters) per
 * iteration using the Muła/Langdale bit-splitting technique; the
 * remainder is delegated to the scalar evp_encodeblock_int. On return,
 * *final_wrap_cnt holds the wrap state handed to the scalar tail.
 * Returns the total number of bytes written to dst.
 */
size_t encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
                          const unsigned char *src, int srclen, int ctx_length,
                          int *final_wrap_cnt)
{
    const uint8_t *input = (const uint8_t *)src;
    uint8_t *out = (uint8_t *)dst;
    int i = 0;
    /* Output characters per line; 0 disables wrapping. */
    int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4;
    int wrap_cnt = 0;
    const int use_srp = (ctx != NULL
                         && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
    /* Spreads each 3-byte group across a 32-bit lane (with overlap). */
    const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
                                         10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    /* Phase of the 3-iteration stride-12 newline pattern. */
    int base = 0;

    /*
     * Process 96 bytes at a time. The guard is i + 100 <= srclen (not
     * i + 96) because each 16-byte load reads 4 bytes past the 12 it
     * consumes.
     */
    for (; i + 100 <= srclen; i += 96) {
        _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0);
        /*
         * Interleaved for each vector: load, shuffle, bit-split, lookup
         * before starting the next, giving the OoO engine independent work chains
         * across execution ports.
         */
        const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
        const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
        /* Split the 24-bit groups into four 6-bit indices per 32 bits. */
        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);

        const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
        const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);

        const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
        const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);

        const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
        const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
        __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
        const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
        const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
        const __m256i input3 = _mm256_or_si256(t1_3, t3_3);

        __m256i vec0;
        __m256i vec1;
        __m256i vec2;
        __m256i vec3;

        /* Map 6-bit indices to ASCII in the selected alphabet. */
        if (use_srp) {
            vec0 = lookup_pshufb_srp(input0);
            vec1 = lookup_pshufb_srp(input1);
            vec2 = lookup_pshufb_srp(input2);
            vec3 = lookup_pshufb_srp(input3);

        } else {
            vec0 = lookup_pshufb_std(input0);
            vec1 = lookup_pshufb_std(input1);
            vec2 = lookup_pshufb_std(input2);
            vec3 = lookup_pshufb_std(input3);
        }

        /* Dispatch on line width; each case stores 128 chars + newlines. */
        if (stride == 0) {
            /* No wrapping: raw stores. */
            _mm256_storeu_si256((__m256i *)out, vec0);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec2);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec3);

            out += 32;
        } else if (stride == 64) {
            /* Newline falls exactly between vector pairs: no shuffling. */
            _mm256_storeu_si256((__m256i *)out, vec0);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);

            out += 32;
            *(out++) = '\n';

            _mm256_storeu_si256((__m256i *)out, vec2);
            out += 32;

            _mm256_storeu_si256((__m256i *)out, vec3);
            out += 32;

            *(out++) = '\n';
        } else if (stride == 4) {
            int out_idx = 0;

            out_idx += (int)insert_nl_str4(vec0, out + out_idx);
            out_idx += (int)insert_nl_str4(vec1, out + out_idx);
            out_idx += (int)insert_nl_str4(vec2, out + out_idx);
            out_idx += (int)insert_nl_str4(vec3, out + out_idx);

            out += out_idx;
        } else if (stride == 8) {

            out += insert_nl_str8(vec0, out);
            out += insert_nl_str8(vec1, out);
            out += insert_nl_str8(vec2, out);
            out += insert_nl_str8(vec3, out);

        } else if (stride == 12) {
            /*
             * Stride 12 does not divide 32, so which vector needs the
             * specialized inserter rotates with a period of 3 iterations,
             * tracked by `base`.
             */
            switch (base) {
            case 0:

                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            case 1:
                out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt);
                break;
            default: /* base == 2 */
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            }

            if (++base == 3)
                base = 0;
        } else if (stride >= 32) {
            out += ins_nl_gt32(vec0, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec1, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec2, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec3, out, stride, &wrap_cnt);
        } else if (stride >= 16) {
            out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
        }
    }

    /*
     * Unwrapped output can additionally vectorize a 24-byte tail (guard
     * i + 28 for the same 4-byte load overread as above).
     */
    if (stride == 0) {
        for (; i + 28 <= srclen; i += 24) {
            /* lo = [xxxx|DDDC|CCBB|BAAA] */
            /* hi = [xxxx|HHHG|GGFF|FEEE] */
            const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i));
            const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3));
            /*
             * bytes from groups A, B and C are needed in separate 32-bit lanes
             * in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
             */
            __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
            const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
            const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
            const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
            const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
            const __m256i indices = _mm256_or_si256(t1, t3);
            _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices));

            out += 32;
        }
    }
    *final_wrap_cnt = wrap_cnt;

    /*
     * NOTE(review): if a line completes exactly here, a '\n' is emitted
     * and the local wrap_cnt is zeroed, but *final_wrap_cnt (already
     * stored above) still holds the old value == stride. Verify that
     * evp_encodeblock_int expects the pre-newline count here; otherwise
     * *final_wrap_cnt should be reset to 0 as well.
     */
    if (stride >= 32 && wrap_cnt == stride) {
        wrap_cnt = 0;
        *out++ = '\n';
    }

    /* Scalar-encode the remaining srclen - i bytes after `out`. */
    return (size_t)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt);
}
670 | | OPENSSL_UNTARGET_AVX2 |
671 | | #endif /* !defined(_M_ARM64EC) */ |
672 | | #endif |