/src/openssl/crypto/evp/enc_b64_avx2.c
Line | Count | Source |
1 | | #include <openssl/evp.h> |
2 | | #include "enc_b64_scalar.h" |
3 | | #include "enc_b64_avx2.h" |
4 | | #include "internal/cryptlib.h" |
5 | | #include "crypto/evp.h" |
6 | | #include "evp_local.h" |
7 | | |
8 | | #if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) |
9 | | #define STRINGIFY_IMPLEMENTATION_(a) #a |
10 | | #define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) |
11 | | |
12 | | #ifdef __clang__ |
13 | | /* |
14 | | * clang does not have GCC's push_options/pop_options pragmas. |
15 | | * Also, "clang attribute push" cannot be used within a namespace in |
16 | | * clang versions up until 8.0, so OPENSSL_TARGET_REGION and |
17 | | * OPENSSL_UNTARGET_REGION must be used *outside* of a namespace. |
18 | | */ |
19 | | #define OPENSSL_TARGET_REGION(T) \ |
20 | | _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \ |
21 | | apply_to = function))) |
22 | | #define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop") |
23 | | #elif defined(__GNUC__) |
24 | | #define OPENSSL_TARGET_REGION(T) \ |
25 | | _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) |
26 | | #define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options") |
27 | | #endif /* clang then gcc */ |
28 | | |
29 | | /* Default target region macros don't do anything. */ |
30 | | #ifndef OPENSSL_TARGET_REGION |
31 | | #define OPENSSL_TARGET_REGION(T) |
32 | | #define OPENSSL_UNTARGET_REGION |
33 | | #endif |
34 | | |
35 | | #define OPENSSL_TARGET_AVX2 \ |
36 | | OPENSSL_TARGET_REGION("avx2") |
37 | | #define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION |
38 | | |
39 | | /* |
40 | | * The OPENSSL_TARGET_AVX2/OPENSSL_UNTARGET_AVX2 pairs below compile each |
41 | | * wrapped function with AVX2 enabled on GCC and Clang; on other compilers the macros expand to nothing. |
42 | | */ |
43 | | |
44 | | #include <string.h> |
45 | | #include <immintrin.h> |
46 | | #include <stddef.h> |
47 | | #include <stdint.h> |
48 | | |
49 | | OPENSSL_TARGET_AVX2 |
50 | | static __m256i lookup_pshufb_std(__m256i input) |
51 | 0 | { |
52 | 0 | __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); |
53 | 0 | const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); |
54 | |
|
55 | 0 | result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); |
56 | 0 | __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
57 | 0 | '0' - 52, '0' - 52, |
58 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, |
59 | 0 | '/' - 63, 'A', 0, 0, |
60 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
61 | 0 | '0' - 52, '0' - 52, |
62 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, |
63 | 0 | '/' - 63, 'A', 0, 0); |
64 | |
|
65 | 0 | result = _mm256_shuffle_epi8(shift_LUT, result); |
66 | 0 | return _mm256_add_epi8(result, input); |
67 | 0 | } |
68 | | OPENSSL_UNTARGET_AVX2 |
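/*
 * For reference, a scalar sketch of the per-byte mapping the vector code
 * above performs (illustration only, not part of this translation unit):
 * 0..25 -> 'A'..'Z', 26..51 -> 'a'..'z', 52..61 -> '0'..'9', 62 -> '+',
 * 63 -> '/'.
 */
static unsigned char b64_std_ref(unsigned int sextet) /* sextet in [0, 63] */
{
    if (sextet < 26)
        return (unsigned char)('A' + sextet);
    if (sextet < 52)
        return (unsigned char)('a' + (sextet - 26));
    if (sextet < 62)
        return (unsigned char)('0' + (sextet - 52));
    return sextet == 62 ? '+' : '/';
}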
69 | | |
70 | | OPENSSL_TARGET_AVX2 |
71 | | static inline __m256i lookup_pshufb_srp(__m256i input) |
72 | 0 | { |
73 | 0 | const __m256i zero = _mm256_setzero_si256(); |
74 | 0 | const __m256i hi = _mm256_set1_epi8((char)0x80); |
75 | 0 | __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input), |
76 | 0 | _mm256_cmpgt_epi8(input, |
77 | 0 | _mm256_set1_epi8(63))); |
78 | 0 | __m256i idx = _mm256_setzero_si256(); |
79 | |
|
80 | 0 | idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9))); |
81 | 0 | idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35))); |
82 | 0 | idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3), |
83 | 0 | _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62))); |
84 | 0 | idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4), |
85 | 0 | _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63))); |
86 | | |
87 | | /* Zero out invalid bytes via PSHUFB's high-bit mechanism */
88 | 0 | idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi)); |
89 | |
|
90 | 0 | const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, |
91 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, |
92 | 0 | '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0, |
93 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0); |
94 | |
|
95 | 0 | __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx); |
96 | 0 | __m256i ascii = _mm256_add_epi8(shift, input); |
97 | 0 | return ascii; |
98 | 0 | } |
99 | | OPENSSL_UNTARGET_AVX2 |
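/*
 * Scalar sketch of the alphabet encoded by the LUT above (illustration only,
 * not part of this translation unit): 0..9 -> '0'..'9', 10..35 -> 'A'..'Z',
 * 36..61 -> 'a'..'z', 62 -> '.', 63 -> '/'.  Out-of-range bytes are zeroed by
 * the vector code through PSHUFB's high-bit rule rather than handled here.
 */
static unsigned char b64_srp_ref(unsigned int sextet) /* sextet in [0, 63] */
{
    if (sextet < 10)
        return (unsigned char)('0' + sextet);
    if (sextet < 36)
        return (unsigned char)('A' + (sextet - 10));
    if (sextet < 62)
        return (unsigned char)('a' + (sextet - 36));
    return sextet == 62 ? '.' : '/';
}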
100 | | |
101 | | OPENSSL_TARGET_AVX2 |
102 | | static inline __m256i shift_right_zeros(__m256i v, int n) |
103 | 0 | { |
104 | 0 | switch (n) { |
105 | 0 | case 0: |
106 | 0 | return v; |
107 | 0 | case 1: |
108 | 0 | return _mm256_srli_si256(v, 1); |
109 | 0 | case 2: |
110 | 0 | return _mm256_srli_si256(v, 2); |
111 | 0 | case 3: |
112 | 0 | return _mm256_srli_si256(v, 3); |
113 | 0 | case 4: |
114 | 0 | return _mm256_srli_si256(v, 4); |
115 | 0 | case 5: |
116 | 0 | return _mm256_srli_si256(v, 5); |
117 | 0 | case 6: |
118 | 0 | return _mm256_srli_si256(v, 6); |
119 | 0 | case 7: |
120 | 0 | return _mm256_srli_si256(v, 7); |
121 | 0 | case 8: |
122 | 0 | return _mm256_srli_si256(v, 8); |
123 | 0 | case 9: |
124 | 0 | return _mm256_srli_si256(v, 9); |
125 | 0 | case 10: |
126 | 0 | return _mm256_srli_si256(v, 10); |
127 | 0 | case 11: |
128 | 0 | return _mm256_srli_si256(v, 11); |
129 | 0 | case 12: |
130 | 0 | return _mm256_srli_si256(v, 12); |
131 | 0 | case 13: |
132 | 0 | return _mm256_srli_si256(v, 13); |
133 | 0 | case 14: |
134 | 0 | return _mm256_srli_si256(v, 14); |
135 | 0 | case 15: |
136 | 0 | return _mm256_srli_si256(v, 15); |
137 | 0 | default: |
138 | 0 | return _mm256_setzero_si256(); |
139 | 0 | } |
140 | 0 | } |
141 | | OPENSSL_UNTARGET_AVX2 |
142 | | |
143 | | OPENSSL_TARGET_AVX2 |
144 | | static inline __m256i shift_left_zeros(__m256i v, int n) |
145 | 0 | { |
146 | 0 | switch (n) { |
147 | 0 | case 0: |
148 | 0 | return v; |
149 | 0 | case 1: |
150 | 0 | return _mm256_slli_si256(v, 1); |
151 | 0 | case 2: |
152 | 0 | return _mm256_slli_si256(v, 2); |
153 | 0 | case 3: |
154 | 0 | return _mm256_slli_si256(v, 3); |
155 | 0 | case 4: |
156 | 0 | return _mm256_slli_si256(v, 4); |
157 | 0 | case 5: |
158 | 0 | return _mm256_slli_si256(v, 5); |
159 | 0 | case 6: |
160 | 0 | return _mm256_slli_si256(v, 6); |
161 | 0 | case 7: |
162 | 0 | return _mm256_slli_si256(v, 7); |
163 | 0 | case 8: |
164 | 0 | return _mm256_slli_si256(v, 8); |
165 | 0 | case 9: |
166 | 0 | return _mm256_slli_si256(v, 9); |
167 | 0 | case 10: |
168 | 0 | return _mm256_slli_si256(v, 10); |
169 | 0 | case 11: |
170 | 0 | return _mm256_slli_si256(v, 11); |
171 | 0 | case 12: |
172 | 0 | return _mm256_slli_si256(v, 12); |
173 | 0 | case 13: |
174 | 0 | return _mm256_slli_si256(v, 13); |
175 | 0 | case 14: |
176 | 0 | return _mm256_slli_si256(v, 14); |
177 | 0 | case 15: |
178 | 0 | return _mm256_slli_si256(v, 15); |
179 | 0 | case 16: |
180 | 0 | return _mm256_setzero_si256(); |
181 | 0 | default: |
182 | 0 | return _mm256_setzero_si256(); |
183 | 0 | } |
184 | 0 | } |
185 | | OPENSSL_UNTARGET_AVX2 |
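/*
 * _mm256_srli_si256()/_mm256_slli_si256() take the byte count as an 8-bit
 * immediate, so a runtime count has to be dispatched through the switches
 * above; each case folds into a single VPSRLDQ/VPSLLDQ.  Note that both
 * intrinsics shift each 128-bit lane independently, not the whole 256-bit
 * register.
 */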
186 | | |
187 | | static const uint8_t shuffle_masks[16][16] = { |
188 | | { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
189 | | { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
190 | | { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
191 | | { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
192 | | { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
193 | | { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
194 | | { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, |
195 | | { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 }, |
196 | | { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 }, |
197 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 }, |
198 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 }, |
199 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 }, |
200 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 }, |
201 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 }, |
202 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 }, |
203 | | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 } |
204 | | }; |
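/*
 * Row K of shuffle_masks is the identity byte permutation of a 16-byte lane
 * with 0x80 at position K.  Under PSHUFB the 0x80 entry zeroes that output
 * byte, opening a one-byte gap for the '\n' inserted by insert_line_feed32()
 * below; input bytes K..14 move up by one position and byte 15 drops out.
 */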
205 | | |
206 | | /** |
207 | | * Insert a line feed character into the 32-byte input vector at index K in [0,32). |
208 | | */ |
209 | | OPENSSL_TARGET_AVX2 |
210 | | static inline __m256i insert_line_feed32(__m256i input, int K) |
211 | 0 | { |
212 | 0 | __m256i line_feed_vector = _mm256_set1_epi8('\n'); |
213 | 0 | __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
214 | |
|
215 | 0 | if (K >= 16) { |
216 | 0 | __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]); |
217 | 0 | __m256i mask = _mm256_set_m128i(maskhi, identity); |
218 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
219 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
220 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
221 | |
|
222 | 0 | return result; |
223 | 0 | } |
224 | | /* Build a copy whose upper lane holds bytes 15..30, i.e. the input shifted up by one byte across the lane boundary */
225 | 0 | __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21), |
226 | 0 | 15); |
227 | 0 | input = _mm256_blend_epi32(input, shift, 0xF0); |
228 | 0 | __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]); |
229 | 0 | __m256i mask = _mm256_set_m128i(identity, masklo); |
230 | 0 | __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); |
231 | 0 | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
232 | 0 | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
233 | 0 | return result; |
234 | 0 | } |
235 | | OPENSSL_UNTARGET_AVX2 |
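/*
 * Example: with K = 20 the returned vector is
 *     in[0..19], '\n', in[20..30]
 * i.e. the line feed lands at byte 20 and the original byte 31 is pushed
 * out; callers that still need it (see ins_nl_gt32) save it beforehand and
 * store it after the vector.
 */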
236 | | |
237 | | OPENSSL_TARGET_AVX2 |
238 | | static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride, |
239 | | int *wrap_cnt) |
240 | 0 | { |
241 | 0 | const int until_nl = stride - *wrap_cnt; |
242 | |
|
243 | 0 | if (until_nl > 32) { |
244 | 0 | _mm256_storeu_si256((__m256i *)out, v); |
245 | |
|
246 | 0 | *wrap_cnt += 32; |
247 | 0 | return 32; |
248 | 0 | } |
249 | | |
250 | 0 | if (until_nl == 32) { |
251 | 0 | _mm256_storeu_si256((__m256i *)out, v); |
252 | |
|
253 | 0 | out[32] = '\n'; |
254 | 0 | *wrap_cnt = 0; |
255 | 0 | return 33; |
256 | 0 | } |
257 | | |
258 | 0 | const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31); |
259 | 0 | const __m256i with_lf = insert_line_feed32(v, until_nl); |
260 | 0 | _mm256_storeu_si256((__m256i *)out, with_lf); |
261 | 0 | out[32] = last; |
262 | |
|
263 | 0 | *wrap_cnt = 32 - until_nl; |
264 | 0 | return 33; |
265 | 0 | } |
266 | | OPENSSL_UNTARGET_AVX2 |
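/*
 * Worked example (stride = 48, *wrap_cnt = 40 on entry): until_nl = 8, so a
 * '\n' is inserted at offset 8, bytes 8..30 move up by one, the saved byte
 * 31 goes to out[32], 33 bytes are written and *wrap_cnt becomes 24.
 */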
267 | | |
268 | | OPENSSL_TARGET_AVX2 |
269 | | static inline size_t insert_nl_gt16(const __m256i v0, |
270 | | uint8_t *output, |
271 | | int wrap_max, int *wrap_cnt) |
272 | 0 | { |
273 | 0 | uint8_t *out = output; |
274 | 0 | int wrap_rem = wrap_max - *wrap_cnt; |
275 | 0 | _mm256_storeu_si256((__m256i *)(output), v0); |
276 | |
|
277 | 0 | if (wrap_rem > 32) { |
278 | 0 | *wrap_cnt += 32; |
279 | 0 | return 32; |
280 | 0 | } |
281 | | |
282 | 0 | __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF); |
283 | |
|
284 | 0 | __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, |
285 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, |
286 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
287 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
288 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, |
289 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF); |
290 | |
|
291 | 0 | __m256i blended_0L = v0; |
292 | 0 | int surplus_0 = wrap_rem < 16 ? 1 : 0; |
293 | 0 | if (surplus_0 == 1) { |
294 | 0 | __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem), |
295 | 0 | wrap_rem + surplus_0); |
296 | 0 | __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0); |
297 | 0 | __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane); |
298 | 0 | __m256i shifted_1_L = shift_left_zeros(v0, 1); |
299 | 0 | __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask); |
300 | |
|
301 | 0 | blended_0L = _mm256_blendv_epi8(v0, shifted, mask); |
302 | 0 | _mm256_storeu_si256((__m256i *)(output), blended_0L); |
303 | 0 | wrap_rem += wrap_max; |
304 | 0 | } |
305 | |
|
306 | 0 | int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0; |
307 | 0 | int last_of_1L = _mm256_extract_epi8(v0, 31); |
308 | |
|
309 | 0 | if (surplus_1 == 1) { |
310 | 0 | uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30); |
311 | 0 | int wrap_rem_1 = wrap_rem - 16; |
312 | 0 | __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1), |
313 | 0 | wrap_rem_1 + surplus_0 + surplus_1); |
314 | 0 | __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1); |
315 | 0 | __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L); |
316 | 0 | __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask); |
317 | 0 | _mm256_storeu_si256((__m256i *)(output), blended_1L); |
318 | |
|
319 | 0 | output[wrap_rem + surplus_0] = '\n'; |
320 | 0 | output[31 + surplus_0] = (uint8_t)sec_last_of_1L; |
321 | 0 | output[31 + surplus_0 + surplus_1] = last_of_1L; |
322 | 0 | } |
323 | |
|
324 | 0 | if (surplus_0 == 1) { |
325 | 0 | output[wrap_rem - wrap_max] = '\n'; |
326 | 0 | output[16] = _mm256_extract_epi8(v0, 15); |
327 | 0 | output[31 + surplus_0 + surplus_1] = last_of_1L; |
328 | 0 | } |
329 | |
|
330 | 0 | *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem; |
331 | |
|
332 | 0 | int nl_at_end = 0; |
333 | 0 | if (*wrap_cnt == wrap_max || *wrap_cnt == 0) { |
334 | 0 | *wrap_cnt = 0; |
335 | 0 | output[32 + surplus_0 + surplus_1] = '\n'; |
336 | 0 | nl_at_end = 1; |
337 | 0 | } |
338 | |
|
339 | 0 | out += 32 + surplus_0 + surplus_1 + nl_at_end; |
340 | 0 | size_t written = (size_t)(out - output); |
341 | |
|
342 | 0 | return written; |
343 | 0 | } |
344 | | OPENSSL_UNTARGET_AVX2 |
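/*
 * Worked example (wrap_max = 16, *wrap_cnt = 0 on entry): the call stores
 * v0[0..15], '\n', v0[16..31], '\n' -- 34 bytes -- and leaves *wrap_cnt at 0.
 */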
345 | | |
346 | | OPENSSL_TARGET_AVX2 |
347 | | static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0, |
348 | | uint8_t *output, |
349 | | int dummy_stride, |
350 | | int *wrap_cnt) |
351 | 0 | { |
352 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, |
353 | 0 | (char)0xFF, |
354 | 0 | (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF, |
355 | 0 | 12); |
356 | 0 | __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask); |
357 | |
|
358 | 0 | _mm256_storeu_si256((__m256i *)(output + 0), shuffled); |
359 | |
|
360 | 0 | int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7); |
361 | 0 | int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29); |
362 | 0 | int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15); |
363 | |
|
364 | 0 | uint8_t *out = output; |
365 | 0 | out[4] = '\n'; |
366 | 0 | memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext)); |
367 | 0 | out[16 + 1] = '\n'; |
368 | 0 | memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1)); |
369 | 0 | out[16 + 14] = '\n'; |
370 | 0 | memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2)); |
371 | |
|
372 | 0 | out += 32 + 3; |
373 | 0 | *wrap_cnt = 4; |
374 | |
|
375 | 0 | size_t written = (out - output); |
376 | 0 | return written; |
377 | 0 | } |
378 | | OPENSSL_UNTARGET_AVX2 |
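/*
 * Output layout of the helper above: 4 chars, '\n', 12 chars, '\n', 12 chars,
 * '\n', 4 chars -- 35 bytes in total, leaving *wrap_cnt = 4.  The leading 4
 * characters are meant to complete a 12-character line that already holds 8
 * characters at this phase of the stride-12 schedule.
 */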
379 | | |
380 | | OPENSSL_TARGET_AVX2 |
381 | | static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask) |
382 | 0 | { |
383 | 0 | __m256i newline = _mm256_set1_epi8('\n'); |
384 | |
|
385 | 0 | return _mm256_or_si256(_mm256_and_si256(mask, newline), |
386 | 0 | _mm256_andnot_si256(mask, data)); |
387 | 0 | } |
388 | | OPENSSL_UNTARGET_AVX2 |
389 | | |
390 | | OPENSSL_TARGET_AVX2 |
391 | | static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output) |
392 | 0 | { |
393 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, |
394 | 0 | 7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12, |
395 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3, |
396 | 0 | (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9); |
397 | 0 | __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
398 | 0 | 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
399 | 0 | 0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF, |
400 | 0 | 0, 0); |
401 | 0 | __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); |
402 | 0 | __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes); |
403 | |
|
404 | 0 | _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl); |
405 | | |
406 | | /* Handle cross-lane remainder logic */ |
407 | | /* _mm256_srli_si256/_mm256_slli_si256 require an 8-bit immediate shift count, so the counts are expressed as macros */
408 | 0 | #define B_LANE 16 /* Bytes per lane */ |
409 | 0 | #define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */ |
410 | 0 | #define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */ |
411 | | |
412 | | /* Bytes that were shifted out of lane 0 */ |
413 | 0 | __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L); |
414 | | |
415 | | /* Bytes that were shifted out of lane 1 */ |
416 | 0 | __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L), |
417 | 0 | B_LANE - N_RET_1_L), |
418 | 0 | B_LANE - 2); |
419 | | |
420 | | /* The remaining bytes that were shifted out of lane 1 */
421 | 0 | __m256i rem_2_L_P2 = _mm256_slli_si256( |
422 | 0 | _mm256_srli_si256(v0, |
423 | 0 | B_LANE - N_RET_2_L + N_RET_1_L), |
424 | 0 | N_RET_1_L); |
425 | |
|
426 | 0 | __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2); |
427 | |
|
428 | 0 | int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0); |
429 | 0 | int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2); |
430 | |
|
431 | 0 | uint8_t *out = output + 16; |
432 | 0 | memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext)); |
433 | 0 | out += 3; |
434 | 0 | *out++ = '\n'; |
435 | |
|
436 | 0 | out = output + 32; |
437 | 0 | memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext)); |
438 | 0 | out += 2; |
439 | 0 | *out++ = '\n'; |
440 | 0 | out += 4; |
441 | 0 | *out++ = '\n'; |
442 | |
|
443 | 0 | size_t written = (out - output); |
444 | 0 | return written; |
445 | 0 | } |
446 | | OPENSSL_UNTARGET_AVX2 |
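/*
 * insert_nl_str4() consumes one 32-character vector and always writes 40
 * bytes: eight 4-character groups, each followed by '\n'.
 */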
447 | | |
448 | | OPENSSL_TARGET_AVX2 |
449 | | static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output) |
450 | 0 | { |
451 | 0 | __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF, |
452 | 0 | 8, 9, 10, 11, 12, 13, 14, |
453 | 0 | (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, |
454 | 0 | 7, (char)0xFF, 8, 9, 10, 11, 12); |
455 | 0 | __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); |
456 | 0 | _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes); |
457 | 0 | int8_t rem_1_L = _mm256_extract_epi8(v0, 15); |
458 | 0 | int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29); |
459 | 0 | int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15); |
460 | 0 | uint8_t *out = output; |
461 | |
|
462 | 0 | memcpy(out + 16, &rem_1_L, sizeof(rem_1_L)); |
463 | 0 | memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1)); |
464 | 0 | memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2)); |
465 | |
|
466 | 0 | output[8] = '\n'; |
467 | 0 | output[17] = '\n'; |
468 | 0 | output[26] = '\n'; |
469 | 0 | output[35] = '\n'; |
470 | |
|
471 | 0 | out += 32 + 4; |
472 | |
|
473 | 0 | size_t written = (out - output); |
474 | 0 | return written; |
475 | 0 | } |
476 | | OPENSSL_UNTARGET_AVX2 |
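/*
 * insert_nl_str8() consumes one 32-character vector and always writes 36
 * bytes: four 8-character groups, each followed by '\n'.
 */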
477 | | |
478 | | OPENSSL_TARGET_AVX2 |
479 | | int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst, |
480 | | const unsigned char *src, int srclen, int ctx_length, |
481 | | int *final_wrap_cnt) |
482 | 0 | { |
483 | 0 | const uint8_t *input = (const uint8_t *)src; |
484 | 0 | uint8_t *out = (uint8_t *)dst; |
485 | 0 | int i = 0; |
486 | 0 | int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4; |
487 | 0 | int wrap_cnt = 0; |
488 | 0 | const int use_srp = (ctx != NULL |
489 | 0 | && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0); |
490 | 0 | const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, |
491 | 0 | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); |
492 | 0 | int base = 0; |
493 | | |
494 | | /* Process 96 bytes at a time */ |
495 | 0 | for (; i + 100 <= srclen; i += 96) { |
496 | | /* Only the first 12 bytes of each 16-byte load are encoded; the final load reads 4 bytes past the 96 consumed, hence the i + 100 bound */
497 | 0 | const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0)); |
498 | 0 | const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1)); |
499 | 0 | const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2)); |
500 | 0 | const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3)); |
501 | 0 | const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4)); |
502 | 0 | const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5)); |
503 | 0 | const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6)); |
504 | 0 | const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7)); |
505 | 0 | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); |
506 | 0 | __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); |
507 | 0 | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); |
508 | 0 | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); |
509 | 0 | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); |
510 | 0 | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); |
511 | 0 | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); |
512 | 0 | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); |
513 | 0 | const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); |
514 | 0 | const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); |
515 | 0 | const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); |
516 | 0 | const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); |
517 | 0 | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); |
518 | 0 | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); |
519 | 0 | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); |
520 | 0 | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); |
521 | 0 | const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); |
522 | 0 | const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); |
523 | 0 | const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); |
524 | 0 | const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); |
525 | 0 | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); |
526 | 0 | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); |
527 | 0 | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); |
528 | 0 | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); |
529 | 0 | __m256i vec0; |
530 | 0 | __m256i vec1; |
531 | 0 | __m256i vec2; |
532 | 0 | __m256i vec3; |
533 | |
|
534 | 0 | if (use_srp) { |
535 | 0 | vec0 = lookup_pshufb_srp(input0); |
536 | 0 | vec1 = lookup_pshufb_srp(input1); |
537 | 0 | vec2 = lookup_pshufb_srp(input2); |
538 | 0 | vec3 = lookup_pshufb_srp(input3); |
539 | |
|
540 | 0 | } else { |
541 | 0 | vec0 = lookup_pshufb_std(input0); |
542 | 0 | vec1 = lookup_pshufb_std(input1); |
543 | 0 | vec2 = lookup_pshufb_std(input2); |
544 | 0 | vec3 = lookup_pshufb_std(input3); |
545 | 0 | } |
546 | |
|
547 | 0 | if (stride == 0) { |
548 | 0 | _mm256_storeu_si256((__m256i *)out, vec0); |
549 | |
|
550 | 0 | out += 32; |
551 | 0 | _mm256_storeu_si256((__m256i *)out, vec1); |
552 | |
|
553 | 0 | out += 32; |
554 | 0 | _mm256_storeu_si256((__m256i *)out, vec2); |
555 | |
|
556 | 0 | out += 32; |
557 | 0 | _mm256_storeu_si256((__m256i *)out, vec3); |
558 | |
|
559 | 0 | out += 32; |
560 | 0 | } else if (stride == 64) { |
561 | 0 | _mm256_storeu_si256((__m256i *)out, vec0); |
562 | |
|
563 | 0 | out += 32; |
564 | 0 | _mm256_storeu_si256((__m256i *)out, vec1); |
565 | |
|
566 | 0 | out += 32; |
567 | 0 | *(out++) = '\n'; |
568 | |
|
569 | 0 | _mm256_storeu_si256((__m256i *)out, vec2); |
570 | 0 | out += 32; |
571 | |
|
572 | 0 | _mm256_storeu_si256((__m256i *)out, vec3); |
573 | 0 | out += 32; |
574 | |
|
575 | 0 | *(out++) = '\n'; |
576 | 0 | } else if (stride == 4) { |
577 | 0 | int out_idx = 0; |
578 | |
|
579 | 0 | out_idx += (int)insert_nl_str4(vec0, out + out_idx); |
580 | 0 | out_idx += (int)insert_nl_str4(vec1, out + out_idx); |
581 | 0 | out_idx += (int)insert_nl_str4(vec2, out + out_idx); |
582 | 0 | out_idx += (int)insert_nl_str4(vec3, out + out_idx); |
583 | |
|
584 | 0 | out += out_idx; |
585 | 0 | } else if (stride == 8) { |
586 | |
|
587 | 0 | out += insert_nl_str8(vec0, out); |
588 | 0 | out += insert_nl_str8(vec1, out); |
589 | 0 | out += insert_nl_str8(vec2, out); |
590 | 0 | out += insert_nl_str8(vec3, out); |
591 | |
|
592 | 0 | } else if (stride == 12) { |
593 | 0 | switch (base) { |
594 | 0 | case 0: |
595 | |
|
596 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
597 | 0 | out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt); |
598 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
599 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
600 | 0 | break; |
601 | 0 | case 1: |
602 | 0 | out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt); |
603 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
604 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
605 | 0 | out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt); |
606 | 0 | break; |
607 | 0 | default: /* base == 2 */ |
608 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
609 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
610 | 0 | out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt); |
611 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
612 | 0 | break; |
613 | 0 | } |
614 | | |
615 | 0 | if (++base == 3) |
616 | 0 | base = 0; |
617 | 0 | } else if (stride >= 32) { |
618 | 0 | out += ins_nl_gt32(vec0, out, stride, &wrap_cnt); |
619 | 0 | out += ins_nl_gt32(vec1, out, stride, &wrap_cnt); |
620 | 0 | out += ins_nl_gt32(vec2, out, stride, &wrap_cnt); |
621 | 0 | out += ins_nl_gt32(vec3, out, stride, &wrap_cnt); |
622 | 0 | } else if (stride >= 16) { |
623 | 0 | out += insert_nl_gt16(vec0, out, stride, &wrap_cnt); |
624 | 0 | out += insert_nl_gt16(vec1, out, stride, &wrap_cnt); |
625 | 0 | out += insert_nl_gt16(vec2, out, stride, &wrap_cnt); |
626 | 0 | out += insert_nl_gt16(vec3, out, stride, &wrap_cnt); |
627 | 0 | } |
628 | 0 | } |
629 | | |
630 | 0 | if (stride == 0) { |
631 | 0 | for (; i + 28 <= srclen; i += 24) { |
632 | | /* lo = [xxxx|DDDC|CCBB|BAAA] */ |
633 | | /* hi = [xxxx|HHHG|GGFF|FEEE] */ |
634 | 0 | const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i)); |
635 | 0 | const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3)); |
636 | | /* |
637 | | * each 3-byte group A..H is needed in its own 32-bit lane
638 | | * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
639 | | */ |
640 | 0 | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); |
641 | 0 | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); |
642 | 0 | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); |
643 | 0 | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); |
644 | 0 | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); |
645 | 0 | const __m256i indices = _mm256_or_si256(t1, t3); |
646 | 0 | _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices)); |
647 | |
|
648 | 0 | out += 32; |
649 | 0 | } |
650 | 0 | } |
651 | 0 | *final_wrap_cnt = wrap_cnt; |
652 | |
|
653 | 0 | if (stride >= 32 && wrap_cnt == stride) { |
654 | 0 | wrap_cnt = 0; |
655 | 0 | *out++ = '\n'; |
656 | 0 | } |
657 | |
|
658 | 0 | return (int)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt);
659 | 0 | } |
660 | | OPENSSL_UNTARGET_AVX2 |
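/*
 * Usage sketch (illustration only; the real caller lives elsewhere in the
 * EVP code, and ctx->length is assumed to be the value passed as ctx_length):
 *
 *     int wrap = 0;
 *     int outl = encode_base64_avx2(ctx, dst, src, srclen,
 *                                   ctx->length, &wrap);
 *
 * The bulk of the input is encoded with AVX2 and the remaining tail is
 * handed to evp_encodeblock_int(); assuming that helper returns the number
 * of bytes it writes, outl is the total number of bytes stored in dst.
 */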
661 | | #endif |