/src/simdutf/src/icelake/icelake_base64.inl.cpp
Line | Count | Source |
1 | | // file included directly |
2 | | /** |
3 | | * References and further reading: |
4 | | * |
5 | | * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the |
6 | | * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. |
7 | | * https://arxiv.org/abs/1910.05109 |
8 | | * |
9 | | * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 |
10 | | * Instructions, ACM Transactions on the Web 12 (3), 2018. |
11 | | * https://arxiv.org/abs/1704.00605 |
12 | | * |
13 | | * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. |
14 | | * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, |
15 | | * Request for Comments: 4648. |
16 | | * |
17 | | * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. |
18 | | * http://www.alfredklomp.com/programming/sse-base64/. (2014). |
19 | | * |
20 | | * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD |
21 | | * acceleration. https://github.com/aklomp/base64. (2014). |
22 | | * |
23 | | * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). |
24 | | * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ |
25 | | * |
26 | | * Nick Kopp. 2013. Base64 Encoding on a GPU. |
27 | | * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). |
28 | | */ |
29 | | |
// A 64-byte block of base64 data held in a single AVX-512 register.
// (The one-element array keeps the layout uniform with other kernels that
// use multiple chunks per block.)
struct block64 {
  __m512i chunks[1];
};
33 | | |
// Vectorized base64 encoder (AVX-512 VBMI): each loop iteration consumes 48
// input bytes and emits 64 base64 characters. When `use_lines` is true,
// '\n' characters are inserted so no output line exceeds `line_length`
// characters.
//
// Template parameters:
//   base64_url - use the URL-safe alphabet ('-','_') instead of '+','/'.
//   use_lines  - enable line wrapping.
//
// Parameters:
//   dst         - output buffer (caller is responsible for sizing it).
//   src/srclen  - input bytes.
//   options     - consulted only to decide whether '=' padding is written.
//   line_length - maximum line length when use_lines is true (clamped >= 4).
//
// Returns the number of bytes written to dst.
template <bool base64_url, bool use_lines>
size_t encode_base64_impl(char *dst, const char *src, size_t srclen,
                          base64_options options,
                          size_t line_length = simdutf::default_line_length) {
  // Number of characters already emitted on the current output line.
  size_t offset = 0;
  if (line_length < 4) {
    line_length = 4; // We do not support line_length less than 4
  }
  // credit: Wojciech Muła
  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;
  static const char *lookup_tbl =
      base64_url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  // Byte permutation that replicates/reorders the 48 input bytes so each
  // 32-bit lane holds one 3-byte group in the order the multishift expects.
  const __m512i shuffle_input = _mm512_setr_epi32(
      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
  const __m512i lookup =
      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
  // Per-byte shift amounts for vpmultishiftqb: extracts the four 6-bit
  // indices out of every 3-byte group.
  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
  size_t size = srclen;
  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
  // We want that input == end_input means that we must stop.
  const uint8_t *end_input = input + (size - (size % 48));
  while (input != end_input) {
    // Load 48 bytes and translate to 64 base64 characters.
    const __m512i v = _mm512_maskz_loadu_epi8(
        input_mask, reinterpret_cast<const __m512i *>(input));
    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
    if (use_lines) {
      if (offset + 64 > line_length) {
        // A line break falls somewhere inside these 64 characters.
        if (line_length >= 64) {
          // Fast path: at most one '\n' is needed. Expand the vector so a
          // newline lands at position (line_length - offset); the displaced
          // last character is written separately below.
          __m512i expanded = _mm512_mask_expand_epi8(
              _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
              result);
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
          __m128i last_lane =
              _mm512_extracti32x4_epi32(result, 3); // Lane 3 (bytes 48-63)
          uint8_t last_byte =
              static_cast<uint8_t>(_mm_extract_epi8(last_lane, 15));
          out[64] = last_byte;
          out += 65;
          offset = 64 - (line_length - offset);
        } else { // slow path
          // Short lines: several newlines may be required inside one chunk,
          // so spill the vector and copy byte by byte.
          alignas(64) uint8_t local_buffer[64];
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer),
                              result);
          size_t out_pos = 0;
          size_t local_offset = offset;
          for (size_t j = 0; j < 64;) {
            if (local_offset == line_length) {
              out[out_pos++] = '\n';
              local_offset = 0;
            }
            out[out_pos++] = local_buffer[j++];
            local_offset++;
          }
          offset = local_offset;
          out += out_pos;
        }
      } else {
        // The whole 64-character chunk fits on the current line.
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
        offset += 64;
        out += 64;
      }
    } else {
      _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
      out += 64;
    }
    input += 48;
  }
  size = size % 48;

  // Tail: encode the remaining (< 48) input bytes via a masked load.
  input_mask = ((__mmask64)1 << size) - 1;
  const __m512i v = _mm512_maskz_loadu_epi8(
      input_mask, reinterpret_cast<const __m512i *>(input));
  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
  // NOTE: `base64_url` here is the template parameter (value 0 or 1), which
  // matches the corresponding bit of base64_options.
  bool padding_needed =
      (((options & base64_url) == 0) ^
       ((options & base64_reverse_padding) == base64_reverse_padding));
  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
  size_t output_len = ((size + 2) / 3) * 4;
  size_t non_padded_output_len = output_len - padding_amount;
  if (!padding_needed) {
    output_len = non_padded_output_len;
  }
  // If no output, we are done.
  if (output_len == 0) {
    return (size_t)(out - (uint8_t *)dst);
  }
  __mmask64 output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len);
  // Characters past non_padded_output_len become '=' padding.
  __m512i result = _mm512_mask_permutexvar_epi8(
      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
      indices, lookup);
  if (use_lines) {
    if (offset + output_len > line_length) {
      if (line_length >= 64) {
        // At most one newline is needed within the tail.
        __m512i expanded = _mm512_mask_expand_epi8(
            _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
            result);
        if (output_len == 64) {
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
          out += 64;
          // Write the character the expand displaced (byte 63 of `result`).
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out - 63),
                                  1ULL << 63, result);
          out++;
        } else {
          // One extra output byte for the inserted newline.
          output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len - 1);
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                                  expanded);
          out += output_len + 1;
        }
      } else {
        // Slow path: short lines, possibly several newlines in the tail.
        alignas(64) uint8_t local_buffer[64];
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result);
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < output_len;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = local_buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                              result);
      out += output_len;
    }
  } else {
    _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                            result);
    out += output_len;
  }
  return (size_t)(out - (uint8_t *)dst);
}
179 | | |
180 | | template <bool base64_url> |
181 | | size_t encode_base64(char *dst, const char *src, size_t srclen, |
182 | 0 | base64_options options) { |
183 | 0 | return encode_base64_impl<base64_url, false>(dst, src, srclen, options); |
184 | 0 | } Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64<true>(char*, char const*, unsigned long, simdutf::base64_options) Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64<false>(char*, char const*, unsigned long, simdutf::base64_options) |
185 | | |
// Translate a 64-byte block of base64 ASCII into 6-bit values in place and
// return a bitmask of positions that are NOT valid base64 characters.
//
// On return, b->chunks[0] holds the decoded 6-bit value for every valid
// character. If `ignore_garbage` is false and the block contains bytes that
// are neither base64 characters nor ASCII whitespace, *error receives a
// bitmask of their positions (otherwise *error is left untouched). Lanes
// outside `input_mask` are reported as "bad" in the returned mask so callers
// can discard them uniformly.
template <bool base64_url, bool ignore_garbage, bool default_or_url>
static inline uint64_t to_base64_mask(block64 *b, uint64_t *error,
                                      uint64_t input_mask = UINT64_MAX) {
  __m512i input = b->chunks[0];
  // Per-low-nibble table of ASCII whitespace bytes (' ', '\t', '\n', '\f',
  // '\r'); used below to tell ignorable whitespace apart from real garbage.
  const __m512i ascii_space_tbl = _mm512_set_epi8(
      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
  // lookup0/lookup1 form a 128-entry translation table indexed by the input
  // byte via vpermi2b. Entries hold the decoded 6-bit value; negative
  // entries (-128, or -1 for whitespace candidates) have the sign bit set,
  // which flags the byte as "not base64" in the movemask below. Three table
  // variants: accept both alphabets, URL-safe only, or standard only.
  __m512i lookup0;
  if (default_or_url) {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, 63, -128, 62, -128, 62, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
  } else if (base64_url) {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
  } else {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
  }
  // Second half of the translation table (input bytes 0x40-0x7f).
  __m512i lookup1;
  if (default_or_url) {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  } else if (base64_url) {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  } else {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  }

  // vpermi2b: select a table entry per byte using the low 7 bits of input.
  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
  // OR-ing the input back in keeps the sign bit set for bytes >= 0x80, so
  // non-ASCII bytes are also flagged as invalid.
  const __m512i combined = _mm512_or_si512(translated, input);
  const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask;
  if (!ignore_garbage && mask) {
    // Separate ignorable ASCII whitespace from genuinely invalid bytes.
    const __mmask64 spaces =
        _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input),
                               input) &
        input_mask;
    *error = (mask ^ spaces);
  }
  b->chunks[0] = translated;

  // Lanes outside input_mask are reported as bad so the caller discards them.
  return mask | (~input_mask);
}
252 | | |
253 | 0 | static inline void copy_block(block64 *b, char *output) { |
254 | 0 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); |
255 | 0 | } |
256 | | |
257 | 0 | static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { |
258 | 0 | uint64_t nmask = ~mask; |
259 | 0 | __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); |
260 | 0 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); |
261 | 0 | return _mm_popcnt_u64(nmask); |
262 | 0 | } |
263 | | |
264 | | // The caller of this function is responsible to ensure that there are 64 bytes |
265 | | // available from reading at src. The data is read into a block64 structure. |
266 | 0 | static inline void load_block(block64 *b, const char *src) { |
267 | 0 | b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)); |
268 | 0 | } |
269 | | |
270 | | static inline void load_block_partial(block64 *b, const char *src, |
271 | 0 | __mmask64 input_mask) { |
272 | 0 | b->chunks[0] = _mm512_maskz_loadu_epi8( |
273 | 0 | input_mask, reinterpret_cast<const __m512i *>(src)); |
274 | 0 | } |
275 | | |
276 | | // The caller of this function is responsible to ensure that there are 128 bytes |
277 | | // available from reading at src. The data is read into a block64 structure. |
278 | 0 | static inline void load_block(block64 *b, const char16_t *src) { |
279 | 0 | __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)); |
280 | 0 | __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32)); |
281 | 0 | __m512i p = _mm512_packus_epi16(m1, m2); |
282 | 0 | b->chunks[0] = |
283 | 0 | _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); |
284 | 0 | } |
285 | | |
286 | | static inline void load_block_partial(block64 *b, const char16_t *src, |
287 | 0 | __mmask64 input_mask) { |
288 | 0 | __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask, |
289 | 0 | reinterpret_cast<const __m512i *>(src)); |
290 | 0 | __m512i m2 = |
291 | 0 | _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32), |
292 | 0 | reinterpret_cast<const __m512i *>(src + 32)); |
293 | 0 | __m512i p = _mm512_packus_epi16(m1, m2); |
294 | 0 | b->chunks[0] = |
295 | 0 | _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); |
296 | 0 | } |
297 | | |
// Decode 64 base64 6-bit values (one per byte of `str`, as produced by
// to_base64_mask) into 48 output bytes written at `out`.
static inline void base64_decode(char *out, __m512i str) {
  // Merge pairs of adjacent 6-bit fields into 12-bit values...
  const __m512i merge_ab_and_bc =
      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
  // ...then pairs of 12-bit values into 24-bit values, one per 32-bit lane.
  const __m512i merged =
      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
  // Gather the 3 meaningful bytes of each 32-bit lane into the low 48 bytes.
  const __m512i pack = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
      5, 6, 0, 1, 2);
  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
  _mm512_mask_storeu_epi8(
      (__m512i *)out, 0xffffffffffff,
      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
}
313 | | // decode 64 bytes and output 48 bytes |
314 | 0 | static inline void base64_decode_block(char *out, const char *src) { |
315 | 0 | base64_decode(out, |
316 | 0 | _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src))); |
317 | 0 | } |
318 | 0 | static inline void base64_decode_block(char *out, block64 *b) { |
319 | 0 | base64_decode(out, b->chunks[0]); |
320 | 0 | } |
321 | | |
// Decode base64 from `src` (char or char16_t) into `dst`, skipping ASCII
// whitespace. Processes 64 characters per iteration: blocks containing
// whitespace/garbage are compressed into a staging buffer, while clean
// blocks are decoded directly (48 output bytes per 64 input characters).
// The trailing partial 4-character chunk is resolved according to
// `last_chunk_options` (loose / strict / stop_before_partial /
// only_full_chunks).
//
// Returns a full_result {error code, consumed input length, output length}.
template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  (void)options;
  // Scalar decode table; used only when rewinding over the partial chunk.
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  // Locate trailing '=' padding and the effective end of the base64 payload.
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t padding_characters = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    // Padding with no data is invalid unless garbage is ignored.
    if (!ignore_garbage && padding_characters > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, full_input_length, 0};
  }
  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  // figure out why block_size == 2 is sometimes best???
  constexpr size_t block_size = 6;
  // Staging buffer for blocks from which invalid characters were removed.
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      uint64_t error = 0;
      uint64_t badcharmask =
          to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b,
                                                                     &error);
      if (!ignore_garbage && error) {
        // Report the exact position of the first invalid character.
        src -= 64;
        size_t error_offset = _tzcnt_u64(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        bufferptr += compress_block(&b, badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        // The staging buffer is non-empty: keep streaming through it so the
        // output stays in order.
        copy_block(&b, bufferptr);
        bufferptr += 64;
      } else {
        // Fast path: clean block, empty buffer — decode straight to dst.
        base64_decode_block(dst, &b);
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        // Staging buffer is nearly full: drain all but the last 64 bytes.
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  // Remaining (< 64) input characters go through a masked load.
  int last_block_len = (int)(srcend - src);
  if (last_block_len != 0) {
    __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1;
    block64 b;
    load_block_partial(&b, src, input_mask);
    uint64_t error = 0;
    uint64_t badcharmask =
        to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b, &error,
                                                                   input_mask);
    if (!ignore_garbage && error) {
      size_t error_offset = _tzcnt_u64(error);
      return {error_code::INVALID_BASE64_CHARACTER,
              size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
    }
    src += last_block_len;
    bufferptr += compress_block(&b, badcharmask, bufferptr);
  }

  // Decode the staged characters, 64 at a time.
  char *buffer_start = buffer;
  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) != 0) {
    // For efficiency reasons, we end up reproducing much of the code
    // in base64_tail_decode_impl. Better engineering would be to
    // refactor the code so that we can call it without a performance hit.
    size_t rem = (bufferptr - buffer_start);
    // idx = number of characters in the trailing partial 4-character chunk.
    int idx = rem % 4;
    __mmask64 mask = ((__mmask64)1 << rem) - 1;
    __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start);
    size_t output_len = (rem / 4) * 3;
    __mmask64 output_mask = mask >> (rem - output_len);
    // Same 6-bit -> byte merge as base64_decode, inlined here so the store
    // mask can be adjusted for the partial chunk.
    const __m512i merge_ab_and_bc =
        _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140));
    const __m512i merged =
        _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
    const __m512i pack = _mm512_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
        52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
        28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
        5, 6, 0, 1, 2);
    const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
    // We never should have that the number of base64 characters + the
    // number of padding characters is more than 4.
    if (!ignore_garbage && (idx + padding_characters > 4)) {
      return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
              size_t(dst - dstinit), true};
    }
    // The idea here is that in loose mode,
    // if there is padding at all, it must be used
    // to form 4-wise chunk. However, in loose mode,
    // we do accept no padding at all.
    if (!ignore_garbage &&
        last_chunk_options == last_chunk_handling_options::loose &&
        (idx >= 2) && padding_characters > 0 &&
        ((idx + padding_characters) & 3) != 0) {
      return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
              size_t(dst - dstinit), true};
    } else
      // The idea here is that in strict mode, we do not want to accept
      // incomplete base64 chunks. So if the chunk was otherwise valid, we
      // return BASE64_INPUT_REMAINDER.
      if (!ignore_garbage &&
          last_chunk_options == last_chunk_handling_options::strict &&
          (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
        // The partial chunk was at src - idx
        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
        dst += output_len;
        return {BASE64_INPUT_REMAINDER, equallocation, size_t(dst - dstinit)};
      } else
        // If there is a partial chunk with insufficient padding, with
        // stop_before_partial, we need to just ignore it. In "only full" mode,
        // skip the minute there are padding characters.
        if ((last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial &&
             (padding_characters + idx < 4) && (idx != 0) &&
             (idx >= 2 || padding_characters == 0)) ||
            (last_chunk_options ==
                 last_chunk_handling_options::only_full_chunks &&
             (idx >= 2 || padding_characters == 0))) {
          _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
          dst += output_len;
          // we need to rewind src to before the partial chunk
          size_t characters_to_skip = idx;
          while (characters_to_skip > 0) {
            src--;
            auto c = *src;
            uint8_t code = to_base64[uint8_t(c)];
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
              characters_to_skip--;
            }
          }
          // And then we need to skip ignored characters
          // See https://github.com/simdutf/simdutf/issues/824
          while (src > srcinit) {
            auto c = *(src - 1);
            uint8_t code = to_base64[uint8_t(c)];
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
              break;
            }
            src--;
          }
          return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
        } else {
          if (idx == 2) {
            // 2 trailing characters decode to 1 extra output byte.
            if (!ignore_garbage &&
                last_chunk_options == last_chunk_handling_options::strict) {
              // Strict mode rejects non-zero discarded bits (RFC 4648 §3.5).
              uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) +
                                (uint32_t(bufferptr[-1]) << 2 * 6);
              if (triple & 0xffff) {
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
                dst += output_len;
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
            }
            output_mask = (output_mask << 1) | 1;
            output_len += 1;
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          } else if (idx == 3) {
            // 3 trailing characters decode to 2 extra output bytes.
            if (!ignore_garbage &&
                last_chunk_options == last_chunk_handling_options::strict) {
              uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) +
                                (uint32_t(bufferptr[-2]) << 2 * 6) +
                                (uint32_t(bufferptr[-1]) << 1 * 6);
              if (triple & 0xff) {
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
                dst += output_len;
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
            }
            output_mask = (output_mask << 2) | 3;
            output_len += 2;
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          } else if (!ignore_garbage && idx == 1 &&
                     // A single leftover character can never be decoded.
                     (!is_partial(last_chunk_options) ||
                      (is_partial(last_chunk_options) &&
                       padding_characters > 0))) {
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
            return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                    size_t(dst - dstinit)};
          } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
            // Padding that does not complete any chunk.
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
            return {INVALID_BASE64_CHARACTER, equallocation,
                    size_t(dst - dstinit)};
          } else {
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          }
        }
    if (!ignore_garbage && !is_partial(last_chunk_options) &&
        padding_characters > 0) {
      size_t output_count = size_t(dst - dstinit);
      // Validate padding consistency: a 4-character chunk carries at most
      // two '=' signs, and they must complete the final chunk.
      if ((output_count % 3 == 0) ||
          ((output_count % 3) + 1 + padding_characters != 4)) {
        return {INVALID_BASE64_CHARACTER, equallocation, output_count};
      }
    }
    return {SUCCESS, full_input_length, size_t(dst - dstinit)};
  }

  // No leftover characters in the staging buffer.
  if (!ignore_garbage && padding_characters > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + padding_characters != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}