/work/include/simdutf/scalar/base64.h
Line | Count | Source |
1 | | #ifndef SIMDUTF_BASE64_H |
2 | | #define SIMDUTF_BASE64_H |
3 | | |
4 | | #include <algorithm> |
5 | | #include <cstddef> |
6 | | #include <cstdint> |
7 | | #include <cstring> |
8 | | #include <iostream> |
9 | | |
10 | | namespace simdutf { |
11 | | namespace scalar { |
12 | | namespace { |
13 | | namespace base64 { |
14 | | |
// Tells whether `c` is one of the five ASCII white-space characters: space,
// horizontal tab, line feed, carriage return or form feed.
// This function is not expected to be fast. Do not use in long loops.
// In most instances you should be using is_ignorable.
template <class char_type> bool is_ascii_white_space(char_type c) {
  switch (c) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '\f':
    return true;
  default:
    return false;
  }
}
20 | | |
21 | 0 | template <class char_type> simdutf_constexpr23 bool is_eight_byte(char_type c) { |
22 | 0 | if simdutf_constexpr (sizeof(char_type) == 1) { |
23 | 0 | return true; |
24 | 0 | } |
25 | 0 | return uint8_t(c) == c; |
26 | 0 | } Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_eight_byte<char>(char) Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_eight_byte<char16_t>(char16_t) |
27 | | |
28 | | template <class char_type> |
29 | | simdutf_constexpr23 bool is_ignorable(char_type c, |
30 | 0 | simdutf::base64_options options) { |
31 | 0 | const uint8_t *to_base64 = |
32 | 0 | (options & base64_default_or_url) |
33 | 0 | ? tables::base64::to_base64_default_or_url_value |
34 | 0 | : ((options & base64_url) ? tables::base64::to_base64_url_value |
35 | 0 | : tables::base64::to_base64_value); |
36 | 0 | const bool ignore_garbage = |
37 | 0 | (options == base64_options::base64_url_accept_garbage) || |
38 | 0 | (options == base64_options::base64_default_accept_garbage) || |
39 | 0 | (options == base64_options::base64_default_or_url_accept_garbage); |
40 | 0 | uint8_t code = to_base64[uint8_t(c)]; |
41 | 0 | if (is_eight_byte(c) && code <= 63) { |
42 | 0 | return false; |
43 | 0 | } |
44 | 0 | if (is_eight_byte(c) && code == 64) { |
45 | 0 | return true; |
46 | 0 | } |
47 | 0 | return ignore_garbage; |
48 | 0 | } Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_ignorable<char>(char, simdutf::base64_options) Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_ignorable<char16_t>(char16_t, simdutf::base64_options) |
49 | | template <class char_type> |
50 | | simdutf_constexpr23 bool is_base64(char_type c, |
51 | 0 | simdutf::base64_options options) { |
52 | 0 | const uint8_t *to_base64 = |
53 | 0 | (options & base64_default_or_url) |
54 | 0 | ? tables::base64::to_base64_default_or_url_value |
55 | 0 | : ((options & base64_url) ? tables::base64::to_base64_url_value |
56 | 0 | : tables::base64::to_base64_value); |
57 | 0 | uint8_t code = to_base64[uint8_t(c)]; |
58 | 0 | if (is_eight_byte(c) && code <= 63) { |
59 | 0 | return true; |
60 | 0 | } |
61 | 0 | return false; |
62 | 0 | } Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_base64<char>(char, simdutf::base64_options) Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_base64<char16_t>(char16_t, simdutf::base64_options) |
63 | | |
64 | | template <class char_type> |
65 | | simdutf_constexpr23 bool is_base64_or_padding(char_type c, |
66 | 0 | simdutf::base64_options options) { |
67 | 0 | const uint8_t *to_base64 = |
68 | 0 | (options & base64_default_or_url) |
69 | 0 | ? tables::base64::to_base64_default_or_url_value |
70 | 0 | : ((options & base64_url) ? tables::base64::to_base64_url_value |
71 | 0 | : tables::base64::to_base64_value); |
72 | 0 | if (c == '=') { |
73 | 0 | return true; |
74 | 0 | } |
75 | 0 | uint8_t code = to_base64[uint8_t(c)]; |
76 | 0 | if (is_eight_byte(c) && code <= 63) { |
77 | 0 | return true; |
78 | 0 | } |
79 | 0 | return false; |
80 | 0 | } Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_base64_or_padding<char>(char, simdutf::base64_options) Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::base64::is_base64_or_padding<char16_t>(char16_t, simdutf::base64_options) |
81 | | |
82 | | template <class char_type> |
83 | | bool is_ignorable_or_padding(char_type c, simdutf::base64_options options) { |
84 | | return is_ignorable(c, options) || c == '='; |
85 | | } |
86 | | |
// Result of find_end: describes where the payload of a base64 input stops and
// how much padding follows it. The struct is brace-initialized positionally
// ({equalsigns, equallocation, srclen, full_input_length}), so the order of
// the fields must not change.
struct reduced_input {
  size_t equalsigns;    // number of padding characters '=', typically 0, 1, 2.
  size_t equallocation; // location of the first padding character if any;
                        // find_end sets it to srclen when there is none
  size_t srclen;        // length of the input buffer before padding
  size_t full_input_length; // length of the input that is considered consumed
                            // on success: find_end uses the original length
                            // (trailing ignorable characters included) or,
                            // when garbage is accepted and a '=' is found,
                            // the position just past that '='
};
94 | | |
95 | | // find the end of the base64 input buffer |
96 | | // It returns the number of padding characters, the location of the first |
97 | | // padding character if any, the length of the input buffer before padding |
98 | | // and the length of the input buffer with padding. The input buffer is not |
99 | | // modified. The function assumes that there are at most two padding characters. |
100 | | template <class char_type> |
101 | | simdutf_constexpr23 reduced_input find_end(const char_type *src, size_t srclen, |
102 | | simdutf::base64_options options) { |
103 | | const uint8_t *to_base64 = |
104 | | (options & base64_default_or_url) |
105 | | ? tables::base64::to_base64_default_or_url_value |
106 | | : ((options & base64_url) ? tables::base64::to_base64_url_value |
107 | | : tables::base64::to_base64_value); |
108 | | const bool ignore_garbage = |
109 | | (options == base64_options::base64_url_accept_garbage) || |
110 | | (options == base64_options::base64_default_accept_garbage) || |
111 | | (options == base64_options::base64_default_or_url_accept_garbage); |
112 | | |
113 | | size_t equalsigns = 0; |
114 | | // We intentionally include trailing spaces in the full input length. |
115 | | // See https://github.com/simdutf/simdutf/issues/824 |
116 | | size_t full_input_length = srclen; |
117 | | // skip trailing spaces |
118 | | while (!ignore_garbage && srclen > 0 && |
119 | | scalar::base64::is_eight_byte(src[srclen - 1]) && |
120 | | to_base64[uint8_t(src[srclen - 1])] == 64) { |
121 | | srclen--; |
122 | | } |
123 | | size_t equallocation = |
124 | | srclen; // location of the first padding character if any |
125 | | if (ignore_garbage) { |
126 | | // Technically, we don't need to find the first padding character, we can |
127 | | // just change our algorithms, but it adds substantial complexity. |
128 | | auto it = simdutf::find(src, src + srclen, '='); |
129 | | if (it != src + srclen) { |
130 | | equallocation = it - src; |
131 | | equalsigns = 1; |
132 | | srclen = equallocation; |
133 | | full_input_length = equallocation + 1; |
134 | | } |
135 | | return {equalsigns, equallocation, srclen, full_input_length}; |
136 | | } |
137 | | if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') { |
138 | | // This is the last '=' sign. |
139 | | equallocation = srclen - 1; |
140 | | srclen--; |
141 | | equalsigns = 1; |
142 | | // skip trailing spaces |
143 | | while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && |
144 | | to_base64[uint8_t(src[srclen - 1])] == 64) { |
145 | | srclen--; |
146 | | } |
147 | | if (srclen > 0 && src[srclen - 1] == '=') { |
148 | | // This is the second '=' sign. |
149 | | equallocation = srclen - 1; |
150 | | srclen--; |
151 | | equalsigns = 2; |
152 | | } |
153 | | } |
154 | | return {equalsigns, equallocation, srclen, full_input_length}; |
155 | | } |
156 | | |
// Scalar base64 decoder: decodes src[0, length) into dst and returns a
// full_result made of an error code, the number of input characters consumed
// and the number of bytes written (some error returns pass a fourth `true`
// initializer, which presumably sets full_result::padding_error -- confirm
// against the full_result definition).
//
// The destination buffer must be large enough. This function assumes that the
// padding (=) has been removed from the input; the caller passes the number
// of removed '=' in padding_characters so that the last chunk can be
// validated.
// If check_capacity is true, it will check that the destination buffer is
// large enough. If it is not, it will return OUTPUT_BUFFER_TOO_SMALL.
// If check_capacity is false, outlen is not used for any check.
template <bool check_capacity, class char_type>
simdutf_constexpr23 full_result base64_tail_decode_impl(
    char *dst, size_t outlen, const char_type *src, size_t length,
    size_t padding_characters, // number of padding characters
                               // '=', typically 0, 1, 2.
    base64_options options, last_chunk_handling_options last_chunk_options) {
  char *dstend = dst + outlen;
  (void)dstend; // only read when check_capacity is true
  // This looks like 10 branches, but we expect the compiler to resolve this to
  // two branches (easily predicted):
  const uint8_t *to_base64 =
      (options & base64_default_or_url)
          ? tables::base64::to_base64_default_or_url_value
          : ((options & base64_url) ? tables::base64::to_base64_url_value
                                    : tables::base64::to_base64_value);
  // d0..d3 are per-position lookup tables: OR-ing one entry of each for four
  // consecutive characters yields the three decoded bytes in the low 24 bits
  // (see the fast loop below).
  const uint32_t *d0 =
      (options & base64_default_or_url)
          ? tables::base64::base64_default_or_url::d0
          : ((options & base64_url) ? tables::base64::base64_url::d0
                                    : tables::base64::base64_default::d0);
  const uint32_t *d1 =
      (options & base64_default_or_url)
          ? tables::base64::base64_default_or_url::d1
          : ((options & base64_url) ? tables::base64::base64_url::d1
                                    : tables::base64::base64_default::d1);
  const uint32_t *d2 =
      (options & base64_default_or_url)
          ? tables::base64::base64_default_or_url::d2
          : ((options & base64_url) ? tables::base64::base64_url::d2
                                    : tables::base64::base64_default::d2);
  const uint32_t *d3 =
      (options & base64_default_or_url)
          ? tables::base64::base64_default_or_url::d3
          : ((options & base64_url) ? tables::base64::base64_url::d3
                                    : tables::base64::base64_default::d3);
  const bool ignore_garbage =
      (options == base64_options::base64_url_accept_garbage) ||
      (options == base64_options::base64_default_accept_garbage) ||
      (options == base64_options::base64_default_or_url_accept_garbage);

  const char_type *srcend = src + length;
  const char_type *srcinit = src; // kept to report consumed input
  const char *dstinit = dst;      // kept to report written output

  uint32_t x;
  size_t idx;
  uint8_t buffer[4]; // 6-bit values of the chunk being assembled
  while (true) {
    // Fast loop: four eight-bit characters at a time. The loop stops as soon
    // as the combined table value is not below 0x01FFFFFF, which is
    // presumably how the tables flag a character that is not in the alphabet.
    while (srcend - src >= 4 && is_eight_byte(src[0]) &&
           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
           is_eight_byte(src[3]) &&
           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
      if (check_capacity && dstend - dst < 3) {
        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      *dst++ = static_cast<char>(x & 0xFF);
      *dst++ = static_cast<char>((x >> 8) & 0xFF);
      *dst++ = static_cast<char>((x >> 16) & 0xFF);
      src += 4;
    }
    // Slow path: gather up to four base64 values one character at a time,
    // skipping ignorable characters. srccur remembers where this chunk
    // starts so that it can be left unconsumed.
    const char_type *srccur = src;
    idx = 0;
    // we need at least four characters.
#ifdef __clang__
    // If possible, we read four characters at a time. (It is an optimization.)
    if (ignore_garbage && src + 4 <= srcend) {
      char_type c0 = src[0];
      char_type c1 = src[1];
      char_type c2 = src[2];
      char_type c3 = src[3];

      uint8_t code0 = to_base64[uint8_t(c0)];
      uint8_t code1 = to_base64[uint8_t(c1)];
      uint8_t code2 = to_base64[uint8_t(c2)];
      uint8_t code3 = to_base64[uint8_t(c3)];

      // Each code is stored unconditionally; idx only advances when the
      // character is a base64 character, so other codes get overwritten.
      buffer[idx] = code0;
      idx += (is_eight_byte(c0) && code0 <= 63);
      buffer[idx] = code1;
      idx += (is_eight_byte(c1) && code1 <= 63);
      buffer[idx] = code2;
      idx += (is_eight_byte(c2) && code2 <= 63);
      buffer[idx] = code3;
      idx += (is_eight_byte(c3) && code3 <= 63);
      src += 4;
    }
#endif
    while ((idx < 4) && (src < srcend)) {
      char_type c = *src;

      uint8_t code = to_base64[uint8_t(c)];
      buffer[idx] = uint8_t(code);
      if (is_eight_byte(c) && code <= 63) {
        idx++;
      } else if (!ignore_garbage &&
                 (code > 64 || !scalar::base64::is_eight_byte(c))) {
        // Neither a base64 character nor an ignorable one (code 64).
        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      } else {
        // We have a space or a newline or garbage. We ignore it.
      }
      src++;
    }
    if (idx != 4) {
      // The input is exhausted with an incomplete (possibly empty) last
      // chunk: every branch below returns.
      simdutf_log_assert(idx < 4, "idx should be less than 4");
      // We never should have that the number of base64 characters + the
      // number of padding characters is more than 4.
      if (!ignore_garbage && (idx + padding_characters > 4)) {
        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit), true};
      }

      // The idea here is that in loose mode,
      // if there is padding at all, it must be used
      // to form 4-wise chunk. However, in loose mode,
      // we do accept no padding at all.
      if (!ignore_garbage &&
          last_chunk_options == last_chunk_handling_options::loose &&
          (idx >= 2) && padding_characters > 0 &&
          ((idx + padding_characters) & 3) != 0) {
        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit), true};
      } else

        // The idea here is that in strict mode, we do not want to accept
        // incomplete base64 chunks. So if the chunk was otherwise valid, we
        // return BASE64_INPUT_REMAINDER.
        if (!ignore_garbage &&
            last_chunk_options == last_chunk_handling_options::strict &&
            (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
          // The partial chunk was at src - idx
          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                  size_t(dst - dstinit), true};
        } else
          // If there is a partial chunk with insufficient padding, with
          // stop_before_partial, we need to just ignore it. In "only full"
          // mode, skip the minute there are padding characters.
          if ((last_chunk_options ==
                   last_chunk_handling_options::stop_before_partial &&
               (padding_characters + idx < 4) && (idx != 0) &&
               (idx >= 2 || padding_characters == 0)) ||
              (last_chunk_options ==
                   last_chunk_handling_options::only_full_chunks &&
               (idx >= 2 || padding_characters == 0))) {
            // partial means that we are *not* going to consume the read
            // characters. We need to rewind the src pointer.
            src = srccur;
            return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
          } else {
            if (idx == 2) {
              // Two characters carry 12 bits: one output byte.
              uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
                                (uint32_t(buffer[1]) << 2 * 6);
              // In strict mode the bits that do not end up in the output
              // must be zero.
              if (!ignore_garbage &&
                  (last_chunk_options == last_chunk_handling_options::strict) &&
                  (triple & 0xffff)) {
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
              if (check_capacity && dstend - dst < 1) {
                return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
                        size_t(dst - dstinit)};
              }
              *dst++ = static_cast<char>((triple >> 16) & 0xFF);
            } else if (idx == 3) {
              // Three characters carry 18 bits: two output bytes.
              uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
                                (uint32_t(buffer[1]) << 2 * 6) +
                                (uint32_t(buffer[2]) << 1 * 6);
              // In strict mode the bits that do not end up in the output
              // must be zero.
              if (!ignore_garbage &&
                  (last_chunk_options == last_chunk_handling_options::strict) &&
                  (triple & 0xff)) {
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
              if (check_capacity && dstend - dst < 2) {
                return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
                        size_t(dst - dstinit)};
              }
              *dst++ = static_cast<char>((triple >> 16) & 0xFF);
              *dst++ = static_cast<char>((triple >> 8) & 0xFF);
            } else if (!ignore_garbage && idx == 1 &&
                       (!is_partial(last_chunk_options) ||
                        (is_partial(last_chunk_options) &&
                         padding_characters > 0))) {
              // A single leftover character cannot encode a byte.
              return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                      size_t(dst - dstinit)};
            } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
              // Padding that does not follow any character of a chunk.
              return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                      size_t(dst - dstinit), true};
            }
            return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
          }
    }
    // A full chunk of four values was gathered: emit three bytes and go back
    // to the fast loop.
    if (check_capacity && dstend - dst < 3) {
      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
              size_t(dst - dstinit)};
    }
    uint32_t triple =
        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
    *dst++ = static_cast<char>((triple >> 16) & 0xFF);
    *dst++ = static_cast<char>((triple >> 8) & 0xFF);
    *dst++ = static_cast<char>(triple & 0xFF);
  }
}
367 | | |
368 | | template <class char_type> |
369 | | simdutf_constexpr23 full_result base64_tail_decode( |
370 | | char *dst, const char_type *src, size_t length, |
371 | | size_t padding_characters, // number of padding characters |
372 | | // '=', typically 0, 1, 2. |
373 | | base64_options options, last_chunk_handling_options last_chunk_options) { |
374 | | return base64_tail_decode_impl<false>(dst, 0, src, length, padding_characters, |
375 | | options, last_chunk_options); |
376 | | } |
377 | | |
378 | | // like base64_tail_decode, but it will not write past the end of the output |
379 | | // buffer. The outlen parameter is modified to reflect the number of bytes |
380 | | // written. This functions assumes that the padding (=) has been removed. |
381 | | // |
382 | | template <class char_type> |
383 | | simdutf_constexpr23 full_result base64_tail_decode_safe( |
384 | | char *dst, size_t outlen, const char_type *src, size_t length, |
385 | | size_t padding_characters, // number of padding characters |
386 | | // '=', typically 0, 1, 2. |
387 | | base64_options options, last_chunk_handling_options last_chunk_options) { |
388 | | return base64_tail_decode_impl<true>(dst, outlen, src, length, |
389 | | padding_characters, options, |
390 | | last_chunk_options); |
391 | | } |
392 | | |
393 | | inline simdutf_constexpr23 full_result |
394 | | patch_tail_result(full_result r, size_t previous_input, size_t previous_output, |
395 | | size_t equallocation, size_t full_input_length, |
396 | 0 | last_chunk_handling_options last_chunk_options) { |
397 | 0 | r.input_count += previous_input; |
398 | 0 | r.output_count += previous_output; |
399 | 0 | if (r.padding_error) { |
400 | 0 | r.input_count = equallocation; |
401 | 0 | } |
402 | 0 |
|
403 | 0 | if (r.error == error_code::SUCCESS) { |
404 | 0 | if (!is_partial(last_chunk_options)) { |
405 | 0 | // A success when we are not in stop_before_partial mode. |
406 | 0 | // means that we have consumed the whole input buffer. |
407 | 0 | r.input_count = full_input_length; |
408 | 0 | } else if (r.output_count % 3 != 0) { |
409 | 0 | r.input_count = full_input_length; |
410 | 0 | } |
411 | 0 | } |
412 | 0 | return r; |
413 | 0 | } |
414 | | |
415 | | // Returns the number of bytes written. The destination buffer must be large |
416 | | // enough. It will add padding (=) if needed. |
417 | | template <bool use_lines = false> |
418 | | simdutf_constexpr23 size_t tail_encode_base64_impl( |
419 | | char *dst, const char *src, size_t srclen, base64_options options, |
420 | 0 | size_t line_length = simdutf::default_line_length, size_t line_offset = 0) { |
421 | 0 | if simdutf_constexpr (use_lines) { |
422 | 0 | // sanitize line_length and starting_line_offset. |
423 | 0 | // line_length must be greater than 3. |
424 | 0 | if (line_length < 4) { |
425 | 0 | line_length = 4; |
426 | 0 | } |
427 | 0 | simdutf_log_assert(line_offset <= line_length, |
428 | 0 | "line_offset should be less than line_length"); |
429 | 0 | } |
430 | 0 | // By default, we use padding if we are not using the URL variant. |
431 | 0 | // This is check with ((options & base64_url) == 0) which returns true if we |
432 | 0 | // are not using the URL variant. However, we also allow 'inversion' of the |
433 | 0 | // convention with the base64_reverse_padding option. If the |
434 | 0 | // base64_reverse_padding option is set, we use padding if we are using the |
435 | 0 | // URL variant, and we omit it if we are not using the URL variant. This is |
436 | 0 | // checked with |
437 | 0 | // ((options & base64_reverse_padding) == base64_reverse_padding). |
438 | 0 | bool use_padding = |
439 | 0 | ((options & base64_url) == 0) ^ |
440 | 0 | ((options & base64_reverse_padding) == base64_reverse_padding); |
441 | 0 | // This looks like 3 branches, but we expect the compiler to resolve this to |
442 | 0 | // a single branch: |
443 | 0 | const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 |
444 | 0 | : tables::base64::base64_default::e0; |
445 | 0 | const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 |
446 | 0 | : tables::base64::base64_default::e1; |
447 | 0 | const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 |
448 | 0 | : tables::base64::base64_default::e2; |
449 | 0 | char *out = dst; |
450 | 0 | size_t i = 0; |
451 | 0 | uint8_t t1, t2, t3; |
452 | 0 | for (; i + 2 < srclen; i += 3) { |
453 | 0 | t1 = uint8_t(src[i]); |
454 | 0 | t2 = uint8_t(src[i + 1]); |
455 | 0 | t3 = uint8_t(src[i + 2]); |
456 | 0 | if simdutf_constexpr (use_lines) { |
457 | 0 | if (line_offset + 3 >= line_length) { |
458 | 0 | if (line_offset == line_length) { |
459 | 0 | *out++ = '\n'; |
460 | 0 | *out++ = e0[t1]; |
461 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
462 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
463 | 0 | *out++ = e2[t3]; |
464 | 0 | line_offset = 4; |
465 | 0 | } else if (line_offset + 1 == line_length) { |
466 | 0 | *out++ = e0[t1]; |
467 | 0 | *out++ = '\n'; |
468 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
469 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
470 | 0 | *out++ = e2[t3]; |
471 | 0 | line_offset = 3; |
472 | 0 | } else if (line_offset + 2 == line_length) { |
473 | 0 | *out++ = e0[t1]; |
474 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
475 | 0 | *out++ = '\n'; |
476 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
477 | 0 | *out++ = e2[t3]; |
478 | 0 | line_offset = 2; |
479 | 0 | } else if (line_offset + 3 == line_length) { |
480 | 0 | *out++ = e0[t1]; |
481 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
482 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
483 | 0 | *out++ = '\n'; |
484 | 0 | *out++ = e2[t3]; |
485 | 0 | line_offset = 1; |
486 | 0 | } |
487 | 0 | } else { |
488 | 0 | *out++ = e0[t1]; |
489 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
490 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
491 | 0 | *out++ = e2[t3]; |
492 | 0 | line_offset += 4; |
493 | 0 | } |
494 | 0 | } else { |
495 | 0 | *out++ = e0[t1]; |
496 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
497 | 0 | *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; |
498 | 0 | *out++ = e2[t3]; |
499 | 0 | } |
500 | 0 | } |
501 | 0 | switch (srclen - i) { |
502 | 0 | case 0: |
503 | 0 | break; |
504 | 0 | case 1: |
505 | 0 | t1 = uint8_t(src[i]); |
506 | 0 | if simdutf_constexpr (use_lines) { |
507 | 0 | if (use_padding) { |
508 | 0 | if (line_offset + 3 >= line_length) { |
509 | 0 | if (line_offset == line_length) { |
510 | 0 | *out++ = '\n'; |
511 | 0 | *out++ = e0[t1]; |
512 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
513 | 0 | *out++ = '='; |
514 | 0 | *out++ = '='; |
515 | 0 | } else if (line_offset + 1 == line_length) { |
516 | 0 | *out++ = e0[t1]; |
517 | 0 | *out++ = '\n'; |
518 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
519 | 0 | *out++ = '='; |
520 | 0 | *out++ = '='; |
521 | 0 | } else if (line_offset + 2 == line_length) { |
522 | 0 | *out++ = e0[t1]; |
523 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
524 | 0 | *out++ = '\n'; |
525 | 0 | *out++ = '='; |
526 | 0 | *out++ = '='; |
527 | 0 | } else if (line_offset + 3 == line_length) { |
528 | 0 | *out++ = e0[t1]; |
529 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
530 | 0 | *out++ = '='; |
531 | 0 | *out++ = '\n'; |
532 | 0 | *out++ = '='; |
533 | 0 | } |
534 | 0 | } else { |
535 | 0 | *out++ = e0[t1]; |
536 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
537 | 0 | *out++ = '='; |
538 | 0 | *out++ = '='; |
539 | 0 | } |
540 | 0 | } else { |
541 | 0 | if (line_offset + 2 >= line_length) { |
542 | 0 | if (line_offset == line_length) { |
543 | 0 | *out++ = '\n'; |
544 | 0 | *out++ = e0[uint8_t(src[i])]; |
545 | 0 | *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; |
546 | 0 | } else if (line_offset + 1 == line_length) { |
547 | 0 | *out++ = e0[uint8_t(src[i])]; |
548 | 0 | *out++ = '\n'; |
549 | 0 | *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; |
550 | 0 | } else { |
551 | 0 | *out++ = e0[uint8_t(src[i])]; |
552 | 0 | *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; |
553 | 0 | // *out++ = '\n'; ==> no newline at the end of the output |
554 | 0 | } |
555 | 0 | } else { |
556 | 0 | *out++ = e0[uint8_t(src[i])]; |
557 | 0 | *out++ = e1[(uint8_t(src[i]) & 0x03) << 4]; |
558 | 0 | } |
559 | 0 | } |
560 | 0 | } else { |
561 | 0 | *out++ = e0[t1]; |
562 | 0 | *out++ = e1[(t1 & 0x03) << 4]; |
563 | 0 | if (use_padding) { |
564 | 0 | *out++ = '='; |
565 | 0 | *out++ = '='; |
566 | 0 | } |
567 | 0 | } |
568 | 0 | break; |
569 | 0 | default: /* case 2 */ |
570 | 0 | t1 = uint8_t(src[i]); |
571 | 0 | t2 = uint8_t(src[i + 1]); |
572 | 0 | if simdutf_constexpr (use_lines) { |
573 | 0 | if (use_padding) { |
574 | 0 | if (line_offset + 3 >= line_length) { |
575 | 0 | if (line_offset == line_length) { |
576 | 0 | *out++ = '\n'; |
577 | 0 | *out++ = e0[t1]; |
578 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
579 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
580 | 0 | *out++ = '='; |
581 | 0 | } else if (line_offset + 1 == line_length) { |
582 | 0 | *out++ = e0[t1]; |
583 | 0 | *out++ = '\n'; |
584 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
585 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
586 | 0 | *out++ = '='; |
587 | 0 | } else if (line_offset + 2 == line_length) { |
588 | 0 | *out++ = e0[t1]; |
589 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
590 | 0 | *out++ = '\n'; |
591 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
592 | 0 | *out++ = '='; |
593 | 0 | } else if (line_offset + 3 == line_length) { |
594 | 0 | *out++ = e0[t1]; |
595 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
596 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
597 | 0 | *out++ = '\n'; |
598 | 0 | *out++ = '='; |
599 | 0 | } |
600 | 0 | } else { |
601 | 0 | *out++ = e0[t1]; |
602 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
603 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
604 | 0 | *out++ = '='; |
605 | 0 | } |
606 | 0 | } else { |
607 | 0 | if (line_offset + 3 >= line_length) { |
608 | 0 | if (line_offset == line_length) { |
609 | 0 | *out++ = '\n'; |
610 | 0 | *out++ = e0[t1]; |
611 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
612 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
613 | 0 | } else if (line_offset + 1 == line_length) { |
614 | 0 | *out++ = e0[t1]; |
615 | 0 | *out++ = '\n'; |
616 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
617 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
618 | 0 | } else if (line_offset + 2 == line_length) { |
619 | 0 | *out++ = e0[t1]; |
620 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
621 | 0 | *out++ = '\n'; |
622 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
623 | 0 | } else { |
624 | 0 | *out++ = e0[t1]; |
625 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
626 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
627 | 0 | // *out++ = '\n'; ==> no newline at the end of the output |
628 | 0 | } |
629 | 0 | } else { |
630 | 0 | *out++ = e0[t1]; |
631 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
632 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
633 | 0 | } |
634 | 0 | } |
635 | 0 | } else { |
636 | 0 | *out++ = e0[t1]; |
637 | 0 | *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; |
638 | 0 | *out++ = e2[(t2 & 0x0F) << 2]; |
639 | 0 | if (use_padding) { |
640 | 0 | *out++ = '='; |
641 | 0 | } |
642 | 0 | } |
643 | 0 | } |
644 | 0 | return (size_t)(out - dst); |
645 | 0 | } |
646 | | |
647 | | // Returns the number of bytes written. The destination buffer must be large |
648 | | // enough. It will add padding (=) if needed. |
649 | | inline simdutf_constexpr23 size_t tail_encode_base64(char *dst, const char *src, |
650 | | size_t srclen, |
651 | 0 | base64_options options) { |
652 | 0 | return tail_encode_base64_impl(dst, src, srclen, options); |
653 | 0 | } |
654 | | |
655 | | template <class InputPtr> |
656 | | simdutf_warn_unused simdutf_constexpr23 size_t |
657 | | maximal_binary_length_from_base64(InputPtr input, size_t length) noexcept { |
658 | | // We process the padding characters ('=') at the end to make sure |
659 | | // that we return an exact result when the input has no ignorable characters |
660 | | // (e.g., spaces). |
661 | | size_t padding = 0; |
662 | | if (length > 0) { |
663 | | if (input[length - 1] == '=') { |
664 | | padding++; |
665 | | if (length > 1 && input[length - 2] == '=') { |
666 | | padding++; |
667 | | } |
668 | | } |
669 | | } |
670 | | // The input is not otherwise processed for ignorable characters or |
671 | | // validation, so that the function runs in constant time (very fast). In |
672 | | // practice, base64 inputs without ignorable characters are common and the |
673 | | // common case are line separated inputs with relatively long lines (e.g., 76 |
674 | | // characters) which leads this function to a slight (1%) overestimation of |
675 | | // the output size. |
676 | | // |
677 | | // Of course, some inputs might contain an arbitrary number of spaces or |
678 | | // newlines, which would make this function return a very pessimistic output |
679 | | // size but systems that produce base64 outputs typically do not do that and |
680 | | // if they do, they do not care much about minimizing memory usage. |
681 | | // |
682 | | // In specialized applications, users may know that their input is line |
683 | | // separated, which can be checked very quickly by by iterating (e.g., over 76 |
684 | | // character chunks, looking for the linefeed characters only). We could |
685 | | // provide a specialized function for that, but it is not clear that the added |
686 | | // complexity is worth it for us. |
687 | | // |
688 | | size_t actual_length = length - padding; |
689 | | if (actual_length % 4 <= 1) { |
690 | | return actual_length / 4 * 3; |
691 | | } |
692 | | // if we have a valid input, then the remainder must be 2 or 3 adding one or |
693 | | // two extra bytes. |
694 | | return actual_length / 4 * 3 + (actual_length % 4) - 1; |
695 | | } |
696 | | |
697 | | template <typename char_type> |
698 | | simdutf_warn_unused simdutf_constexpr23 full_result |
699 | | base64_to_binary_details_impl( |
700 | | const char_type *input, size_t length, char *output, base64_options options, |
701 | | last_chunk_handling_options last_chunk_options) noexcept { |
702 | | const bool ignore_garbage = |
703 | | (options == base64_options::base64_url_accept_garbage) || |
704 | | (options == base64_options::base64_default_accept_garbage) || |
705 | | (options == base64_options::base64_default_or_url_accept_garbage); |
706 | | auto ri = simdutf::scalar::base64::find_end(input, length, options); |
707 | | size_t equallocation = ri.equallocation; |
708 | | size_t equalsigns = ri.equalsigns; |
709 | | length = ri.srclen; |
710 | | size_t full_input_length = ri.full_input_length; |
711 | | if (length == 0) { |
712 | | if (!ignore_garbage && equalsigns > 0) { |
713 | | return {INVALID_BASE64_CHARACTER, equallocation, 0}; |
714 | | } |
715 | | return {SUCCESS, full_input_length, 0}; |
716 | | } |
717 | | full_result r = scalar::base64::base64_tail_decode( |
718 | | output, input, length, equalsigns, options, last_chunk_options); |
719 | | r = scalar::base64::patch_tail_result(r, 0, 0, equallocation, |
720 | | full_input_length, last_chunk_options); |
721 | | if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS && |
722 | | equalsigns > 0 && !ignore_garbage) { |
723 | | // additional checks |
724 | | if ((r.output_count % 3 == 0) || |
725 | | ((r.output_count % 3) + 1 + equalsigns != 4)) { |
726 | | return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; |
727 | | } |
728 | | } |
729 | | // When is_partial(last_chunk_options) is true, we must either end with |
730 | | // the end of the stream (beyond whitespace) or right after a non-ignorable |
731 | | // character or at the very beginning of the stream. |
732 | | // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 |
733 | | if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && |
734 | | r.input_count < full_input_length) { |
735 | | // First check if we can extend the input to the end of the stream |
736 | | while (r.input_count < full_input_length && |
737 | | base64_ignorable(*(input + r.input_count), options)) { |
738 | | r.input_count++; |
739 | | } |
740 | | // If we are still not at the end of the stream, then we must backtrack |
741 | | // to the last non-ignorable character. |
742 | | if (r.input_count < full_input_length) { |
743 | | while (r.input_count > 0 && |
744 | | base64_ignorable(*(input + r.input_count - 1), options)) { |
745 | | r.input_count--; |
746 | | } |
747 | | } |
748 | | } |
749 | | return r; |
750 | | } |
751 | | |
752 | | template <typename char_type> |
753 | | simdutf_constexpr23 simdutf_warn_unused full_result |
754 | | base64_to_binary_details_safe_impl( |
755 | | const char_type *input, size_t length, char *output, size_t outlen, |
756 | | base64_options options, |
757 | | last_chunk_handling_options last_chunk_options) noexcept { |
758 | | const bool ignore_garbage = |
759 | | (options == base64_options::base64_url_accept_garbage) || |
760 | | (options == base64_options::base64_default_accept_garbage) || |
761 | | (options == base64_options::base64_default_or_url_accept_garbage); |
762 | | auto ri = simdutf::scalar::base64::find_end(input, length, options); |
763 | | size_t equallocation = ri.equallocation; |
764 | | size_t equalsigns = ri.equalsigns; |
765 | | length = ri.srclen; |
766 | | size_t full_input_length = ri.full_input_length; |
767 | | if (length == 0) { |
768 | | if (!ignore_garbage && equalsigns > 0) { |
769 | | return {INVALID_BASE64_CHARACTER, equallocation, 0}; |
770 | | } |
771 | | return {SUCCESS, full_input_length, 0}; |
772 | | } |
773 | | full_result r = scalar::base64::base64_tail_decode_safe( |
774 | | output, outlen, input, length, equalsigns, options, last_chunk_options); |
775 | | r = scalar::base64::patch_tail_result(r, 0, 0, equallocation, |
776 | | full_input_length, last_chunk_options); |
777 | | if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS && |
778 | | equalsigns > 0 && !ignore_garbage) { |
779 | | // additional checks |
780 | | if ((r.output_count % 3 == 0) || |
781 | | ((r.output_count % 3) + 1 + equalsigns != 4)) { |
782 | | return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; |
783 | | } |
784 | | } |
785 | | |
786 | | // When is_partial(last_chunk_options) is true, we must either end with |
787 | | // the end of the stream (beyond whitespace) or right after a non-ignorable |
788 | | // character or at the very beginning of the stream. |
789 | | // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64 |
790 | | if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS && |
791 | | r.input_count < full_input_length) { |
792 | | // First check if we can extend the input to the end of the stream |
793 | | while (r.input_count < full_input_length && |
794 | | base64_ignorable(*(input + r.input_count), options)) { |
795 | | r.input_count++; |
796 | | } |
797 | | // If we are still not at the end of the stream, then we must backtrack |
798 | | // to the last non-ignorable character. |
799 | | if (r.input_count < full_input_length) { |
800 | | while (r.input_count > 0 && |
801 | | base64_ignorable(*(input + r.input_count - 1), options)) { |
802 | | r.input_count--; |
803 | | } |
804 | | } |
805 | | } |
806 | | return r; |
807 | | } |
808 | | |
809 | | simdutf_warn_unused simdutf_constexpr23 size_t |
810 | 3.47k | base64_length_from_binary(size_t length, base64_options options) noexcept { |
811 | | // By default, we use padding if we are not using the URL variant. |
812 | | // This is check with ((options & base64_url) == 0) which returns true if we |
813 | | // are not using the URL variant. However, we also allow 'inversion' of the |
814 | | // convention with the base64_reverse_padding option. If the |
815 | | // base64_reverse_padding option is set, we use padding if we are using the |
816 | | // URL variant, and we omit it if we are not using the URL variant. This is |
817 | | // checked with |
818 | | // ((options & base64_reverse_padding) == base64_reverse_padding). |
819 | 3.47k | bool use_padding = |
820 | 3.47k | ((options & base64_url) == 0) ^ |
821 | 3.47k | ((options & base64_reverse_padding) == base64_reverse_padding); |
822 | 3.47k | if (!use_padding) { |
823 | 1.39k | return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0); |
824 | 1.39k | } |
825 | 2.07k | return (length + 2) / 3 * |
826 | 2.07k | 4; // We use padding to make the length a multiple of 4. |
827 | 3.47k | } |
828 | | |
829 | | simdutf_warn_unused simdutf_constexpr23 size_t |
830 | | base64_length_from_binary_with_lines(size_t length, base64_options options, |
831 | 3.53k | size_t line_length) noexcept { |
832 | 3.53k | if (length == 0) { |
833 | 63 | return 0; |
834 | 63 | } |
835 | 3.47k | size_t base64_length = |
836 | 3.47k | scalar::base64::base64_length_from_binary(length, options); |
837 | 3.47k | if (line_length < 4) { |
838 | 0 | line_length = 4; |
839 | 0 | } |
840 | 3.47k | size_t lines = |
841 | 3.47k | (base64_length + line_length - 1) / line_length; // number of lines |
842 | 3.47k | return base64_length + lines - 1; |
843 | 3.53k | } |
844 | | |
845 | | // Return the length of the prefix that contains count base64 characters. |
846 | | // Thus, if count is 3, the function returns the length of the prefix |
847 | | // that contains 3 base64 characters. |
848 | | // The function returns (size_t)-1 if there is not enough base64 characters in |
849 | | // the input. |
850 | | template <typename char_type> |
851 | | simdutf_warn_unused size_t prefix_length(size_t count, |
852 | | simdutf::base64_options options, |
853 | | const char_type *input, |
854 | | size_t length) noexcept { |
855 | | size_t i = 0; |
856 | | while (i < length && is_ignorable(input[i], options)) { |
857 | | i++; |
858 | | } |
859 | | if (count == 0) { |
860 | | return i; // duh! |
861 | | } |
862 | | for (; i < length; i++) { |
863 | | if (is_ignorable(input[i], options)) { |
864 | | continue; |
865 | | } |
866 | | // We have a base64 character or a padding character. |
867 | | count--; |
868 | | if (count == 0) { |
869 | | return i + 1; |
870 | | } |
871 | | } |
872 | | simdutf_log_assert(false, "You never get here"); |
873 | | |
874 | | return -1; // should never happen |
875 | | } |
876 | | |
877 | | } // namespace base64 |
878 | | } // unnamed namespace |
879 | | } // namespace scalar |
880 | | } // namespace simdutf |
881 | | |
882 | | #endif |