Coverage Report

Created: 2026-02-14 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/src/icelake/icelake_base64.inl.cpp
Line
Count
Source
1
// file included directly
2
/**
3
 * References and further reading:
4
 *
5
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
6
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
7
 * https://arxiv.org/abs/1910.05109
8
 *
9
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
10
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
11
 * https://arxiv.org/abs/1704.00605
12
 *
13
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
14
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
15
 * Request for Comments: 4648.
16
 *
17
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
18
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
19
 *
20
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
21
 * acceleration. https://github.com/aklomp/base64. (2014).
22
 *
23
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
24
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
25
 *
26
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
27
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
28
 */
29
30
struct block64 {
31
  __m512i chunks[1];
32
};
33
34
template <bool base64_url, bool use_lines>
35
size_t encode_base64_impl(char *dst, const char *src, size_t srclen,
36
                          base64_options options,
37
0
                          size_t line_length = simdutf::default_line_length) {
38
0
  size_t offset = 0;
39
0
  if (line_length < 4) {
40
0
    line_length = 4; // We do not support line_length less than 4
41
0
  }
42
  // credit: Wojciech Muła
43
0
  const uint8_t *input = (const uint8_t *)src;
44
45
0
  uint8_t *out = (uint8_t *)dst;
46
0
  static const char *lookup_tbl =
47
0
      base64_url
48
0
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
49
0
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
50
0
  const __m512i shuffle_input = _mm512_setr_epi32(
51
0
      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
52
0
      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
53
0
      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
54
0
  const __m512i lookup =
55
0
      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
56
0
  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
57
0
  size_t size = srclen;
58
0
  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
59
  // We want that input == end_input means that we must stop.
60
0
  const uint8_t *end_input = input + (size - (size % 48));
61
0
  while (input != end_input) {
62
0
    const __m512i v = _mm512_maskz_loadu_epi8(
63
0
        input_mask, reinterpret_cast<const __m512i *>(input));
64
0
    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
65
0
    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
66
0
    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
67
0
    if (use_lines) {
68
0
      if (offset + 64 > line_length) {
69
0
        if (line_length >= 64) {
70
0
          __m512i expanded = _mm512_mask_expand_epi8(
71
0
              _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
72
0
              result);
73
0
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
74
0
          __m128i last_lane =
75
0
              _mm512_extracti32x4_epi32(result, 3); // Lane 3 (bytes 48-63)
76
0
          uint8_t last_byte =
77
0
              static_cast<uint8_t>(_mm_extract_epi8(last_lane, 15));
78
0
          out[64] = last_byte;
79
0
          out += 65;
80
0
          offset = 64 - (line_length - offset);
81
0
        } else { // slow path
82
0
          alignas(64) uint8_t local_buffer[64];
83
0
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer),
84
0
                              result);
85
0
          size_t out_pos = 0;
86
0
          size_t local_offset = offset;
87
0
          for (size_t j = 0; j < 64;) {
88
0
            if (local_offset == line_length) {
89
0
              out[out_pos++] = '\n';
90
0
              local_offset = 0;
91
0
            }
92
0
            out[out_pos++] = local_buffer[j++];
93
0
            local_offset++;
94
0
          }
95
0
          offset = local_offset;
96
0
          out += out_pos;
97
0
        }
98
0
      } else {
99
0
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
100
0
        offset += 64;
101
0
        out += 64;
102
0
      }
103
0
    } else {
104
0
      _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
105
0
      out += 64;
106
0
    }
107
0
    input += 48;
108
0
  }
109
0
  size = size % 48;
110
111
0
  input_mask = ((__mmask64)1 << size) - 1;
112
0
  const __m512i v = _mm512_maskz_loadu_epi8(
113
0
      input_mask, reinterpret_cast<const __m512i *>(input));
114
0
  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
115
0
  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
116
0
  bool padding_needed =
117
0
      (((options & base64_url) == 0) ^
118
0
       ((options & base64_reverse_padding) == base64_reverse_padding));
119
0
  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
120
0
  size_t output_len = ((size + 2) / 3) * 4;
121
0
  size_t non_padded_output_len = output_len - padding_amount;
122
0
  if (!padding_needed) {
123
0
    output_len = non_padded_output_len;
124
0
  }
125
  // If no output, we are done.
126
0
  if (output_len == 0) {
127
0
    return (size_t)(out - (uint8_t *)dst);
128
0
  }
129
0
  __mmask64 output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len);
130
0
  __m512i result = _mm512_mask_permutexvar_epi8(
131
0
      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
132
0
      indices, lookup);
133
0
  if (use_lines) {
134
0
    if (offset + output_len > line_length) {
135
0
      if (line_length >= 64) {
136
0
        __m512i expanded = _mm512_mask_expand_epi8(
137
0
            _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
138
0
            result);
139
0
        if (output_len == 64) {
140
0
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
141
0
          out += 64;
142
0
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out - 63),
143
0
                                  1ULL << 63, result);
144
0
          out++;
145
0
        } else {
146
0
          output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len - 1);
147
0
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
148
0
                                  expanded);
149
0
          out += output_len + 1;
150
0
        }
151
0
      } else {
152
0
        alignas(64) uint8_t local_buffer[64];
153
0
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result);
154
0
        size_t out_pos = 0;
155
0
        size_t local_offset = offset;
156
0
        for (size_t j = 0; j < output_len;) {
157
0
          if (local_offset == line_length) {
158
0
            out[out_pos++] = '\n';
159
0
            local_offset = 0;
160
0
          }
161
0
          out[out_pos++] = local_buffer[j++];
162
0
          local_offset++;
163
0
        }
164
0
        offset = local_offset;
165
0
        out += out_pos;
166
0
      }
167
0
    } else {
168
0
      _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
169
0
                              result);
170
0
      out += output_len;
171
0
    }
172
0
  } else {
173
0
    _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
174
0
                            result);
175
0
    out += output_len;
176
0
  }
177
0
  return (size_t)(out - (uint8_t *)dst);
178
0
}
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64_impl<true, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64_impl<false, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64_impl<true, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64_impl<false, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
179
180
template <bool base64_url>
181
size_t encode_base64(char *dst, const char *src, size_t srclen,
182
0
                     base64_options options) {
183
0
  return encode_base64_impl<base64_url, false>(dst, src, srclen, options);
184
0
}
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64<true>(char*, char const*, unsigned long, simdutf::base64_options)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::encode_base64<false>(char*, char const*, unsigned long, simdutf::base64_options)
185
186
template <bool base64_url, bool ignore_garbage, bool default_or_url>
187
static inline uint64_t to_base64_mask(block64 *b, uint64_t *error,
188
0
                                      uint64_t input_mask = UINT64_MAX) {
189
0
  __m512i input = b->chunks[0];
190
0
  const __m512i ascii_space_tbl = _mm512_set_epi8(
191
0
      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
192
0
      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
193
0
      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
194
0
  __m512i lookup0;
195
0
  if (default_or_url) {
196
0
    lookup0 = _mm512_set_epi8(
197
0
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
198
0
        52, 63, -128, 62, -128, 62, -128, -128, -128, -128, -128, -128, -128,
199
0
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
200
0
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
201
0
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
202
0
  } else if (base64_url) {
203
0
    lookup0 = _mm512_set_epi8(
204
0
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
205
0
        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
206
0
        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
207
0
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
208
0
        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
209
0
  } else {
210
0
    lookup0 = _mm512_set_epi8(
211
0
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
212
0
        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
213
0
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
214
0
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
215
0
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
216
0
  }
217
0
  __m512i lookup1;
218
0
  if (default_or_url) {
219
0
    lookup1 = _mm512_set_epi8(
220
0
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
221
0
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
222
0
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
223
0
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
224
0
  } else if (base64_url) {
225
0
    lookup1 = _mm512_set_epi8(
226
0
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
227
0
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
228
0
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
229
0
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
230
0
  } else {
231
0
    lookup1 = _mm512_set_epi8(
232
0
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
233
0
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
234
0
        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
235
0
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
236
0
  }
237
238
0
  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
239
0
  const __m512i combined = _mm512_or_si512(translated, input);
240
0
  const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask;
241
0
  if (!ignore_garbage && mask) {
242
0
    const __mmask64 spaces =
243
0
        _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input),
244
0
                               input) &
245
0
        input_mask;
246
0
    *error = (mask ^ spaces);
247
0
  }
248
0
  b->chunks[0] = translated;
249
250
0
  return mask | (~input_mask);
251
0
}
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<false, true, true>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<false, false, true>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<true, true, false>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<true, false, false>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<false, true, false>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::to_base64_mask<false, false, false>(simdutf::icelake::(anonymous namespace)::block64*, unsigned long*, unsigned long)
252
253
0
// Store the full 64 bytes of `b` at `output` (unaligned store).
static inline void copy_block(block64 *b, char *output) {
  __m512i *const destination = reinterpret_cast<__m512i *>(output);
  _mm512_storeu_si512(destination, b->chunks[0]);
}
256
257
0
// Pack the bytes of `b` whose bit in `mask` is clear to the front of a vector
// (the rest is zeroed) and store that vector at `output`. A full 64 bytes are
// always written; the return value is the number of bytes that were kept.
static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  const uint64_t keep = ~mask;
  const __m512i packed = _mm512_maskz_compress_epi8(keep, b->chunks[0]);
  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), packed);
  return _mm_popcnt_u64(keep);
}
263
264
// The caller of this function is responsible to ensure that there are 64 bytes
265
// available from reading at src. The data is read into a block64 structure.
266
0
static inline void load_block(block64 *b, const char *src) {
267
0
  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
268
0
}
269
270
static inline void load_block_partial(block64 *b, const char *src,
271
0
                                      __mmask64 input_mask) {
272
0
  b->chunks[0] = _mm512_maskz_loadu_epi8(
273
0
      input_mask, reinterpret_cast<const __m512i *>(src));
274
0
}
275
276
// The caller of this function is responsible to ensure that there are 128 bytes
277
// available from reading at src. The data is read into a block64 structure.
278
0
static inline void load_block(block64 *b, const char16_t *src) {
279
0
  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
280
0
  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
281
0
  __m512i p = _mm512_packus_epi16(m1, m2);
282
0
  b->chunks[0] =
283
0
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
284
0
}
285
286
static inline void load_block_partial(block64 *b, const char16_t *src,
287
0
                                      __mmask64 input_mask) {
288
0
  __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask,
289
0
                                        reinterpret_cast<const __m512i *>(src));
290
0
  __m512i m2 =
291
0
      _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32),
292
0
                               reinterpret_cast<const __m512i *>(src + 32));
293
0
  __m512i p = _mm512_packus_epi16(m1, m2);
294
0
  b->chunks[0] =
295
0
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
296
0
}
297
298
0
static inline void base64_decode(char *out, __m512i str) {
299
0
  const __m512i merge_ab_and_bc =
300
0
      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
301
0
  const __m512i merged =
302
0
      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
303
0
  const __m512i pack = _mm512_set_epi8(
304
0
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
305
0
      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
306
0
      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
307
0
      5, 6, 0, 1, 2);
308
0
  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
309
0
  _mm512_mask_storeu_epi8(
310
0
      (__m512i *)out, 0xffffffffffff,
311
0
      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
312
0
}
313
// decode 64 bytes and output 48 bytes
314
0
static inline void base64_decode_block(char *out, const char *src) {
315
0
  base64_decode(out,
316
0
                _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
317
0
}
318
0
static inline void base64_decode_block(char *out, block64 *b) {
319
0
  base64_decode(out, b->chunks[0]);
320
0
}
321
322
template <bool base64_url, bool ignore_garbage, bool default_or_url,
323
          typename chartype>
324
full_result
325
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
326
                       base64_options options,
327
0
                       last_chunk_handling_options last_chunk_options) {
328
0
  (void)options;
329
0
  const uint8_t *to_base64 =
330
0
      default_or_url ? tables::base64::to_base64_default_or_url_value
331
0
                     : (base64_url ? tables::base64::to_base64_url_value
332
0
                                   : tables::base64::to_base64_value);
333
0
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
334
0
  size_t equallocation = ri.equallocation;
335
0
  size_t padding_characters = ri.equalsigns;
336
0
  srclen = ri.srclen;
337
0
  size_t full_input_length = ri.full_input_length;
338
0
  if (srclen == 0) {
339
0
    if (!ignore_garbage && padding_characters > 0) {
340
0
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
341
0
    }
342
0
    return {SUCCESS, full_input_length, 0};
343
0
  }
344
0
  const chartype *const srcinit = src;
345
0
  const char *const dstinit = dst;
346
0
  const chartype *const srcend = src + srclen;
347
348
  // figure out why block_size == 2 is sometimes best???
349
0
  constexpr size_t block_size = 6;
350
0
  char buffer[block_size * 64];
351
0
  char *bufferptr = buffer;
352
0
  if (srclen >= 64) {
353
0
    const chartype *const srcend64 = src + srclen - 64;
354
0
    while (src <= srcend64) {
355
0
      block64 b;
356
0
      load_block(&b, src);
357
0
      src += 64;
358
0
      uint64_t error = 0;
359
0
      uint64_t badcharmask =
360
0
          to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b,
361
0
                                                                     &error);
362
0
      if (!ignore_garbage && error) {
363
0
        src -= 64;
364
0
        size_t error_offset = _tzcnt_u64(error);
365
0
        return {error_code::INVALID_BASE64_CHARACTER,
366
0
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
367
0
      }
368
0
      if (badcharmask != 0) {
369
        // optimization opportunity: check for simple masks like those made of
370
        // continuous 1s followed by continuous 0s. And masks containing a
371
        // single bad character.
372
0
        bufferptr += compress_block(&b, badcharmask, bufferptr);
373
0
      } else if (bufferptr != buffer) {
374
0
        copy_block(&b, bufferptr);
375
0
        bufferptr += 64;
376
0
      } else {
377
0
        base64_decode_block(dst, &b);
378
0
        dst += 48;
379
0
      }
380
0
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
381
0
        for (size_t i = 0; i < (block_size - 1); i++) {
382
0
          base64_decode_block(dst, buffer + i * 64);
383
0
          dst += 48;
384
0
        }
385
0
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
386
0
                    64); // 64 might be too much
387
0
        bufferptr -= (block_size - 1) * 64;
388
0
      }
389
0
    }
390
0
  }
391
392
0
  int last_block_len = (int)(srcend - src);
393
0
  if (last_block_len != 0) {
394
0
    __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1;
395
0
    block64 b;
396
0
    load_block_partial(&b, src, input_mask);
397
0
    uint64_t error = 0;
398
0
    uint64_t badcharmask =
399
0
        to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b, &error,
400
0
                                                                   input_mask);
401
0
    if (!ignore_garbage && error) {
402
0
      size_t error_offset = _tzcnt_u64(error);
403
0
      return {error_code::INVALID_BASE64_CHARACTER,
404
0
              size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
405
0
    }
406
0
    src += last_block_len;
407
0
    bufferptr += compress_block(&b, badcharmask, bufferptr);
408
0
  }
409
410
0
  char *buffer_start = buffer;
411
0
  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
412
0
    base64_decode_block(dst, buffer_start);
413
0
    dst += 48;
414
0
  }
415
0
  if ((bufferptr - buffer_start) != 0) {
416
    // For efficiency reasons, we end up reproducing much of the code
417
    // in base64_tail_decode_impl. Better engineering would be to
418
    // refactor the code so that we can call it without a performance hit.
419
0
    size_t rem = (bufferptr - buffer_start);
420
0
    int idx = rem % 4;
421
0
    __mmask64 mask = ((__mmask64)1 << rem) - 1;
422
0
    __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start);
423
0
    size_t output_len = (rem / 4) * 3;
424
0
    __mmask64 output_mask = mask >> (rem - output_len);
425
0
    const __m512i merge_ab_and_bc =
426
0
        _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140));
427
0
    const __m512i merged =
428
0
        _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
429
0
    const __m512i pack = _mm512_set_epi8(
430
0
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
431
0
        52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
432
0
        28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
433
0
        5, 6, 0, 1, 2);
434
0
    const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
435
    // We never should have that the number of base64 characters + the
436
    // number of padding characters is more than 4.
437
0
    if (!ignore_garbage && (idx + padding_characters > 4)) {
438
0
      return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
439
0
              size_t(dst - dstinit), true};
440
0
    }
441
    // The idea here is that in loose mode,
442
    // if there is padding at all, it must be used
443
    // to form 4-wise chunk. However, in loose mode,
444
    // we do accept no padding at all.
445
0
    if (!ignore_garbage &&
446
0
        last_chunk_options == last_chunk_handling_options::loose &&
447
0
        (idx >= 2) && padding_characters > 0 &&
448
0
        ((idx + padding_characters) & 3) != 0) {
449
0
      return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
450
0
              size_t(dst - dstinit), true};
451
0
    } else
452
      // The idea here is that in strict mode, we do not want to accept
453
      // incomplete base64 chunks. So if the chunk was otherwise valid, we
454
      // return BASE64_INPUT_REMAINDER.
455
0
      if (!ignore_garbage &&
456
0
          last_chunk_options == last_chunk_handling_options::strict &&
457
0
          (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
458
        // The partial chunk was at src - idx
459
0
        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
460
0
        dst += output_len;
461
0
        return {BASE64_INPUT_REMAINDER, equallocation, size_t(dst - dstinit)};
462
0
      } else
463
        // If there is a partial chunk with insufficient padding, with
464
        // stop_before_partial, we need to just ignore it. In "only full" mode,
465
        // skip the minute there are padding characters.
466
0
        if ((last_chunk_options ==
467
0
                 last_chunk_handling_options::stop_before_partial &&
468
0
             (padding_characters + idx < 4) && (idx != 0) &&
469
0
             (idx >= 2 || padding_characters == 0)) ||
470
0
            (last_chunk_options ==
471
0
                 last_chunk_handling_options::only_full_chunks &&
472
0
             (idx >= 2 || padding_characters == 0))) {
473
0
          _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
474
0
          dst += output_len;
475
          // we need to rewind src to before the partial chunk
476
0
          size_t characters_to_skip = idx;
477
0
          while (characters_to_skip > 0) {
478
0
            src--;
479
0
            auto c = *src;
480
0
            uint8_t code = to_base64[uint8_t(c)];
481
0
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
482
0
              characters_to_skip--;
483
0
            }
484
0
          }
485
          // And then we need to skip ignored characters
486
          // See https://github.com/simdutf/simdutf/issues/824
487
0
          while (src > srcinit) {
488
0
            auto c = *(src - 1);
489
0
            uint8_t code = to_base64[uint8_t(c)];
490
0
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
491
0
              break;
492
0
            }
493
0
            src--;
494
0
          }
495
0
          return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
496
0
        } else {
497
0
          if (idx == 2) {
498
0
            if (!ignore_garbage &&
499
0
                last_chunk_options == last_chunk_handling_options::strict) {
500
0
              uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) +
501
0
                                (uint32_t(bufferptr[-1]) << 2 * 6);
502
0
              if (triple & 0xffff) {
503
0
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
504
0
                dst += output_len;
505
0
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
506
0
                        size_t(dst - dstinit)};
507
0
              }
508
0
            }
509
0
            output_mask = (output_mask << 1) | 1;
510
0
            output_len += 1;
511
0
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
512
0
            dst += output_len;
513
0
          } else if (idx == 3) {
514
0
            if (!ignore_garbage &&
515
0
                last_chunk_options == last_chunk_handling_options::strict) {
516
0
              uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) +
517
0
                                (uint32_t(bufferptr[-2]) << 2 * 6) +
518
0
                                (uint32_t(bufferptr[-1]) << 1 * 6);
519
0
              if (triple & 0xff) {
520
0
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
521
0
                dst += output_len;
522
0
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
523
0
                        size_t(dst - dstinit)};
524
0
              }
525
0
            }
526
0
            output_mask = (output_mask << 2) | 3;
527
0
            output_len += 2;
528
0
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
529
0
            dst += output_len;
530
0
          } else if (!ignore_garbage && idx == 1 &&
531
0
                     (!is_partial(last_chunk_options) ||
532
0
                      (is_partial(last_chunk_options) &&
533
0
                       padding_characters > 0))) {
534
0
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
535
0
            dst += output_len;
536
0
            return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
537
0
                    size_t(dst - dstinit)};
538
0
          } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
539
0
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
540
0
            dst += output_len;
541
0
            return {INVALID_BASE64_CHARACTER, equallocation,
542
0
                    size_t(dst - dstinit)};
543
0
          } else {
544
0
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
545
0
            dst += output_len;
546
0
          }
547
0
        }
548
0
    if (!ignore_garbage && !is_partial(last_chunk_options) &&
549
0
        padding_characters > 0) {
550
0
      size_t output_count = size_t(dst - dstinit);
551
0
      if ((output_count % 3 == 0) ||
552
0
          ((output_count % 3) + 1 + padding_characters != 4)) {
553
0
        return {INVALID_BASE64_CHARACTER, equallocation, output_count};
554
0
      }
555
0
    }
556
0
    return {SUCCESS, full_input_length, size_t(dst - dstinit)};
557
0
  }
558
559
0
  if (!ignore_garbage && padding_characters > 0) {
560
0
    if ((size_t(dst - dstinit) % 3 == 0) ||
561
0
        ((size_t(dst - dstinit) % 3) + 1 + padding_characters != 4)) {
562
0
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
563
0
    }
564
0
  }
565
0
  return {SUCCESS, srclen, size_t(dst - dstinit)};
566
0
}
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, true, true, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, false, true, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<true, true, false, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<true, false, false, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, true, false, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, false, false, char>(char*, char const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, true, true, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, false, true, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<true, true, false, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<true, false, false, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, true, false, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)
Unexecuted instantiation: simdutf.cpp:simdutf::full_result simdutf::icelake::(anonymous namespace)::compress_decode_base64<false, false, false, char16_t>(char*, char16_t const*, unsigned long, simdutf::base64_options, simdutf::last_chunk_handling_options)