Coverage Report

Created: 2026-02-26 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/src/haswell/avx2_base64.cpp
Line
Count
Source
1
/**
2
 * References and further reading:
3
 *
4
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
5
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
6
 * https://arxiv.org/abs/1910.05109
7
 *
8
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
9
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
10
 * https://arxiv.org/abs/1704.00605
11
 *
12
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
13
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
14
 * Request for Comments: 4648.
15
 *
16
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
17
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
18
 *
19
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
20
 * acceleration. https://github.com/aklomp/base64. (2014).
21
 *
22
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
23
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
24
 *
25
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
26
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
27
 */
28
29
template <bool base64_url>
30
5.54M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
5.54M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
5.54M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
5.54M
  result =
36
5.54M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
5.54M
  __m256i shift_LUT;
38
5.54M
  if (base64_url) {
39
2.12M
    shift_LUT = _mm256_setr_epi8(
40
2.12M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
2.12M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
2.12M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
2.12M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
3.41M
  } else {
46
3.41M
    shift_LUT = _mm256_setr_epi8(
47
3.41M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
3.41M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
3.41M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
3.41M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
3.41M
  }
53
54
5.54M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
5.54M
  return _mm256_add_epi8(result, input);
56
5.54M
}
simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<true>(long long __vector(4))
Line
Count
Source
30
2.12M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
2.12M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
2.12M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
2.12M
  result =
36
2.12M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
2.12M
  __m256i shift_LUT;
38
2.12M
  if (base64_url) {
39
2.12M
    shift_LUT = _mm256_setr_epi8(
40
2.12M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
2.12M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
2.12M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
2.12M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
2.12M
  } else {
46
0
    shift_LUT = _mm256_setr_epi8(
47
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
0
  }
53
54
2.12M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
2.12M
  return _mm256_add_epi8(result, input);
56
2.12M
}
simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<false>(long long __vector(4))
Line
Count
Source
30
3.41M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
3.41M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
3.41M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
3.41M
  result =
36
3.41M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
3.41M
  __m256i shift_LUT;
38
3.41M
  if (base64_url) {
39
0
    shift_LUT = _mm256_setr_epi8(
40
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
3.41M
  } else {
46
3.41M
    shift_LUT = _mm256_setr_epi8(
47
3.41M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
3.41M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
3.41M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
3.41M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
3.41M
  }
53
54
3.41M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
3.41M
  return _mm256_add_epi8(result, input);
56
3.41M
}
57
58
206k
simdutf_really_inline __m256i insert_line_feed32(__m256i input, int K) {
59
60
206k
  static const uint8_t low_table[16][32] = {
61
206k
      {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
62
206k
       0,    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
63
206k
      {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
64
206k
       0, 1,    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
65
206k
      {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
66
206k
       0, 1, 2,    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
67
206k
      {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
68
206k
       0, 1, 2, 3,    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
69
206k
      {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
70
206k
       0, 1, 2, 3, 4,    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
71
206k
      {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
72
206k
       0, 1, 2, 3, 4, 5,    6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
73
206k
      {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9,  10, 11, 12, 13, 14,
74
206k
       0, 1, 2, 3, 4, 5, 6,    7, 8, 9, 10, 11, 12, 13, 14, 15},
75
206k
      {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9,  10, 11, 12, 13, 14,
76
206k
       0, 1, 2, 3, 4, 5, 6, 7,    8, 9, 10, 11, 12, 13, 14, 15},
77
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9,  10, 11, 12, 13, 14,
78
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8,    9, 10, 11, 12, 13, 14, 15},
79
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9,  10, 11, 12, 13, 14,
80
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9,    10, 11, 12, 13, 14, 15},
81
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14,
82
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,   11, 12, 13, 14, 15},
83
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14,
84
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,   12, 13, 14, 15},
85
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14,
86
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,   13, 14, 15},
87
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14,
88
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,   14, 15},
89
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14,
90
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,   15},
91
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80,
92
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}};
93
206k
  static const uint8_t high_table[16][32] = {
94
206k
      {0,    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
95
206k
       0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
96
206k
      {0, 1,    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
97
206k
       0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
98
206k
      {0, 1, 2,    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
99
206k
       0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
100
206k
      {0, 1, 2, 3,    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
101
206k
       0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
102
206k
      {0, 1, 2, 3, 4,    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
103
206k
       0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
104
206k
      {0, 1, 2, 3, 4, 5,    6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
105
206k
       0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
106
206k
      {0, 1, 2, 3, 4, 5, 6,    7, 8, 9, 10, 11, 12, 13, 14, 15,
107
206k
       0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9,  10, 11, 12, 13, 14},
108
206k
      {0, 1, 2, 3, 4, 5, 6, 7,    8, 9, 10, 11, 12, 13, 14, 15,
109
206k
       0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9,  10, 11, 12, 13, 14},
110
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8,    9, 10, 11, 12, 13, 14, 15,
111
206k
       0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9,  10, 11, 12, 13, 14},
112
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,    10, 11, 12, 13, 14, 15,
113
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9,  10, 11, 12, 13, 14},
114
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,   11, 12, 13, 14, 15,
115
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14},
116
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,   12, 13, 14, 15,
117
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14},
118
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,   13, 14, 15,
119
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14},
120
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,   14, 15,
121
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14},
122
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,   15,
123
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14},
124
206k
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
125
206k
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}};
126
127
206k
  __m256i line_feed_vector = _mm256_set1_epi8('\n');
128
206k
  if (K >= 16) {
129
99.1k
    __m256i mask = _mm256_loadu_si256((const __m256i *)high_table[K - 16]);
130
99.1k
    __m256i lf_pos =
131
99.1k
        _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80)));
132
99.1k
    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
133
99.1k
    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
134
99.1k
    return result;
135
99.1k
  }
136
  // Shift input right by 1 byte
137
107k
  __m256i shift = _mm256_alignr_epi8(
138
107k
      input, _mm256_permute2x128_si256(input, input, 0x21), 15);
139
140
107k
  input = _mm256_blend_epi32(input, shift, 0xF0);
141
142
107k
  __m256i mask = _mm256_loadu_si256((const __m256i *)low_table[K]);
143
144
107k
  __m256i lf_pos =
145
107k
      _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80)));
146
107k
  __m256i shuffled = _mm256_shuffle_epi8(input, mask);
147
148
107k
  __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
149
107k
  return result;
150
206k
}
151
152
template <bool isbase64url, bool use_lines>
153
size_t
154
avx2_encode_base64_impl(char *dst, const char *src, size_t srclen,
155
                        base64_options options,
156
17.0k
                        size_t line_length = simdutf::default_line_length) {
157
17.0k
  size_t offset = 0;
158
159
17.0k
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
17.0k
  const uint8_t *input = (const uint8_t *)src;
164
165
17.0k
  uint8_t *out = (uint8_t *)dst;
166
17.0k
  const __m256i shuf =
167
17.0k
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
17.0k
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
17.0k
  size_t i = 0;
171
1.40M
  for (; i + 100 <= srclen; i += 96) {
172
1.38M
    const __m128i lo0 = _mm_loadu_si128(
173
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
1.38M
    const __m128i hi0 = _mm_loadu_si128(
175
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
1.38M
    const __m128i lo1 = _mm_loadu_si128(
177
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
1.38M
    const __m128i hi1 = _mm_loadu_si128(
179
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
1.38M
    const __m128i lo2 = _mm_loadu_si128(
181
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
1.38M
    const __m128i hi2 = _mm_loadu_si128(
183
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
1.38M
    const __m128i lo3 = _mm_loadu_si128(
185
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
1.38M
    const __m128i hi3 = _mm_loadu_si128(
187
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
1.38M
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
1.38M
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
1.38M
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
1.38M
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
1.38M
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
1.38M
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
1.38M
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
1.38M
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
1.38M
    const __m256i t1_0 =
200
1.38M
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
1.38M
    const __m256i t1_1 =
202
1.38M
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
1.38M
    const __m256i t1_2 =
204
1.38M
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
1.38M
    const __m256i t1_3 =
206
1.38M
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
1.38M
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
1.38M
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
1.38M
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
1.38M
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
1.38M
    const __m256i t3_0 =
214
1.38M
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
1.38M
    const __m256i t3_1 =
216
1.38M
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
1.38M
    const __m256i t3_2 =
218
1.38M
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
1.38M
    const __m256i t3_3 =
220
1.38M
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
1.38M
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
1.38M
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
1.38M
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
1.38M
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
1.38M
    if (use_lines) {
228
691k
      if (line_length >= 32) { // fast path
229
113k
        __m256i result;
230
113k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
113k
        if (offset + 32 > line_length) {
232
52.9k
          size_t location_end = line_length - offset;
233
52.9k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
52.9k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
52.9k
          _mm256_storeu_si256(
237
52.9k
              reinterpret_cast<__m256i *>(out),
238
52.9k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
52.9k
          offset = to_move;
240
52.9k
          out += 32 + 1;
241
60.4k
        } else {
242
60.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
60.4k
          offset += 32;
244
60.4k
          out += 32;
245
60.4k
        }
246
113k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
113k
        if (offset + 32 > line_length) {
249
50.1k
          size_t location_end = line_length - offset;
250
50.1k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
50.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
50.1k
          _mm256_storeu_si256(
255
50.1k
              reinterpret_cast<__m256i *>(out),
256
50.1k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
50.1k
          offset = to_move;
260
50.1k
          out += 32 + 1;
261
63.2k
        } else {
262
263
63.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
63.2k
          offset += 32;
266
63.2k
          out += 32;
267
63.2k
        }
268
113k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
113k
        if (offset + 32 > line_length) {
271
52.7k
          size_t location_end = line_length - offset;
272
52.7k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
52.7k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
52.7k
          _mm256_storeu_si256(
277
52.7k
              reinterpret_cast<__m256i *>(out),
278
52.7k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
52.7k
          offset = to_move;
282
52.7k
          out += 32 + 1;
283
60.6k
        } else {
284
60.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
60.6k
          offset += 32;
286
60.6k
          out += 32;
287
60.6k
        }
288
113k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
113k
        if (offset + 32 > line_length) {
291
50.4k
          size_t location_end = line_length - offset;
292
50.4k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
50.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
50.4k
          _mm256_storeu_si256(
297
50.4k
              reinterpret_cast<__m256i *>(out),
298
50.4k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
50.4k
          offset = to_move;
302
50.4k
          out += 32 + 1;
303
63.0k
        } else {
304
63.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
63.0k
          offset += 32;
306
63.0k
          out += 32;
307
63.0k
        }
308
577k
      } else { // slow path
309
        // could be optimized
310
577k
        uint8_t buffer[128];
311
577k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
577k
                            lookup_pshufb_improved<isbase64url>(input0));
313
577k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
577k
                            lookup_pshufb_improved<isbase64url>(input1));
315
577k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
577k
                            lookup_pshufb_improved<isbase64url>(input2));
317
577k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
577k
                            lookup_pshufb_improved<isbase64url>(input3));
319
577k
        size_t out_pos = 0;
320
577k
        size_t local_offset = offset;
321
74.5M
        for (size_t j = 0; j < 128;) {
322
73.9M
          if (local_offset == line_length) {
323
17.3M
            out[out_pos++] = '\n';
324
17.3M
            local_offset = 0;
325
17.3M
          }
326
73.9M
          out[out_pos++] = buffer[j++];
327
73.9M
          local_offset++;
328
73.9M
        }
329
577k
        offset = local_offset;
330
577k
        out += out_pos;
331
577k
      }
332
693k
    } else {
333
693k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
693k
                          lookup_pshufb_improved<isbase64url>(input0));
335
693k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
693k
                          lookup_pshufb_improved<isbase64url>(input1));
337
693k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
693k
                          lookup_pshufb_improved<isbase64url>(input2));
339
693k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
693k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
693k
      out += 128;
343
693k
    }
344
1.38M
  }
345
20.9k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
3.93k
    const __m128i lo =
349
3.93k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
3.93k
    const __m128i hi =
351
3.93k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
3.93k
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
3.93k
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
3.93k
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
3.93k
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
3.93k
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
3.93k
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
3.93k
    if (use_lines) {
366
1.11k
      if (line_length >= 32) { // fast path
367
571
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
571
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
571
        if (offset + 32 > line_length) {
371
184
          size_t location_end = line_length - offset;
372
184
          size_t to_move = 32 - location_end;
373
184
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
184
          out[location_end] = '\n';
375
184
          offset = to_move;
376
184
          out += 32 + 1;
377
387
        } else {
378
387
          offset += 32;
379
387
          out += 32;
380
387
        }
381
571
      } else { // slow path
382
        // could be optimized
383
548
        alignas(32) uint8_t buffer[32];
384
548
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
548
                            lookup_pshufb_improved<isbase64url>(indices));
386
548
        std::memcpy(out, buffer, 32);
387
548
        size_t out_pos = 0;
388
548
        size_t local_offset = offset;
389
18.0k
        for (size_t j = 0; j < 32;) {
390
17.5k
          if (local_offset == line_length) {
391
3.34k
            out[out_pos++] = '\n';
392
3.34k
            local_offset = 0;
393
3.34k
          }
394
17.5k
          out[out_pos++] = buffer[j++];
395
17.5k
          local_offset++;
396
17.5k
        }
397
548
        offset = local_offset;
398
548
        out += out_pos;
399
548
      }
400
2.81k
    } else {
401
2.81k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
2.81k
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
2.81k
      out += 32;
405
2.81k
    }
406
3.93k
  }
407
17.0k
  return ((char *)out - (char *)dst) +
408
17.0k
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
17.0k
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
17.0k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
579
                        size_t line_length = simdutf::default_line_length) {
157
579
  size_t offset = 0;
158
159
579
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
579
  const uint8_t *input = (const uint8_t *)src;
164
165
579
  uint8_t *out = (uint8_t *)dst;
166
579
  const __m256i shuf =
167
579
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
579
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
579
  size_t i = 0;
171
266k
  for (; i + 100 <= srclen; i += 96) {
172
265k
    const __m128i lo0 = _mm_loadu_si128(
173
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
265k
    const __m128i hi0 = _mm_loadu_si128(
175
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
265k
    const __m128i lo1 = _mm_loadu_si128(
177
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
265k
    const __m128i hi1 = _mm_loadu_si128(
179
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
265k
    const __m128i lo2 = _mm_loadu_si128(
181
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
265k
    const __m128i hi2 = _mm_loadu_si128(
183
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
265k
    const __m128i lo3 = _mm_loadu_si128(
185
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
265k
    const __m128i hi3 = _mm_loadu_si128(
187
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
265k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
265k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
265k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
265k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
265k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
265k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
265k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
265k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
265k
    const __m256i t1_0 =
200
265k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
265k
    const __m256i t1_1 =
202
265k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
265k
    const __m256i t1_2 =
204
265k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
265k
    const __m256i t1_3 =
206
265k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
265k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
265k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
265k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
265k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
265k
    const __m256i t3_0 =
214
265k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
265k
    const __m256i t3_1 =
216
265k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
265k
    const __m256i t3_2 =
218
265k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
265k
    const __m256i t3_3 =
220
265k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
265k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
265k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
265k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
265k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
265k
    if (use_lines) {
228
0
      if (line_length >= 32) { // fast path
229
0
        __m256i result;
230
0
        result = lookup_pshufb_improved<isbase64url>(input0);
231
0
        if (offset + 32 > line_length) {
232
0
          size_t location_end = line_length - offset;
233
0
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
0
          _mm256_storeu_si256(
237
0
              reinterpret_cast<__m256i *>(out),
238
0
              insert_line_feed32(result, static_cast<int>(location_end)));
239
0
          offset = to_move;
240
0
          out += 32 + 1;
241
0
        } else {
242
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
0
          offset += 32;
244
0
          out += 32;
245
0
        }
246
0
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
0
        if (offset + 32 > line_length) {
249
0
          size_t location_end = line_length - offset;
250
0
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
0
          _mm256_storeu_si256(
255
0
              reinterpret_cast<__m256i *>(out),
256
0
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
0
          offset = to_move;
260
0
          out += 32 + 1;
261
0
        } else {
262
263
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
0
          offset += 32;
266
0
          out += 32;
267
0
        }
268
0
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
0
        if (offset + 32 > line_length) {
271
0
          size_t location_end = line_length - offset;
272
0
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
0
          _mm256_storeu_si256(
277
0
              reinterpret_cast<__m256i *>(out),
278
0
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
0
          offset = to_move;
282
0
          out += 32 + 1;
283
0
        } else {
284
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
0
          offset += 32;
286
0
          out += 32;
287
0
        }
288
0
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
0
        if (offset + 32 > line_length) {
291
0
          size_t location_end = line_length - offset;
292
0
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
0
          _mm256_storeu_si256(
297
0
              reinterpret_cast<__m256i *>(out),
298
0
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
0
          offset = to_move;
302
0
          out += 32 + 1;
303
0
        } else {
304
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
0
          offset += 32;
306
0
          out += 32;
307
0
        }
308
0
      } else { // slow path
309
        // could be optimized
310
0
        uint8_t buffer[128];
311
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
0
                            lookup_pshufb_improved<isbase64url>(input0));
313
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
0
                            lookup_pshufb_improved<isbase64url>(input1));
315
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
0
                            lookup_pshufb_improved<isbase64url>(input2));
317
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
0
                            lookup_pshufb_improved<isbase64url>(input3));
319
0
        size_t out_pos = 0;
320
0
        size_t local_offset = offset;
321
0
        for (size_t j = 0; j < 128;) {
322
0
          if (local_offset == line_length) {
323
0
            out[out_pos++] = '\n';
324
0
            local_offset = 0;
325
0
          }
326
0
          out[out_pos++] = buffer[j++];
327
0
          local_offset++;
328
0
        }
329
0
        offset = local_offset;
330
0
        out += out_pos;
331
0
      }
332
265k
    } else {
333
265k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
265k
                          lookup_pshufb_improved<isbase64url>(input0));
335
265k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
265k
                          lookup_pshufb_improved<isbase64url>(input1));
337
265k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
265k
                          lookup_pshufb_improved<isbase64url>(input2));
339
265k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
265k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
265k
      out += 128;
343
265k
    }
344
265k
  }
345
1.14k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
565
    const __m128i lo =
349
565
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
565
    const __m128i hi =
351
565
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
565
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
565
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
565
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
565
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
565
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
565
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
565
    if (use_lines) {
366
0
      if (line_length >= 32) { // fast path
367
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
0
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
0
        if (offset + 32 > line_length) {
371
0
          size_t location_end = line_length - offset;
372
0
          size_t to_move = 32 - location_end;
373
0
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
0
          out[location_end] = '\n';
375
0
          offset = to_move;
376
0
          out += 32 + 1;
377
0
        } else {
378
0
          offset += 32;
379
0
          out += 32;
380
0
        }
381
0
      } else { // slow path
382
        // could be optimized
383
0
        alignas(32) uint8_t buffer[32];
384
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
0
                            lookup_pshufb_improved<isbase64url>(indices));
386
0
        std::memcpy(out, buffer, 32);
387
0
        size_t out_pos = 0;
388
0
        size_t local_offset = offset;
389
0
        for (size_t j = 0; j < 32;) {
390
0
          if (local_offset == line_length) {
391
0
            out[out_pos++] = '\n';
392
0
            local_offset = 0;
393
0
          }
394
0
          out[out_pos++] = buffer[j++];
395
0
          local_offset++;
396
0
        }
397
0
        offset = local_offset;
398
0
        out += out_pos;
399
0
      }
400
565
    } else {
401
565
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
565
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
565
      out += 32;
405
565
    }
406
565
  }
407
579
  return ((char *)out - (char *)dst) +
408
579
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
579
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
579
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
15.2k
                        size_t line_length = simdutf::default_line_length) {
157
15.2k
  size_t offset = 0;
158
159
15.2k
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
15.2k
  const uint8_t *input = (const uint8_t *)src;
164
165
15.2k
  uint8_t *out = (uint8_t *)dst;
166
15.2k
  const __m256i shuf =
167
15.2k
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
15.2k
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
15.2k
  size_t i = 0;
171
442k
  for (; i + 100 <= srclen; i += 96) {
172
427k
    const __m128i lo0 = _mm_loadu_si128(
173
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
427k
    const __m128i hi0 = _mm_loadu_si128(
175
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
427k
    const __m128i lo1 = _mm_loadu_si128(
177
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
427k
    const __m128i hi1 = _mm_loadu_si128(
179
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
427k
    const __m128i lo2 = _mm_loadu_si128(
181
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
427k
    const __m128i hi2 = _mm_loadu_si128(
183
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
427k
    const __m128i lo3 = _mm_loadu_si128(
185
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
427k
    const __m128i hi3 = _mm_loadu_si128(
187
427k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
427k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
427k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
427k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
427k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
427k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
427k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
427k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
427k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
427k
    const __m256i t1_0 =
200
427k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
427k
    const __m256i t1_1 =
202
427k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
427k
    const __m256i t1_2 =
204
427k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
427k
    const __m256i t1_3 =
206
427k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
427k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
427k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
427k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
427k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
427k
    const __m256i t3_0 =
214
427k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
427k
    const __m256i t3_1 =
216
427k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
427k
    const __m256i t3_2 =
218
427k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
427k
    const __m256i t3_3 =
220
427k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
427k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
427k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
427k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
427k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
427k
    if (use_lines) {
228
0
      if (line_length >= 32) { // fast path
229
0
        __m256i result;
230
0
        result = lookup_pshufb_improved<isbase64url>(input0);
231
0
        if (offset + 32 > line_length) {
232
0
          size_t location_end = line_length - offset;
233
0
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
0
          _mm256_storeu_si256(
237
0
              reinterpret_cast<__m256i *>(out),
238
0
              insert_line_feed32(result, static_cast<int>(location_end)));
239
0
          offset = to_move;
240
0
          out += 32 + 1;
241
0
        } else {
242
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
0
          offset += 32;
244
0
          out += 32;
245
0
        }
246
0
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
0
        if (offset + 32 > line_length) {
249
0
          size_t location_end = line_length - offset;
250
0
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
0
          _mm256_storeu_si256(
255
0
              reinterpret_cast<__m256i *>(out),
256
0
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
0
          offset = to_move;
260
0
          out += 32 + 1;
261
0
        } else {
262
263
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
0
          offset += 32;
266
0
          out += 32;
267
0
        }
268
0
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
0
        if (offset + 32 > line_length) {
271
0
          size_t location_end = line_length - offset;
272
0
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
0
          _mm256_storeu_si256(
277
0
              reinterpret_cast<__m256i *>(out),
278
0
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
0
          offset = to_move;
282
0
          out += 32 + 1;
283
0
        } else {
284
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
0
          offset += 32;
286
0
          out += 32;
287
0
        }
288
0
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
0
        if (offset + 32 > line_length) {
291
0
          size_t location_end = line_length - offset;
292
0
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
0
          _mm256_storeu_si256(
297
0
              reinterpret_cast<__m256i *>(out),
298
0
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
0
          offset = to_move;
302
0
          out += 32 + 1;
303
0
        } else {
304
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
0
          offset += 32;
306
0
          out += 32;
307
0
        }
308
0
      } else { // slow path
309
        // could be optimized
310
0
        uint8_t buffer[128];
311
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
0
                            lookup_pshufb_improved<isbase64url>(input0));
313
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
0
                            lookup_pshufb_improved<isbase64url>(input1));
315
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
0
                            lookup_pshufb_improved<isbase64url>(input2));
317
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
0
                            lookup_pshufb_improved<isbase64url>(input3));
319
0
        size_t out_pos = 0;
320
0
        size_t local_offset = offset;
321
0
        for (size_t j = 0; j < 128;) {
322
0
          if (local_offset == line_length) {
323
0
            out[out_pos++] = '\n';
324
0
            local_offset = 0;
325
0
          }
326
0
          out[out_pos++] = buffer[j++];
327
0
          local_offset++;
328
0
        }
329
0
        offset = local_offset;
330
0
        out += out_pos;
331
0
      }
332
427k
    } else {
333
427k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
427k
                          lookup_pshufb_improved<isbase64url>(input0));
335
427k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
427k
                          lookup_pshufb_improved<isbase64url>(input1));
337
427k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
427k
                          lookup_pshufb_improved<isbase64url>(input2));
339
427k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
427k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
427k
      out += 128;
343
427k
    }
344
427k
  }
345
17.5k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
2.24k
    const __m128i lo =
349
2.24k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
2.24k
    const __m128i hi =
351
2.24k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
2.24k
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
2.24k
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
2.24k
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
2.24k
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
2.24k
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
2.24k
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
2.24k
    if (use_lines) {
366
0
      if (line_length >= 32) { // fast path
367
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
0
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
0
        if (offset + 32 > line_length) {
371
0
          size_t location_end = line_length - offset;
372
0
          size_t to_move = 32 - location_end;
373
0
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
0
          out[location_end] = '\n';
375
0
          offset = to_move;
376
0
          out += 32 + 1;
377
0
        } else {
378
0
          offset += 32;
379
0
          out += 32;
380
0
        }
381
0
      } else { // slow path
382
        // could be optimized
383
0
        alignas(32) uint8_t buffer[32];
384
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
0
                            lookup_pshufb_improved<isbase64url>(indices));
386
0
        std::memcpy(out, buffer, 32);
387
0
        size_t out_pos = 0;
388
0
        size_t local_offset = offset;
389
0
        for (size_t j = 0; j < 32;) {
390
0
          if (local_offset == line_length) {
391
0
            out[out_pos++] = '\n';
392
0
            local_offset = 0;
393
0
          }
394
0
          out[out_pos++] = buffer[j++];
395
0
          local_offset++;
396
0
        }
397
0
        offset = local_offset;
398
0
        out += out_pos;
399
0
      }
400
2.24k
    } else {
401
2.24k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
2.24k
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
2.24k
      out += 32;
405
2.24k
    }
406
2.24k
  }
407
15.2k
  return ((char *)out - (char *)dst) +
408
15.2k
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
15.2k
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
15.2k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
579
                        size_t line_length = simdutf::default_line_length) {
157
579
  size_t offset = 0;
158
159
579
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
579
  const uint8_t *input = (const uint8_t *)src;
164
165
579
  uint8_t *out = (uint8_t *)dst;
166
579
  const __m256i shuf =
167
579
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
579
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
579
  size_t i = 0;
171
266k
  for (; i + 100 <= srclen; i += 96) {
172
265k
    const __m128i lo0 = _mm_loadu_si128(
173
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
265k
    const __m128i hi0 = _mm_loadu_si128(
175
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
265k
    const __m128i lo1 = _mm_loadu_si128(
177
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
265k
    const __m128i hi1 = _mm_loadu_si128(
179
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
265k
    const __m128i lo2 = _mm_loadu_si128(
181
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
265k
    const __m128i hi2 = _mm_loadu_si128(
183
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
265k
    const __m128i lo3 = _mm_loadu_si128(
185
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
265k
    const __m128i hi3 = _mm_loadu_si128(
187
265k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
265k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
265k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
265k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
265k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
265k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
265k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
265k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
265k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
265k
    const __m256i t1_0 =
200
265k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
265k
    const __m256i t1_1 =
202
265k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
265k
    const __m256i t1_2 =
204
265k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
265k
    const __m256i t1_3 =
206
265k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
265k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
265k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
265k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
265k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
265k
    const __m256i t3_0 =
214
265k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
265k
    const __m256i t3_1 =
216
265k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
265k
    const __m256i t3_2 =
218
265k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
265k
    const __m256i t3_3 =
220
265k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
265k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
265k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
265k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
265k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
265k
    if (use_lines) {
228
265k
      if (line_length >= 32) { // fast path
229
36.2k
        __m256i result;
230
36.2k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
36.2k
        if (offset + 32 > line_length) {
232
17.7k
          size_t location_end = line_length - offset;
233
17.7k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
17.7k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
17.7k
          _mm256_storeu_si256(
237
17.7k
              reinterpret_cast<__m256i *>(out),
238
17.7k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
17.7k
          offset = to_move;
240
17.7k
          out += 32 + 1;
241
18.4k
        } else {
242
18.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
18.4k
          offset += 32;
244
18.4k
          out += 32;
245
18.4k
        }
246
36.2k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
36.2k
        if (offset + 32 > line_length) {
249
16.0k
          size_t location_end = line_length - offset;
250
16.0k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
16.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
16.0k
          _mm256_storeu_si256(
255
16.0k
              reinterpret_cast<__m256i *>(out),
256
16.0k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
16.0k
          offset = to_move;
260
16.0k
          out += 32 + 1;
261
20.2k
        } else {
262
263
20.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
20.2k
          offset += 32;
266
20.2k
          out += 32;
267
20.2k
        }
268
36.2k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
36.2k
        if (offset + 32 > line_length) {
271
17.6k
          size_t location_end = line_length - offset;
272
17.6k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
17.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
17.6k
          _mm256_storeu_si256(
277
17.6k
              reinterpret_cast<__m256i *>(out),
278
17.6k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
17.6k
          offset = to_move;
282
17.6k
          out += 32 + 1;
283
18.5k
        } else {
284
18.5k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
18.5k
          offset += 32;
286
18.5k
          out += 32;
287
18.5k
        }
288
36.2k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
36.2k
        if (offset + 32 > line_length) {
291
16.1k
          size_t location_end = line_length - offset;
292
16.1k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
16.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
16.1k
          _mm256_storeu_si256(
297
16.1k
              reinterpret_cast<__m256i *>(out),
298
16.1k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
16.1k
          offset = to_move;
302
16.1k
          out += 32 + 1;
303
20.0k
        } else {
304
20.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
20.0k
          offset += 32;
306
20.0k
          out += 32;
307
20.0k
        }
308
229k
      } else { // slow path
309
        // could be optimized
310
229k
        uint8_t buffer[128];
311
229k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
229k
                            lookup_pshufb_improved<isbase64url>(input0));
313
229k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
229k
                            lookup_pshufb_improved<isbase64url>(input1));
315
229k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
229k
                            lookup_pshufb_improved<isbase64url>(input2));
317
229k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
229k
                            lookup_pshufb_improved<isbase64url>(input3));
319
229k
        size_t out_pos = 0;
320
229k
        size_t local_offset = offset;
321
29.6M
        for (size_t j = 0; j < 128;) {
322
29.3M
          if (local_offset == line_length) {
323
6.86M
            out[out_pos++] = '\n';
324
6.86M
            local_offset = 0;
325
6.86M
          }
326
29.3M
          out[out_pos++] = buffer[j++];
327
29.3M
          local_offset++;
328
29.3M
        }
329
229k
        offset = local_offset;
330
229k
        out += out_pos;
331
229k
      }
332
265k
    } else {
333
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
0
                          lookup_pshufb_improved<isbase64url>(input0));
335
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
0
                          lookup_pshufb_improved<isbase64url>(input1));
337
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
0
                          lookup_pshufb_improved<isbase64url>(input2));
339
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
0
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
0
      out += 128;
343
0
    }
344
265k
  }
345
1.14k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
565
    const __m128i lo =
349
565
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
565
    const __m128i hi =
351
565
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
565
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
565
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
565
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
565
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
565
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
565
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
565
    if (use_lines) {
366
565
      if (line_length >= 32) { // fast path
367
310
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
310
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
310
        if (offset + 32 > line_length) {
371
103
          size_t location_end = line_length - offset;
372
103
          size_t to_move = 32 - location_end;
373
103
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
103
          out[location_end] = '\n';
375
103
          offset = to_move;
376
103
          out += 32 + 1;
377
207
        } else {
378
207
          offset += 32;
379
207
          out += 32;
380
207
        }
381
310
      } else { // slow path
382
        // could be optimized
383
255
        alignas(32) uint8_t buffer[32];
384
255
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
255
                            lookup_pshufb_improved<isbase64url>(indices));
386
255
        std::memcpy(out, buffer, 32);
387
255
        size_t out_pos = 0;
388
255
        size_t local_offset = offset;
389
8.41k
        for (size_t j = 0; j < 32;) {
390
8.16k
          if (local_offset == line_length) {
391
1.49k
            out[out_pos++] = '\n';
392
1.49k
            local_offset = 0;
393
1.49k
          }
394
8.16k
          out[out_pos++] = buffer[j++];
395
8.16k
          local_offset++;
396
8.16k
        }
397
255
        offset = local_offset;
398
255
        out += out_pos;
399
255
      }
400
565
    } else {
401
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
0
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
0
      out += 32;
405
0
    }
406
565
  }
407
579
  return ((char *)out - (char *)dst) +
408
579
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
579
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
579
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
564
                        size_t line_length = simdutf::default_line_length) {
157
564
  size_t offset = 0;
158
159
564
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
564
  const uint8_t *input = (const uint8_t *)src;
164
165
564
  uint8_t *out = (uint8_t *)dst;
166
564
  const __m256i shuf =
167
564
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
564
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
564
  size_t i = 0;
171
426k
  for (; i + 100 <= srclen; i += 96) {
172
425k
    const __m128i lo0 = _mm_loadu_si128(
173
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
425k
    const __m128i hi0 = _mm_loadu_si128(
175
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
425k
    const __m128i lo1 = _mm_loadu_si128(
177
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
425k
    const __m128i hi1 = _mm_loadu_si128(
179
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
425k
    const __m128i lo2 = _mm_loadu_si128(
181
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
425k
    const __m128i hi2 = _mm_loadu_si128(
183
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
425k
    const __m128i lo3 = _mm_loadu_si128(
185
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
425k
    const __m128i hi3 = _mm_loadu_si128(
187
425k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
425k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
425k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
425k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
425k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
425k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
425k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
425k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
425k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
425k
    const __m256i t1_0 =
200
425k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
425k
    const __m256i t1_1 =
202
425k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
425k
    const __m256i t1_2 =
204
425k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
425k
    const __m256i t1_3 =
206
425k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
425k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
425k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
425k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
425k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
425k
    const __m256i t3_0 =
214
425k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
425k
    const __m256i t3_1 =
216
425k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
425k
    const __m256i t3_2 =
218
425k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
425k
    const __m256i t3_3 =
220
425k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
425k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
425k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
425k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
425k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
425k
    if (use_lines) {
228
425k
      if (line_length >= 32) { // fast path
229
77.2k
        __m256i result;
230
77.2k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
77.2k
        if (offset + 32 > line_length) {
232
35.1k
          size_t location_end = line_length - offset;
233
35.1k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
35.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
35.1k
          _mm256_storeu_si256(
237
35.1k
              reinterpret_cast<__m256i *>(out),
238
35.1k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
35.1k
          offset = to_move;
240
35.1k
          out += 32 + 1;
241
42.0k
        } else {
242
42.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
42.0k
          offset += 32;
244
42.0k
          out += 32;
245
42.0k
        }
246
77.2k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
77.2k
        if (offset + 32 > line_length) {
249
34.1k
          size_t location_end = line_length - offset;
250
34.1k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
34.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
34.1k
          _mm256_storeu_si256(
255
34.1k
              reinterpret_cast<__m256i *>(out),
256
34.1k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
34.1k
          offset = to_move;
260
34.1k
          out += 32 + 1;
261
43.0k
        } else {
262
263
43.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
43.0k
          offset += 32;
266
43.0k
          out += 32;
267
43.0k
        }
268
77.2k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
77.2k
        if (offset + 32 > line_length) {
271
35.0k
          size_t location_end = line_length - offset;
272
35.0k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
35.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
35.0k
          _mm256_storeu_si256(
277
35.0k
              reinterpret_cast<__m256i *>(out),
278
35.0k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
35.0k
          offset = to_move;
282
35.0k
          out += 32 + 1;
283
42.1k
        } else {
284
42.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
42.1k
          offset += 32;
286
42.1k
          out += 32;
287
42.1k
        }
288
77.2k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
77.2k
        if (offset + 32 > line_length) {
291
34.2k
          size_t location_end = line_length - offset;
292
34.2k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
34.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
34.2k
          _mm256_storeu_si256(
297
34.2k
              reinterpret_cast<__m256i *>(out),
298
34.2k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
34.2k
          offset = to_move;
302
34.2k
          out += 32 + 1;
303
43.0k
        } else {
304
43.0k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
43.0k
          offset += 32;
306
43.0k
          out += 32;
307
43.0k
        }
308
348k
      } else { // slow path
309
        // could be optimized
310
348k
        uint8_t buffer[128];
311
348k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
348k
                            lookup_pshufb_improved<isbase64url>(input0));
313
348k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
348k
                            lookup_pshufb_improved<isbase64url>(input1));
315
348k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
348k
                            lookup_pshufb_improved<isbase64url>(input2));
317
348k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
348k
                            lookup_pshufb_improved<isbase64url>(input3));
319
348k
        size_t out_pos = 0;
320
348k
        size_t local_offset = offset;
321
44.9M
        for (size_t j = 0; j < 128;) {
322
44.5M
          if (local_offset == line_length) {
323
10.4M
            out[out_pos++] = '\n';
324
10.4M
            local_offset = 0;
325
10.4M
          }
326
44.5M
          out[out_pos++] = buffer[j++];
327
44.5M
          local_offset++;
328
44.5M
        }
329
348k
        offset = local_offset;
330
348k
        out += out_pos;
331
348k
      }
332
425k
    } else {
333
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
0
                          lookup_pshufb_improved<isbase64url>(input0));
335
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
0
                          lookup_pshufb_improved<isbase64url>(input1));
337
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
0
                          lookup_pshufb_improved<isbase64url>(input2));
339
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
0
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
0
      out += 128;
343
0
    }
344
425k
  }
345
1.11k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
554
    const __m128i lo =
349
554
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
554
    const __m128i hi =
351
554
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
554
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
554
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
554
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
554
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
554
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
554
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
554
    if (use_lines) {
366
554
      if (line_length >= 32) { // fast path
367
261
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
261
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
261
        if (offset + 32 > line_length) {
371
81
          size_t location_end = line_length - offset;
372
81
          size_t to_move = 32 - location_end;
373
81
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
81
          out[location_end] = '\n';
375
81
          offset = to_move;
376
81
          out += 32 + 1;
377
180
        } else {
378
180
          offset += 32;
379
180
          out += 32;
380
180
        }
381
293
      } else { // slow path
382
        // could be optimized
383
293
        alignas(32) uint8_t buffer[32];
384
293
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
293
                            lookup_pshufb_improved<isbase64url>(indices));
386
293
        std::memcpy(out, buffer, 32);
387
293
        size_t out_pos = 0;
388
293
        size_t local_offset = offset;
389
9.66k
        for (size_t j = 0; j < 32;) {
390
9.37k
          if (local_offset == line_length) {
391
1.84k
            out[out_pos++] = '\n';
392
1.84k
            local_offset = 0;
393
1.84k
          }
394
9.37k
          out[out_pos++] = buffer[j++];
395
9.37k
          local_offset++;
396
9.37k
        }
397
293
        offset = local_offset;
398
293
        out += out_pos;
399
293
      }
400
554
    } else {
401
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
0
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
0
      out += 32;
405
0
    }
406
554
  }
407
564
  return ((char *)out - (char *)dst) +
408
564
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
564
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
564
}
411
412
// Thin entry point: forwards to avx2_encode_base64_impl with the second
// template argument fixed to false (presumably use_lines, i.e. no line
// breaks -- confirm against the impl's template header) and without passing a
// line length, so the impl's default applies.
// Returns the impl's result unchanged (presumably the number of bytes written
// to dst -- confirm).
template <bool isbase64url>
413
size_t encode_base64(char *dst, const char *src, size_t srclen,
414
15.8k
                     base64_options options) {
415
15.8k
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
15.8k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<true>(char*, char const*, unsigned long, simdutf::base64_options)
Line
Count
Source
414
579
                     base64_options options) {
415
579
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
579
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<false>(char*, char const*, unsigned long, simdutf::base64_options)
Line
Count
Source
414
15.2k
                     base64_options options) {
415
15.2k
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
15.2k
}
417
418
380k
// Stores to `output` a 16-byte vector derived from `data` by two table-driven
// byte shuffles keyed on `mask` (one bit per byte of `data`).
// Judging by the 256-bit overload, which advances its second write by the
// number of zero bits in the low half of the mask, set bits presumably mark
// bytes to drop and the surviving bytes end up packed at the front -- confirm
// against the tables in tables::base64.
// A full 16 bytes are always written, whatever the mask, so `output` must
// have 16 writable bytes.
static inline void compress(__m128i data, uint16_t mask, char *output) {
  // Fast path: an all-zero mask stores `data` unchanged.
419
380k
  if (mask == 0) {
420
22.7k
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
421
22.7k
    return;
422
22.7k
  }
423
  // this particular implementation was inspired by work done by @animetosho
424
  // we do it in two steps, first 8 bytes and then second 8 bytes
425
358k
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
426
358k
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
427
  // next line just loads the 64-bit values thintable_epi8[mask1] and
428
  // thintable_epi8[mask2] into a 128-bit register, using only
429
  // two instructions on most compilers.
430
431
358k
  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
432
358k
                                    tables::base64::thintable_epi8[mask1]);
433
  // we increment by 0x08 the second half of the mask
  // (the table entries are 8-byte shuffles; adding 8 makes the upper entry
  // address bytes 8..15 of `data`)
434
358k
  shufmask =
435
358k
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
436
  // this is the version "nearly pruned"
437
358k
  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
438
  // we still need to put the two halves together.
439
  // we compute the popcount of the first half:
  // (read from a table; its name suggests the stored value is twice the
  // popcount -- TODO confirm; it is scaled by 8 below to index the combine
  // table)
440
358k
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
441
  // then load the corresponding mask, what it does is to write
442
  // only the first pop1 bytes from the first 8 bytes, and then
443
  // it fills in with the bytes from the second 8 bytes + some filling
444
  // at the end.
445
358k
  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
446
358k
      tables::base64::pshufb_combine_table + pop1 * 8));
447
358k
  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
448
449
358k
  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
450
358k
}
451
452
// --- decoding -----------------------------------------------
453
454
// 256-bit variant of compress(): handles the two 128-bit halves of `data`
// with the 128-bit overload, using the low and high 16 bits of `mask`.
// The second half is written right after the bytes kept from the first half
// (as many bytes as there are zero bits in the low 16 bits of `mask`).
// Each 128-bit call stores a full 16 bytes and the fast path stores 32, so
// `output` must have 32 writable bytes.
template <typename = void>
455
199k
simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) {
  // Fast path: an all-zero mask stores the 32 bytes unchanged.
456
199k
  if (mask == 0) {
457
8.73k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
458
8.73k
    return;
459
8.73k
  }
460
190k
  compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
  // The offset is the count of zero bits in the low half of the mask; the
  // second store overwrites whatever the first store wrote past that point.
461
190k
  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
462
190k
           output + count_ones(~mask & 0xFFFF));
463
190k
}
464
465
// Packs the 32 bytes of `str` (presumably one 6-bit value per byte -- the
// caller is expected to have translated the characters already) into 24
// output bytes.
// The output is written with two overlapping 16-byte stores, at out and at
// out + 12, so 28 bytes starting at `out` are touched: out[0..24) carry data
// and out[24..28) receive zeros from the shuffle's -1 entries.
template <typename = void>
466
2.91M
simdutf_really_inline void base64_decode(char *out, __m256i str) {
467
  // credit: aqrit
  // Per 128-bit lane: take bytes 2,1,0 of every 32-bit word (reversing them)
  // and zero the last four positions (index -1).
468
2.91M
  const __m256i pack_shuffle =
469
2.91M
      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
470
2.91M
                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
  // Pairs of bytes -> 16-bit words: first * 0x40 + second.
471
2.91M
  const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
  // Pairs of 16-bit words -> 32-bit words: first * 0x1000 + second.
472
2.91M
  const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
473
2.91M
  const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
474
475
  // Store the output:
  // The second store starts at out + 12 and overwrites the four zero bytes
  // left at the end of the first one.
476
2.91M
  _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
477
2.91M
  _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
478
2.91M
}
479
480
// Decodes 64 bytes read from `src` as two 32-byte halves, writing the results
// at out and out + 24.
// Because each base64_decode call stores 28 bytes, the write extends to
// out + 52 even though only 48 bytes are meaningful; see
// base64_decode_block_safe for the variant that stays within 48 bytes.
template <typename = void>
481
64.5k
simdutf_really_inline void base64_decode_block(char *out, const char *src) {
482
64.5k
  base64_decode(out,
483
64.5k
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
484
64.5k
  base64_decode(out + 24, _mm256_loadu_si256(
485
64.5k
                              reinterpret_cast<const __m256i *>(src + 32)));
486
64.5k
}
487
488
// Same decoding as base64_decode_block, but the second half is decoded into a
// local buffer and only 24 bytes of it are copied out, so nothing is written
// at or beyond out + 48.
// (The first base64_decode call still stores 28 bytes at `out`; the extra
// four bytes are then overwritten by the memcpy.)
template <typename = void>
489
simdutf_really_inline void base64_decode_block_safe(char *out,
490
120
                                                    const char *src) {
491
120
  base64_decode(out,
492
120
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
493
120
  alignas(32) char buffer[32]; // We enforce safety with a buffer.
494
120
  base64_decode(
495
120
      buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
496
120
  std::memcpy(out + 24, buffer, 24);
497
120
}
498
499
// --- decoding - base64 class --------------------------------
500
501
class block64 {
502
  __m256i chunks[2];
503
504
public:
505
  // Loads 64 bytes from src into the two 32-byte chunks (unaligned loads).
  // The caller of this function is responsible to ensure that there are 64
506
  // bytes available from reading at src.
507
1.49M
  simdutf_really_inline block64(const char *src) {
508
1.49M
    chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
509
1.49M
    chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
510
1.49M
  }
511
512
  // Loads 64 char16_t code units from src and narrows them to 64 bytes held
  // in the two chunks, preserving their order.
  // The narrowing is _mm256_packus_epi16, which reads its inputs as signed
  // 16-bit and saturates to 0..255: code units 0x0100..0x7FFF become 0xFF and
  // code units >= 0x8000 become 0x00 (how those bytes are then classified is
  // decided by to_base64_mask).
  // The caller of this function is responsible to ensure that there are 128
513
  // bytes available from reading at src.
514
50.6k
  simdutf_really_inline block64(const char16_t *src) {
515
50.6k
    const auto m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
516
50.6k
    const auto m2 =
517
50.6k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
518
50.6k
    const auto m3 =
519
50.6k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
520
50.6k
    const auto m4 =
521
50.6k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
522
    // packus works within each 128-bit lane, so first regroup the halves:
    // 0x20 gathers the low lanes of both inputs, 0x31 the high lanes. After
    // that the in-lane pack emits the code units in their original order.
523
50.6k
    const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
524
50.6k
    const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
525
50.6k
    const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
526
50.6k
    const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
527
528
50.6k
    chunks[0] = _mm256_packus_epi16(m1p, m2p);
529
50.6k
    chunks[1] = _mm256_packus_epi16(m3p, m4p);
530
50.6k
  }
531
532
16.3k
  // Writes the current contents of both chunks (64 bytes) to output using
  // unaligned stores; output must have 64 writable bytes.
  simdutf_really_inline void copy_block(char *output) {
533
16.3k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]);
534
16.3k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]);
535
16.3k
  }
536
537
  // decode 64 bytes and output 48 bytes
  // Decodes chunks[0] to out and chunks[1] to out + 24. Each base64_decode
  // call stores 28 bytes, so memory up to out + 52 is written; use
  // base64_decode_block_safe when only 48 bytes are available.
538
1.39M
  simdutf_really_inline void base64_decode_block(char *out) {
539
1.39M
    base64_decode(out, chunks[0]);
540
1.39M
    base64_decode(out + 24, chunks[1]);
541
1.39M
  }
542
543
1.33k
  // Same result as base64_decode_block, but the second chunk is decoded into
  // a local buffer and only its 24 data bytes are copied to out + 24, so
  // nothing is written at or beyond out + 48.
  simdutf_really_inline void base64_decode_block_safe(char *out) {
544
1.33k
    base64_decode(out, chunks[0]);
545
1.33k
    alignas(32) char buffer[32]; // We enforce safety with a buffer.
546
1.33k
    base64_decode(buffer, chunks[1]);
547
1.33k
    std::memcpy(out + 24, buffer, 24);
548
1.33k
  }
549
550
  // Runs the per-chunk to_base64_mask overload on both chunks (which rewrites
  // them in place) and merges the two 32-bit results into one 64-bit mask:
  // chunk 0 in bits 0..31, chunk 1 in bits 32..63.
  // *error receives the merged per-chunk error bits in the same layout, but
  // only when ignore_garbage is false; otherwise *error is not written.
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
551
1.54M
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
    // Zero-initialised because the per-chunk overload does not write its
    // error output on every path.
552
1.54M
    uint32_t err0 = 0;
553
1.54M
    uint32_t err1 = 0;
554
1.54M
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
1.54M
        &chunks[0], &err0);
556
1.54M
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
1.54M
        &chunks[1], &err1);
558
1.54M
    if (!ignore_garbage) {
559
1.54M
      *error = err0 | ((uint64_t)err1 << 32);
560
1.54M
    }
561
1.54M
    return m0 | (m1 << 32);
562
1.54M
  }
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(unsigned long*)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(unsigned long*)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(unsigned long*)
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(unsigned long*)
Line
Count
Source
551
593k
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
552
593k
    uint32_t err0 = 0;
553
593k
    uint32_t err1 = 0;
554
593k
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
593k
        &chunks[0], &err0);
556
593k
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
593k
        &chunks[1], &err1);
558
593k
    if (!ignore_garbage) {
559
593k
      *error = err0 | ((uint64_t)err1 << 32);
560
593k
    }
561
593k
    return m0 | (m1 << 32);
562
593k
  }
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, false>(unsigned long*)
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, false>(unsigned long*)
Line
Count
Source
551
951k
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
552
951k
    uint32_t err0 = 0;
553
951k
    uint32_t err1 = 0;
554
951k
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
951k
        &chunks[0], &err0);
556
951k
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
951k
        &chunks[1], &err1);
558
951k
    if (!ignore_garbage) {
559
951k
      *error = err0 | ((uint64_t)err1 << 32);
560
951k
    }
561
951k
    return m0 | (m1 << 32);
562
951k
  }
563
564
  // Classifies and translates one 32-byte chunk in place.
  //  - *src is overwritten with delta_values[delta_hash] + byte (signed
  //    saturating add); presumably the 6-bit value for valid characters.
  //  - The return value holds one bit per byte, taken from the sign bit of
  //    check_values[check_hash] + byte; presumably set for every byte that is
  //    not a base64 character -- confirm against the tables' derivation.
  //  - When ignore_garbage is false and the mask is non-zero, *error receives
  //    the mask XOR the whitespace mask. On every other path *error is left
  //    untouched, so the caller must initialise it beforehand.
  // base64_url and default_or_url select the lookup tables (default_or_url
  // takes precedence and additionally masks delta_hash to 4 bits).
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
565
3.09M
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
    // Whitespace lookup keyed by the low nibble: entry n holds the whitespace
    // byte whose low nibble is n (0x20, 0x09, 0x0a, 0x0c, 0x0d), 0 elsewhere.
    // The same 16 entries are repeated for both 128-bit lanes.
566
3.09M
    const __m256i ascii_space_tbl =
567
3.09M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
3.09M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
3.09M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
3.09M
    __m256i delta_asso;
572
3.09M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
3.09M
    } else if (base64_url) {
578
1.18M
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
1.18M
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
1.18M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
1.18M
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.90M
    } else {
583
1.90M
      delta_asso = _mm256_setr_epi8(
584
1.90M
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
1.90M
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
1.90M
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
1.90M
    }
588
589
3.09M
    __m256i delta_values;
590
3.09M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
3.09M
    } else if (base64_url) {
601
1.18M
      delta_values = _mm256_setr_epi8(
602
1.18M
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
1.18M
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
1.18M
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
1.18M
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
1.18M
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.90M
    } else {
608
1.90M
      delta_values = _mm256_setr_epi8(
609
1.90M
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
1.90M
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
1.90M
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
1.90M
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
1.90M
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
1.90M
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
1.90M
          int8_t(0xB9), int8_t(0xB9));
616
1.90M
    }
617
618
3.09M
    __m256i check_asso;
619
3.09M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
3.09M
    } else if (base64_url) {
625
1.18M
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
1.18M
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
1.18M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
1.18M
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.90M
    } else {
630
1.90M
      check_asso = _mm256_setr_epi8(
631
1.90M
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
1.90M
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
1.90M
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
1.90M
    }
635
3.09M
    __m256i check_values;
636
3.09M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
3.09M
    } else if (base64_url) {
647
1.18M
      check_values = _mm256_setr_epi8(
648
1.18M
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
1.18M
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
1.18M
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
1.18M
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
1.18M
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
1.18M
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
1.18M
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.90M
    } else {
656
1.90M
      check_values = _mm256_setr_epi8(
657
1.90M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
1.90M
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
1.90M
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
1.90M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
1.90M
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
1.90M
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
1.90M
          int8_t(0x91), int8_t(0x80));
664
1.90M
    }
    // Both hashes are the rounded byte-wise average of a nibble-keyed table
    // lookup and the input shifted right by 3. The shift is applied per 32-bit
    // word, so the top three bits of each byte come from the next byte up.
665
3.09M
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
3.09M
    __m256i delta_hash =
667
3.09M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
3.09M
    if (default_or_url) {
669
0
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
3.09M
    const __m256i check_hash =
672
3.09M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
    // out: translated bytes; chk: its sign bit becomes the per-byte flag.
673
3.09M
    const __m256i out =
674
3.09M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
3.09M
    const __m256i chk =
676
3.09M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
3.09M
    const int mask = _mm256_movemask_epi8(chk);
    // Only when something was flagged: find the bytes equal to their
    // whitespace-table entry and XOR them out of the mask. The result is
    // "flagged and not whitespace" provided every whitespace byte is itself
    // flagged in mask (presumably guaranteed by the check tables -- confirm).
678
3.09M
    if (!ignore_garbage && mask) {
679
226k
      __m256i ascii_space =
680
226k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
226k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
226k
    }
683
3.09M
    *src = out;
684
3.09M
    return (uint32_t)mask;
685
3.09M
  }
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(long long __vector(4)*, unsigned int*)
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(long long __vector(4)*, unsigned int*)
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(long long __vector(4)*, unsigned int*)
simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(long long __vector(4)*, unsigned int*)
Line
Count
Source
565
1.18M
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
566
1.18M
    const __m256i ascii_space_tbl =
567
1.18M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
1.18M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
1.18M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
1.18M
    __m256i delta_asso;
572
1.18M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
1.18M
    } else if (base64_url) {
578
1.18M
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
1.18M
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
1.18M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
1.18M
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.18M
    } else {
583
0
      delta_asso = _mm256_setr_epi8(
584
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
0
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
0
    }
588
589
1.18M
    __m256i delta_values;
590
1.18M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
1.18M
    } else if (base64_url) {
601
1.18M
      delta_values = _mm256_setr_epi8(
602
1.18M
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
1.18M
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
1.18M
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
1.18M
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
1.18M
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.18M
    } else {
608
0
      delta_values = _mm256_setr_epi8(
609
0
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
0
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
0
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
0
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
0
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
0
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
0
          int8_t(0xB9), int8_t(0xB9));
616
0
    }
617
618
1.18M
    __m256i check_asso;
619
1.18M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
1.18M
    } else if (base64_url) {
625
1.18M
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
1.18M
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
1.18M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
1.18M
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.18M
    } else {
630
0
      check_asso = _mm256_setr_epi8(
631
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
0
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
0
    }
635
1.18M
    __m256i check_values;
636
1.18M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
1.18M
    } else if (base64_url) {
647
1.18M
      check_values = _mm256_setr_epi8(
648
1.18M
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
1.18M
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
1.18M
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
1.18M
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
1.18M
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
1.18M
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
1.18M
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.18M
    } else {
656
0
      check_values = _mm256_setr_epi8(
657
0
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
0
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
0
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
0
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
0
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
0
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
0
          int8_t(0x91), int8_t(0x80));
664
0
    }
665
1.18M
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
1.18M
    __m256i delta_hash =
667
1.18M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
1.18M
    if (default_or_url) {
669
0
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
1.18M
    const __m256i check_hash =
672
1.18M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
673
1.18M
    const __m256i out =
674
1.18M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
1.18M
    const __m256i chk =
676
1.18M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
1.18M
    const int mask = _mm256_movemask_epi8(chk);
678
1.18M
    if (!ignore_garbage && mask) {
679
85.5k
      __m256i ascii_space =
680
85.5k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
85.5k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
85.5k
    }
683
1.18M
    *src = out;
684
1.18M
    return (uint32_t)mask;
685
1.18M
  }
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, false>(long long __vector(4)*, unsigned int*)
simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, false>(long long __vector(4)*, unsigned int*)
Line
Count
Source
565
1.90M
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
566
1.90M
    const __m256i ascii_space_tbl =
567
1.90M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
1.90M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
1.90M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
1.90M
    __m256i delta_asso;
572
1.90M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
1.90M
    } else if (base64_url) {
578
0
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
0
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
0
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
0
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.90M
    } else {
583
1.90M
      delta_asso = _mm256_setr_epi8(
584
1.90M
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
1.90M
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
1.90M
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
1.90M
    }
588
589
1.90M
    __m256i delta_values;
590
1.90M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
1.90M
    } else if (base64_url) {
601
0
      delta_values = _mm256_setr_epi8(
602
0
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
0
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
0
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
0
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.90M
    } else {
608
1.90M
      delta_values = _mm256_setr_epi8(
609
1.90M
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
1.90M
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
1.90M
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
1.90M
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
1.90M
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
1.90M
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
1.90M
          int8_t(0xB9), int8_t(0xB9));
616
1.90M
    }
617
618
1.90M
    __m256i check_asso;
619
1.90M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
1.90M
    } else if (base64_url) {
625
0
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
0
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
0
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
0
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.90M
    } else {
630
1.90M
      check_asso = _mm256_setr_epi8(
631
1.90M
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
1.90M
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
1.90M
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
1.90M
    }
635
1.90M
    __m256i check_values;
636
1.90M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
1.90M
    } else if (base64_url) {
647
0
      check_values = _mm256_setr_epi8(
648
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
0
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
0
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
0
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
0
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
0
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.90M
    } else {
656
1.90M
      check_values = _mm256_setr_epi8(
657
1.90M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
1.90M
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
1.90M
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
1.90M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
1.90M
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
1.90M
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
1.90M
          int8_t(0x91), int8_t(0x80));
664
1.90M
    }
665
1.90M
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
1.90M
    __m256i delta_hash =
667
1.90M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
1.90M
    if (default_or_url) {
669
0
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
1.90M
    const __m256i check_hash =
672
1.90M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
673
1.90M
    const __m256i out =
674
1.90M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
1.90M
    const __m256i chk =
676
1.90M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
1.90M
    const int mask = _mm256_movemask_epi8(chk);
678
1.90M
    if (!ignore_garbage && mask) {
679
140k
      __m256i ascii_space =
680
140k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
140k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
140k
    }
683
1.90M
    *src = out;
684
1.90M
    return (uint32_t)mask;
685
1.90M
  }
686
687
132k
  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
688
132k
    if (is_power_of_two(mask)) {
689
33.0k
      return compress_block_single(mask, output);
690
33.0k
    }
691
692
99.5k
    uint64_t nmask = ~mask;
693
99.5k
    compress(chunks[0], uint32_t(mask), output);
694
99.5k
    compress(chunks[1], uint32_t(mask >> 32),
695
99.5k
             output + count_ones(nmask & 0xFFFFFFFF));
696
99.5k
    return count_ones(nmask);
697
132k
  }
698
699
  simdutf_really_inline size_t compress_block_single(uint64_t mask,
700
33.0k
                                                     char *output) {
701
33.0k
    const size_t pos64 = trailing_zeroes(mask);
702
33.0k
    const int8_t pos = pos64 & 0xf;
703
33.0k
    switch (pos64 >> 4) {
704
7.94k
    case 0b00: {
705
7.94k
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
706
7.94k
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
707
708
7.94k
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
709
7.94k
      const __m128i v1 =
710
7.94k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
711
7.94k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
712
7.94k
      const __m128i sh = _mm_sub_epi8(v1, v2);
713
7.94k
      const __m128i compressed = _mm_shuffle_epi8(lane0, sh);
714
715
7.94k
      _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
716
7.94k
      _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1);
717
7.94k
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
718
7.94k
    } break;
719
8.71k
    case 0b01: {
720
8.71k
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
721
8.71k
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
722
8.71k
      _mm_storeu_si128((__m128i *)(output + 0 * 16), lane0);
723
724
8.71k
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
725
8.71k
      const __m128i v1 =
726
8.71k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
727
8.71k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
728
8.71k
      const __m128i sh = _mm_sub_epi8(v1, v2);
729
8.71k
      const __m128i compressed = _mm_shuffle_epi8(lane1, sh);
730
731
8.71k
      _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
732
8.71k
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
733
8.71k
    } break;
734
8.68k
    case 0b10: {
735
8.68k
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
736
8.68k
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
737
738
8.68k
      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
739
740
8.68k
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
741
8.68k
      const __m128i v1 =
742
8.68k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
743
8.68k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
744
8.68k
      const __m128i sh = _mm_sub_epi8(v1, v2);
745
8.68k
      const __m128i compressed = _mm_shuffle_epi8(lane2, sh);
746
747
8.68k
      _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
748
8.68k
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3);
749
8.68k
    } break;
750
7.75k
    case 0b11: {
751
7.75k
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
752
7.75k
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
753
754
7.75k
      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
755
7.75k
      _mm_storeu_si128((__m128i *)(output + 2 * 16), lane2);
756
757
7.75k
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
758
7.75k
      const __m128i v1 =
759
7.75k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
760
7.75k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
761
7.75k
      const __m128i sh = _mm_sub_epi8(v1, v2);
762
7.75k
      const __m128i compressed = _mm_shuffle_epi8(lane3, sh);
763
764
7.75k
      _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
765
7.75k
    } break;
766
33.0k
    }
767
768
33.0k
    return 63;
769
33.0k
  }
770
};