Coverage Report

Created: 2026-01-17 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/src/haswell/avx2_base64.cpp
Line
Count
Source
1
/**
2
 * References and further reading:
3
 *
4
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
5
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
6
 * https://arxiv.org/abs/1910.05109
7
 *
8
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
9
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
10
 * https://arxiv.org/abs/1704.00605
11
 *
12
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
13
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
14
 * Request for Comments: 4648.
15
 *
16
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
17
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
18
 *
19
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
20
 * acceleration. https://github.com/aklomp/base64. (2014).
21
 *
22
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
23
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
24
 *
25
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
26
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
27
 */
28
29
template <bool base64_url>
30
5.55M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
5.55M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
5.55M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
5.55M
  result =
36
5.55M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
5.55M
  __m256i shift_LUT;
38
5.55M
  if (base64_url) {
39
2.69M
    shift_LUT = _mm256_setr_epi8(
40
2.69M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
2.69M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
2.69M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
2.69M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
2.86M
  } else {
46
2.86M
    shift_LUT = _mm256_setr_epi8(
47
2.86M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
2.86M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
2.86M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
2.86M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
2.86M
  }
53
54
5.55M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
5.55M
  return _mm256_add_epi8(result, input);
56
5.55M
}
simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<true>(long long __vector(4))
Line
Count
Source
30
2.69M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
2.69M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
2.69M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
2.69M
  result =
36
2.69M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
2.69M
  __m256i shift_LUT;
38
2.69M
  if (base64_url) {
39
2.69M
    shift_LUT = _mm256_setr_epi8(
40
2.69M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
2.69M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
2.69M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
2.69M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
2.69M
  } else {
46
0
    shift_LUT = _mm256_setr_epi8(
47
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
0
  }
53
54
2.69M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
2.69M
  return _mm256_add_epi8(result, input);
56
2.69M
}
simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<false>(long long __vector(4))
Line
Count
Source
30
2.86M
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
31
  // Precomputed shuffle masks for K = 1 to 16
32
  // credit: Wojciech Muła
33
2.86M
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
34
2.86M
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
35
2.86M
  result =
36
2.86M
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
37
2.86M
  __m256i shift_LUT;
38
2.86M
  if (base64_url) {
39
0
    shift_LUT = _mm256_setr_epi8(
40
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
41
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
42
43
0
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
44
0
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
45
2.86M
  } else {
46
2.86M
    shift_LUT = _mm256_setr_epi8(
47
2.86M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
48
2.86M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
49
50
2.86M
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
51
2.86M
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
52
2.86M
  }
53
54
2.86M
  result = _mm256_shuffle_epi8(shift_LUT, result);
55
2.86M
  return _mm256_add_epi8(result, input);
56
2.86M
}
57
58
193k
/**
 * Insert a line feed at byte position K of a 32-byte vector.
 *
 * The result holds input[0..K-1], then '\n', then input[K..30]. The last
 * input byte (input[31]) no longer fits in the 32-byte result and is not part
 * of it.
 *
 * K is used directly as a table index and must be in the range 0..31; no
 * bounds check is performed.
 */
simdutf_really_inline __m256i insert_line_feed32(__m256i input, int K) {
  // Precomputed byte-shuffle masks, one row per insertion position K.
  // The entry 0x80 marks the slot that receives the line feed. Indices are
  // relative to each 128-bit lane because the byte shuffle never crosses
  // lanes.
  static const uint8_t shuffle_masks[32][32] = {
      // K = 0..15: the line feed lands in the lower lane.
      {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      // K = 16..31: the lower lane is copied as is, the line feed lands in
      // the upper lane.
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}};

  if (K < 16) {
    // The insertion pushes byte 15 out of the lower lane. Pre-shift the upper
    // lane by one byte so that it starts with input[15]; the lower lane is
    // kept unchanged by the blend.
    const __m256i lanes_swapped = _mm256_permute2x128_si256(input, input, 0x21);
    const __m256i shifted = _mm256_alignr_epi8(input, lanes_swapped, 15);
    input = _mm256_blend_epi32(input, shifted, 0xF0);
  }

  const __m256i mask = _mm256_loadu_si256(
      reinterpret_cast<const __m256i *>(shuffle_masks[K]));
  // Locate the slot reserved for the line feed.
  const __m256i lf_pos =
      _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80)));
  const __m256i shuffled = _mm256_shuffle_epi8(input, mask);
  return _mm256_blendv_epi8(shuffled, _mm256_set1_epi8('\n'), lf_pos);
}
151
152
template <bool isbase64url, bool use_lines>
153
size_t
154
avx2_encode_base64_impl(char *dst, const char *src, size_t srclen,
155
                        base64_options options,
156
16.3k
                        size_t line_length = simdutf::default_line_length) {
157
16.3k
  size_t offset = 0;
158
159
16.3k
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
16.3k
  const uint8_t *input = (const uint8_t *)src;
164
165
16.3k
  uint8_t *out = (uint8_t *)dst;
166
16.3k
  const __m256i shuf =
167
16.3k
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
16.3k
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
16.3k
  size_t i = 0;
171
1.40M
  for (; i + 100 <= srclen; i += 96) {
172
1.38M
    const __m128i lo0 = _mm_loadu_si128(
173
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
1.38M
    const __m128i hi0 = _mm_loadu_si128(
175
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
1.38M
    const __m128i lo1 = _mm_loadu_si128(
177
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
1.38M
    const __m128i hi1 = _mm_loadu_si128(
179
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
1.38M
    const __m128i lo2 = _mm_loadu_si128(
181
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
1.38M
    const __m128i hi2 = _mm_loadu_si128(
183
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
1.38M
    const __m128i lo3 = _mm_loadu_si128(
185
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
1.38M
    const __m128i hi3 = _mm_loadu_si128(
187
1.38M
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
1.38M
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
1.38M
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
1.38M
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
1.38M
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
1.38M
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
1.38M
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
1.38M
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
1.38M
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
1.38M
    const __m256i t1_0 =
200
1.38M
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
1.38M
    const __m256i t1_1 =
202
1.38M
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
1.38M
    const __m256i t1_2 =
204
1.38M
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
1.38M
    const __m256i t1_3 =
206
1.38M
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
1.38M
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
1.38M
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
1.38M
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
1.38M
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
1.38M
    const __m256i t3_0 =
214
1.38M
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
1.38M
    const __m256i t3_1 =
216
1.38M
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
1.38M
    const __m256i t3_2 =
218
1.38M
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
1.38M
    const __m256i t3_3 =
220
1.38M
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
1.38M
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
1.38M
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
1.38M
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
1.38M
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
1.38M
    if (use_lines) {
228
693k
      if (line_length >= 32) { // fast path
229
141k
        __m256i result;
230
141k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
141k
        if (offset + 32 > line_length) {
232
49.2k
          size_t location_end = line_length - offset;
233
49.2k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
49.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
49.2k
          _mm256_storeu_si256(
237
49.2k
              reinterpret_cast<__m256i *>(out),
238
49.2k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
49.2k
          offset = to_move;
240
49.2k
          out += 32 + 1;
241
92.6k
        } else {
242
92.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
92.6k
          offset += 32;
244
92.6k
          out += 32;
245
92.6k
        }
246
141k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
141k
        if (offset + 32 > line_length) {
249
47.7k
          size_t location_end = line_length - offset;
250
47.7k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
47.7k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
47.7k
          _mm256_storeu_si256(
255
47.7k
              reinterpret_cast<__m256i *>(out),
256
47.7k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
47.7k
          offset = to_move;
260
47.7k
          out += 32 + 1;
261
94.1k
        } else {
262
263
94.1k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
94.1k
          offset += 32;
266
94.1k
          out += 32;
267
94.1k
        }
268
141k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
141k
        if (offset + 32 > line_length) {
271
48.5k
          size_t location_end = line_length - offset;
272
48.5k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
48.5k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
48.5k
          _mm256_storeu_si256(
277
48.5k
              reinterpret_cast<__m256i *>(out),
278
48.5k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
48.5k
          offset = to_move;
282
48.5k
          out += 32 + 1;
283
93.4k
        } else {
284
93.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
93.4k
          offset += 32;
286
93.4k
          out += 32;
287
93.4k
        }
288
141k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
141k
        if (offset + 32 > line_length) {
291
47.9k
          size_t location_end = line_length - offset;
292
47.9k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
47.9k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
47.9k
          _mm256_storeu_si256(
297
47.9k
              reinterpret_cast<__m256i *>(out),
298
47.9k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
47.9k
          offset = to_move;
302
47.9k
          out += 32 + 1;
303
93.9k
        } else {
304
93.9k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
93.9k
          offset += 32;
306
93.9k
          out += 32;
307
93.9k
        }
308
551k
      } else { // slow path
309
        // could be optimized
310
551k
        uint8_t buffer[128];
311
551k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
551k
                            lookup_pshufb_improved<isbase64url>(input0));
313
551k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
551k
                            lookup_pshufb_improved<isbase64url>(input1));
315
551k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
551k
                            lookup_pshufb_improved<isbase64url>(input2));
317
551k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
551k
                            lookup_pshufb_improved<isbase64url>(input3));
319
551k
        size_t out_pos = 0;
320
551k
        size_t local_offset = offset;
321
71.1M
        for (size_t j = 0; j < 128;) {
322
70.5M
          if (local_offset == line_length) {
323
16.2M
            out[out_pos++] = '\n';
324
16.2M
            local_offset = 0;
325
16.2M
          }
326
70.5M
          out[out_pos++] = buffer[j++];
327
70.5M
          local_offset++;
328
70.5M
        }
329
551k
        offset = local_offset;
330
551k
        out += out_pos;
331
551k
      }
332
695k
    } else {
333
695k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
695k
                          lookup_pshufb_improved<isbase64url>(input0));
335
695k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
695k
                          lookup_pshufb_improved<isbase64url>(input1));
337
695k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
695k
                          lookup_pshufb_improved<isbase64url>(input2));
339
695k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
695k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
695k
      out += 128;
343
695k
    }
344
1.38M
  }
345
20.0k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
3.69k
    const __m128i lo =
349
3.69k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
3.69k
    const __m128i hi =
351
3.69k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
3.69k
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
3.69k
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
3.69k
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
3.69k
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
3.69k
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
3.69k
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
3.69k
    if (use_lines) {
366
1.06k
      if (line_length >= 32) { // fast path
367
565
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
565
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
565
        if (offset + 32 > line_length) {
371
192
          size_t location_end = line_length - offset;
372
192
          size_t to_move = 32 - location_end;
373
192
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
192
          out[location_end] = '\n';
375
192
          offset = to_move;
376
192
          out += 32 + 1;
377
373
        } else {
378
373
          offset += 32;
379
373
          out += 32;
380
373
        }
381
565
      } else { // slow path
382
        // could be optimized
383
499
        alignas(32) uint8_t buffer[32];
384
499
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
499
                            lookup_pshufb_improved<isbase64url>(indices));
386
499
        std::memcpy(out, buffer, 32);
387
499
        size_t out_pos = 0;
388
499
        size_t local_offset = offset;
389
16.4k
        for (size_t j = 0; j < 32;) {
390
15.9k
          if (local_offset == line_length) {
391
3.01k
            out[out_pos++] = '\n';
392
3.01k
            local_offset = 0;
393
3.01k
          }
394
15.9k
          out[out_pos++] = buffer[j++];
395
15.9k
          local_offset++;
396
15.9k
        }
397
499
        offset = local_offset;
398
499
        out += out_pos;
399
499
      }
400
2.63k
    } else {
401
2.63k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
2.63k
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
2.63k
      out += 32;
405
2.63k
    }
406
3.69k
  }
407
16.3k
  return ((char *)out - (char *)dst) +
408
16.3k
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
16.3k
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
16.3k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
558
                        size_t line_length = simdutf::default_line_length) {
157
558
  size_t offset = 0;
158
159
558
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
558
  const uint8_t *input = (const uint8_t *)src;
164
165
558
  uint8_t *out = (uint8_t *)dst;
166
558
  const __m256i shuf =
167
558
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
558
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
558
  size_t i = 0;
171
337k
  for (; i + 100 <= srclen; i += 96) {
172
336k
    const __m128i lo0 = _mm_loadu_si128(
173
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
336k
    const __m128i hi0 = _mm_loadu_si128(
175
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
336k
    const __m128i lo1 = _mm_loadu_si128(
177
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
336k
    const __m128i hi1 = _mm_loadu_si128(
179
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
336k
    const __m128i lo2 = _mm_loadu_si128(
181
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
336k
    const __m128i hi2 = _mm_loadu_si128(
183
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
336k
    const __m128i lo3 = _mm_loadu_si128(
185
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
336k
    const __m128i hi3 = _mm_loadu_si128(
187
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
336k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
336k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
336k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
336k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
336k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
336k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
336k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
336k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
336k
    const __m256i t1_0 =
200
336k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
336k
    const __m256i t1_1 =
202
336k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
336k
    const __m256i t1_2 =
204
336k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
336k
    const __m256i t1_3 =
206
336k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
336k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
336k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
336k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
336k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
336k
    const __m256i t3_0 =
214
336k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
336k
    const __m256i t3_1 =
216
336k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
336k
    const __m256i t3_2 =
218
336k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
336k
    const __m256i t3_3 =
220
336k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
336k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
336k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
336k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
336k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
336k
    if (use_lines) {
228
0
      if (line_length >= 32) { // fast path
229
0
        __m256i result;
230
0
        result = lookup_pshufb_improved<isbase64url>(input0);
231
0
        if (offset + 32 > line_length) {
232
0
          size_t location_end = line_length - offset;
233
0
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
0
          _mm256_storeu_si256(
237
0
              reinterpret_cast<__m256i *>(out),
238
0
              insert_line_feed32(result, static_cast<int>(location_end)));
239
0
          offset = to_move;
240
0
          out += 32 + 1;
241
0
        } else {
242
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
0
          offset += 32;
244
0
          out += 32;
245
0
        }
246
0
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
0
        if (offset + 32 > line_length) {
249
0
          size_t location_end = line_length - offset;
250
0
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
0
          _mm256_storeu_si256(
255
0
              reinterpret_cast<__m256i *>(out),
256
0
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
0
          offset = to_move;
260
0
          out += 32 + 1;
261
0
        } else {
262
263
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
0
          offset += 32;
266
0
          out += 32;
267
0
        }
268
0
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
0
        if (offset + 32 > line_length) {
271
0
          size_t location_end = line_length - offset;
272
0
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
0
          _mm256_storeu_si256(
277
0
              reinterpret_cast<__m256i *>(out),
278
0
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
0
          offset = to_move;
282
0
          out += 32 + 1;
283
0
        } else {
284
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
0
          offset += 32;
286
0
          out += 32;
287
0
        }
288
0
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
0
        if (offset + 32 > line_length) {
291
0
          size_t location_end = line_length - offset;
292
0
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
0
          _mm256_storeu_si256(
297
0
              reinterpret_cast<__m256i *>(out),
298
0
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
0
          offset = to_move;
302
0
          out += 32 + 1;
303
0
        } else {
304
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
0
          offset += 32;
306
0
          out += 32;
307
0
        }
308
0
      } else { // slow path
309
        // could be optimized
310
0
        uint8_t buffer[128];
311
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
0
                            lookup_pshufb_improved<isbase64url>(input0));
313
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
0
                            lookup_pshufb_improved<isbase64url>(input1));
315
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
0
                            lookup_pshufb_improved<isbase64url>(input2));
317
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
0
                            lookup_pshufb_improved<isbase64url>(input3));
319
0
        size_t out_pos = 0;
320
0
        size_t local_offset = offset;
321
0
        for (size_t j = 0; j < 128;) {
322
0
          if (local_offset == line_length) {
323
0
            out[out_pos++] = '\n';
324
0
            local_offset = 0;
325
0
          }
326
0
          out[out_pos++] = buffer[j++];
327
0
          local_offset++;
328
0
        }
329
0
        offset = local_offset;
330
0
        out += out_pos;
331
0
      }
332
336k
    } else {
333
336k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
336k
                          lookup_pshufb_improved<isbase64url>(input0));
335
336k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
336k
                          lookup_pshufb_improved<isbase64url>(input1));
337
336k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
336k
                          lookup_pshufb_improved<isbase64url>(input2));
339
336k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
336k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
336k
      out += 128;
343
336k
    }
344
336k
  }
345
1.10k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
545
    const __m128i lo =
349
545
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
545
    const __m128i hi =
351
545
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
545
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
545
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
545
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
545
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
545
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
545
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
545
    if (use_lines) {
366
0
      if (line_length >= 32) { // fast path
367
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
0
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
0
        if (offset + 32 > line_length) {
371
0
          size_t location_end = line_length - offset;
372
0
          size_t to_move = 32 - location_end;
373
0
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
0
          out[location_end] = '\n';
375
0
          offset = to_move;
376
0
          out += 32 + 1;
377
0
        } else {
378
0
          offset += 32;
379
0
          out += 32;
380
0
        }
381
0
      } else { // slow path
382
        // could be optimized
383
0
        alignas(32) uint8_t buffer[32];
384
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
0
                            lookup_pshufb_improved<isbase64url>(indices));
386
0
        std::memcpy(out, buffer, 32);
387
0
        size_t out_pos = 0;
388
0
        size_t local_offset = offset;
389
0
        for (size_t j = 0; j < 32;) {
390
0
          if (local_offset == line_length) {
391
0
            out[out_pos++] = '\n';
392
0
            local_offset = 0;
393
0
          }
394
0
          out[out_pos++] = buffer[j++];
395
0
          local_offset++;
396
0
        }
397
0
        offset = local_offset;
398
0
        out += out_pos;
399
0
      }
400
545
    } else {
401
545
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
545
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
545
      out += 32;
405
545
    }
406
545
  }
407
558
  return ((char *)out - (char *)dst) +
408
558
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
558
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
558
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
14.6k
                        size_t line_length = simdutf::default_line_length) {
157
14.6k
  size_t offset = 0;
158
159
14.6k
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
14.6k
  const uint8_t *input = (const uint8_t *)src;
164
165
14.6k
  uint8_t *out = (uint8_t *)dst;
166
14.6k
  const __m256i shuf =
167
14.6k
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
14.6k
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
14.6k
  size_t i = 0;
171
372k
  for (; i + 100 <= srclen; i += 96) {
172
358k
    const __m128i lo0 = _mm_loadu_si128(
173
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
358k
    const __m128i hi0 = _mm_loadu_si128(
175
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
358k
    const __m128i lo1 = _mm_loadu_si128(
177
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
358k
    const __m128i hi1 = _mm_loadu_si128(
179
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
358k
    const __m128i lo2 = _mm_loadu_si128(
181
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
358k
    const __m128i hi2 = _mm_loadu_si128(
183
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
358k
    const __m128i lo3 = _mm_loadu_si128(
185
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
358k
    const __m128i hi3 = _mm_loadu_si128(
187
358k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
358k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
358k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
358k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
358k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
358k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
358k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
358k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
358k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
358k
    const __m256i t1_0 =
200
358k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
358k
    const __m256i t1_1 =
202
358k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
358k
    const __m256i t1_2 =
204
358k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
358k
    const __m256i t1_3 =
206
358k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
358k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
358k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
358k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
358k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
358k
    const __m256i t3_0 =
214
358k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
358k
    const __m256i t3_1 =
216
358k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
358k
    const __m256i t3_2 =
218
358k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
358k
    const __m256i t3_3 =
220
358k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
358k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
358k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
358k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
358k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
358k
    if (use_lines) {
228
0
      if (line_length >= 32) { // fast path
229
0
        __m256i result;
230
0
        result = lookup_pshufb_improved<isbase64url>(input0);
231
0
        if (offset + 32 > line_length) {
232
0
          size_t location_end = line_length - offset;
233
0
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
0
          _mm256_storeu_si256(
237
0
              reinterpret_cast<__m256i *>(out),
238
0
              insert_line_feed32(result, static_cast<int>(location_end)));
239
0
          offset = to_move;
240
0
          out += 32 + 1;
241
0
        } else {
242
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
0
          offset += 32;
244
0
          out += 32;
245
0
        }
246
0
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
0
        if (offset + 32 > line_length) {
249
0
          size_t location_end = line_length - offset;
250
0
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
0
          _mm256_storeu_si256(
255
0
              reinterpret_cast<__m256i *>(out),
256
0
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
0
          offset = to_move;
260
0
          out += 32 + 1;
261
0
        } else {
262
263
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
0
          offset += 32;
266
0
          out += 32;
267
0
        }
268
0
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
0
        if (offset + 32 > line_length) {
271
0
          size_t location_end = line_length - offset;
272
0
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
0
          _mm256_storeu_si256(
277
0
              reinterpret_cast<__m256i *>(out),
278
0
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
0
          offset = to_move;
282
0
          out += 32 + 1;
283
0
        } else {
284
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
0
          offset += 32;
286
0
          out += 32;
287
0
        }
288
0
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
0
        if (offset + 32 > line_length) {
291
0
          size_t location_end = line_length - offset;
292
0
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
0
          _mm256_storeu_si256(
297
0
              reinterpret_cast<__m256i *>(out),
298
0
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
0
          offset = to_move;
302
0
          out += 32 + 1;
303
0
        } else {
304
0
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
0
          offset += 32;
306
0
          out += 32;
307
0
        }
308
0
      } else { // slow path
309
        // could be optimized
310
0
        uint8_t buffer[128];
311
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
0
                            lookup_pshufb_improved<isbase64url>(input0));
313
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
0
                            lookup_pshufb_improved<isbase64url>(input1));
315
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
0
                            lookup_pshufb_improved<isbase64url>(input2));
317
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
0
                            lookup_pshufb_improved<isbase64url>(input3));
319
0
        size_t out_pos = 0;
320
0
        size_t local_offset = offset;
321
0
        for (size_t j = 0; j < 128;) {
322
0
          if (local_offset == line_length) {
323
0
            out[out_pos++] = '\n';
324
0
            local_offset = 0;
325
0
          }
326
0
          out[out_pos++] = buffer[j++];
327
0
          local_offset++;
328
0
        }
329
0
        offset = local_offset;
330
0
        out += out_pos;
331
0
      }
332
358k
    } else {
333
358k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
358k
                          lookup_pshufb_improved<isbase64url>(input0));
335
358k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
358k
                          lookup_pshufb_improved<isbase64url>(input1));
337
358k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
358k
                          lookup_pshufb_improved<isbase64url>(input2));
339
358k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
358k
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
358k
      out += 128;
343
358k
    }
344
358k
  }
345
16.7k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
2.08k
    const __m128i lo =
349
2.08k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
2.08k
    const __m128i hi =
351
2.08k
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
2.08k
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
2.08k
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
2.08k
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
2.08k
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
2.08k
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
2.08k
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
2.08k
    if (use_lines) {
366
0
      if (line_length >= 32) { // fast path
367
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
0
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
0
        if (offset + 32 > line_length) {
371
0
          size_t location_end = line_length - offset;
372
0
          size_t to_move = 32 - location_end;
373
0
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
0
          out[location_end] = '\n';
375
0
          offset = to_move;
376
0
          out += 32 + 1;
377
0
        } else {
378
0
          offset += 32;
379
0
          out += 32;
380
0
        }
381
0
      } else { // slow path
382
        // could be optimized
383
0
        alignas(32) uint8_t buffer[32];
384
0
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
0
                            lookup_pshufb_improved<isbase64url>(indices));
386
0
        std::memcpy(out, buffer, 32);
387
0
        size_t out_pos = 0;
388
0
        size_t local_offset = offset;
389
0
        for (size_t j = 0; j < 32;) {
390
0
          if (local_offset == line_length) {
391
0
            out[out_pos++] = '\n';
392
0
            local_offset = 0;
393
0
          }
394
0
          out[out_pos++] = buffer[j++];
395
0
          local_offset++;
396
0
        }
397
0
        offset = local_offset;
398
0
        out += out_pos;
399
0
      }
400
2.08k
    } else {
401
2.08k
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
2.08k
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
2.08k
      out += 32;
405
2.08k
    }
406
2.08k
  }
407
14.6k
  return ((char *)out - (char *)dst) +
408
14.6k
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
14.6k
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
14.6k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
558
                        size_t line_length = simdutf::default_line_length) {
157
558
  size_t offset = 0;
158
159
558
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
558
  const uint8_t *input = (const uint8_t *)src;
164
165
558
  uint8_t *out = (uint8_t *)dst;
166
558
  const __m256i shuf =
167
558
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
558
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
558
  size_t i = 0;
171
337k
  for (; i + 100 <= srclen; i += 96) {
172
336k
    const __m128i lo0 = _mm_loadu_si128(
173
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
336k
    const __m128i hi0 = _mm_loadu_si128(
175
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
336k
    const __m128i lo1 = _mm_loadu_si128(
177
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
336k
    const __m128i hi1 = _mm_loadu_si128(
179
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
336k
    const __m128i lo2 = _mm_loadu_si128(
181
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
336k
    const __m128i hi2 = _mm_loadu_si128(
183
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
336k
    const __m128i lo3 = _mm_loadu_si128(
185
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
336k
    const __m128i hi3 = _mm_loadu_si128(
187
336k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
336k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
336k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
336k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
336k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
336k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
336k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
336k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
336k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
336k
    const __m256i t1_0 =
200
336k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
336k
    const __m256i t1_1 =
202
336k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
336k
    const __m256i t1_2 =
204
336k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
336k
    const __m256i t1_3 =
206
336k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
336k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
336k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
336k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
336k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
336k
    const __m256i t3_0 =
214
336k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
336k
    const __m256i t3_1 =
216
336k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
336k
    const __m256i t3_2 =
218
336k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
336k
    const __m256i t3_3 =
220
336k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
336k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
336k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
336k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
336k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
336k
    if (use_lines) {
228
336k
      if (line_length >= 32) { // fast path
229
59.8k
        __m256i result;
230
59.8k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
59.8k
        if (offset + 32 > line_length) {
232
20.6k
          size_t location_end = line_length - offset;
233
20.6k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
20.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
20.6k
          _mm256_storeu_si256(
237
20.6k
              reinterpret_cast<__m256i *>(out),
238
20.6k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
20.6k
          offset = to_move;
240
20.6k
          out += 32 + 1;
241
39.2k
        } else {
242
39.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
39.2k
          offset += 32;
244
39.2k
          out += 32;
245
39.2k
        }
246
59.8k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
59.8k
        if (offset + 32 > line_length) {
249
19.4k
          size_t location_end = line_length - offset;
250
19.4k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
19.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
19.4k
          _mm256_storeu_si256(
255
19.4k
              reinterpret_cast<__m256i *>(out),
256
19.4k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
19.4k
          offset = to_move;
260
19.4k
          out += 32 + 1;
261
40.3k
        } else {
262
263
40.3k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
40.3k
          offset += 32;
266
40.3k
          out += 32;
267
40.3k
        }
268
59.8k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
59.8k
        if (offset + 32 > line_length) {
271
19.8k
          size_t location_end = line_length - offset;
272
19.8k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
19.8k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
19.8k
          _mm256_storeu_si256(
277
19.8k
              reinterpret_cast<__m256i *>(out),
278
19.8k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
19.8k
          offset = to_move;
282
19.8k
          out += 32 + 1;
283
39.9k
        } else {
284
39.9k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
39.9k
          offset += 32;
286
39.9k
          out += 32;
287
39.9k
        }
288
59.8k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
59.8k
        if (offset + 32 > line_length) {
291
19.6k
          size_t location_end = line_length - offset;
292
19.6k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
19.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
19.6k
          _mm256_storeu_si256(
297
19.6k
              reinterpret_cast<__m256i *>(out),
298
19.6k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
19.6k
          offset = to_move;
302
19.6k
          out += 32 + 1;
303
40.2k
        } else {
304
40.2k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
40.2k
          offset += 32;
306
40.2k
          out += 32;
307
40.2k
        }
308
277k
      } else { // slow path
309
        // could be optimized
310
277k
        uint8_t buffer[128];
311
277k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
277k
                            lookup_pshufb_improved<isbase64url>(input0));
313
277k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
277k
                            lookup_pshufb_improved<isbase64url>(input1));
315
277k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
277k
                            lookup_pshufb_improved<isbase64url>(input2));
317
277k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
277k
                            lookup_pshufb_improved<isbase64url>(input3));
319
277k
        size_t out_pos = 0;
320
277k
        size_t local_offset = offset;
321
35.7M
        for (size_t j = 0; j < 128;) {
322
35.4M
          if (local_offset == line_length) {
323
7.80M
            out[out_pos++] = '\n';
324
7.80M
            local_offset = 0;
325
7.80M
          }
326
35.4M
          out[out_pos++] = buffer[j++];
327
35.4M
          local_offset++;
328
35.4M
        }
329
277k
        offset = local_offset;
330
277k
        out += out_pos;
331
277k
      }
332
336k
    } else {
333
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
0
                          lookup_pshufb_improved<isbase64url>(input0));
335
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
0
                          lookup_pshufb_improved<isbase64url>(input1));
337
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
0
                          lookup_pshufb_improved<isbase64url>(input2));
339
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
0
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
0
      out += 128;
343
0
    }
344
336k
  }
345
1.10k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
545
    const __m128i lo =
349
545
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
545
    const __m128i hi =
351
545
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
545
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
545
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
545
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
545
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
545
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
545
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
545
    if (use_lines) {
366
545
      if (line_length >= 32) { // fast path
367
289
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
289
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
289
        if (offset + 32 > line_length) {
371
102
          size_t location_end = line_length - offset;
372
102
          size_t to_move = 32 - location_end;
373
102
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
102
          out[location_end] = '\n';
375
102
          offset = to_move;
376
102
          out += 32 + 1;
377
187
        } else {
378
187
          offset += 32;
379
187
          out += 32;
380
187
        }
381
289
      } else { // slow path
382
        // could be optimized
383
256
        alignas(32) uint8_t buffer[32];
384
256
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
256
                            lookup_pshufb_improved<isbase64url>(indices));
386
256
        std::memcpy(out, buffer, 32);
387
256
        size_t out_pos = 0;
388
256
        size_t local_offset = offset;
389
8.44k
        for (size_t j = 0; j < 32;) {
390
8.19k
          if (local_offset == line_length) {
391
1.46k
            out[out_pos++] = '\n';
392
1.46k
            local_offset = 0;
393
1.46k
          }
394
8.19k
          out[out_pos++] = buffer[j++];
395
8.19k
          local_offset++;
396
8.19k
        }
397
256
        offset = local_offset;
398
256
        out += out_pos;
399
256
      }
400
545
    } else {
401
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
0
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
0
      out += 32;
405
0
    }
406
545
  }
407
558
  return ((char *)out - (char *)dst) +
408
558
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
558
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
558
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long)
Line
Count
Source
156
558
                        size_t line_length = simdutf::default_line_length) {
157
558
  size_t offset = 0;
158
159
558
  if (line_length < 4) {
160
0
    line_length = 4; // We do not support line_length less than 4
161
0
  }
162
  // credit: Wojciech Muła
163
558
  const uint8_t *input = (const uint8_t *)src;
164
165
558
  uint8_t *out = (uint8_t *)dst;
166
558
  const __m256i shuf =
167
558
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
168
169
558
                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
170
558
  size_t i = 0;
171
356k
  for (; i + 100 <= srclen; i += 96) {
172
356k
    const __m128i lo0 = _mm_loadu_si128(
173
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
174
356k
    const __m128i hi0 = _mm_loadu_si128(
175
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
176
356k
    const __m128i lo1 = _mm_loadu_si128(
177
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
178
356k
    const __m128i hi1 = _mm_loadu_si128(
179
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
180
356k
    const __m128i lo2 = _mm_loadu_si128(
181
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
182
356k
    const __m128i hi2 = _mm_loadu_si128(
183
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
184
356k
    const __m128i lo3 = _mm_loadu_si128(
185
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
186
356k
    const __m128i hi3 = _mm_loadu_si128(
187
356k
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
188
189
356k
    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
190
356k
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
191
356k
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
192
356k
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
193
194
356k
    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
195
356k
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
196
356k
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
197
356k
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
198
199
356k
    const __m256i t1_0 =
200
356k
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
201
356k
    const __m256i t1_1 =
202
356k
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
203
356k
    const __m256i t1_2 =
204
356k
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
205
356k
    const __m256i t1_3 =
206
356k
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
207
208
356k
    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
209
356k
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
210
356k
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
211
356k
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
212
213
356k
    const __m256i t3_0 =
214
356k
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
215
356k
    const __m256i t3_1 =
216
356k
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
217
356k
    const __m256i t3_2 =
218
356k
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
219
356k
    const __m256i t3_3 =
220
356k
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
221
222
356k
    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
223
356k
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
224
356k
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
225
356k
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
226
227
356k
    if (use_lines) {
228
356k
      if (line_length >= 32) { // fast path
229
82.0k
        __m256i result;
230
82.0k
        result = lookup_pshufb_improved<isbase64url>(input0);
231
82.0k
        if (offset + 32 > line_length) {
232
28.6k
          size_t location_end = line_length - offset;
233
28.6k
          size_t to_move = 32 - location_end;
234
          // We could do this, or extract instead.
235
28.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
236
28.6k
          _mm256_storeu_si256(
237
28.6k
              reinterpret_cast<__m256i *>(out),
238
28.6k
              insert_line_feed32(result, static_cast<int>(location_end)));
239
28.6k
          offset = to_move;
240
28.6k
          out += 32 + 1;
241
53.4k
        } else {
242
53.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
243
53.4k
          offset += 32;
244
53.4k
          out += 32;
245
53.4k
        }
246
82.0k
        result = lookup_pshufb_improved<isbase64url>(input1);
247
248
82.0k
        if (offset + 32 > line_length) {
249
28.3k
          size_t location_end = line_length - offset;
250
28.3k
          size_t to_move = 32 - location_end;
251
252
          // We could do this, or extract instead.
253
28.3k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
254
28.3k
          _mm256_storeu_si256(
255
28.3k
              reinterpret_cast<__m256i *>(out),
256
28.3k
              insert_line_feed32(result, static_cast<int>(location_end)));
257
          // see above.
258
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
259
28.3k
          offset = to_move;
260
28.3k
          out += 32 + 1;
261
53.7k
        } else {
262
263
53.7k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
264
265
53.7k
          offset += 32;
266
53.7k
          out += 32;
267
53.7k
        }
268
82.0k
        result = lookup_pshufb_improved<isbase64url>(input2);
269
270
82.0k
        if (offset + 32 > line_length) {
271
28.6k
          size_t location_end = line_length - offset;
272
28.6k
          size_t to_move = 32 - location_end;
273
274
          // We could do this, or extract instead.
275
28.6k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
276
28.6k
          _mm256_storeu_si256(
277
28.6k
              reinterpret_cast<__m256i *>(out),
278
28.6k
              insert_line_feed32(result, static_cast<int>(location_end)));
279
          // see above.
280
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
281
28.6k
          offset = to_move;
282
28.6k
          out += 32 + 1;
283
53.4k
        } else {
284
53.4k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
285
53.4k
          offset += 32;
286
53.4k
          out += 32;
287
53.4k
        }
288
82.0k
        result = lookup_pshufb_improved<isbase64url>(input3);
289
290
82.0k
        if (offset + 32 > line_length) {
291
28.3k
          size_t location_end = line_length - offset;
292
28.3k
          size_t to_move = 32 - location_end;
293
294
          // We could do this, or extract instead.
295
28.3k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
296
28.3k
          _mm256_storeu_si256(
297
28.3k
              reinterpret_cast<__m256i *>(out),
298
28.3k
              insert_line_feed32(result, static_cast<int>(location_end)));
299
          // see above.
300
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
301
28.3k
          offset = to_move;
302
28.3k
          out += 32 + 1;
303
53.7k
        } else {
304
53.7k
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
305
53.7k
          offset += 32;
306
53.7k
          out += 32;
307
53.7k
        }
308
274k
      } else { // slow path
309
        // could be optimized
310
274k
        uint8_t buffer[128];
311
274k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
312
274k
                            lookup_pshufb_improved<isbase64url>(input0));
313
274k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
314
274k
                            lookup_pshufb_improved<isbase64url>(input1));
315
274k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
316
274k
                            lookup_pshufb_improved<isbase64url>(input2));
317
274k
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
318
274k
                            lookup_pshufb_improved<isbase64url>(input3));
319
274k
        size_t out_pos = 0;
320
274k
        size_t local_offset = offset;
321
35.3M
        for (size_t j = 0; j < 128;) {
322
35.1M
          if (local_offset == line_length) {
323
8.47M
            out[out_pos++] = '\n';
324
8.47M
            local_offset = 0;
325
8.47M
          }
326
35.1M
          out[out_pos++] = buffer[j++];
327
35.1M
          local_offset++;
328
35.1M
        }
329
274k
        offset = local_offset;
330
274k
        out += out_pos;
331
274k
      }
332
356k
    } else {
333
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
334
0
                          lookup_pshufb_improved<isbase64url>(input0));
335
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
336
0
                          lookup_pshufb_improved<isbase64url>(input1));
337
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
338
0
                          lookup_pshufb_improved<isbase64url>(input2));
339
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
340
0
                          lookup_pshufb_improved<isbase64url>(input3));
341
342
0
      out += 128;
343
0
    }
344
356k
  }
345
1.07k
  for (; i + 28 <= srclen; i += 24) {
346
    // lo = [xxxx|DDDC|CCBB|BAAA]
347
    // hi = [xxxx|HHHG|GGFF|FEEE]
348
519
    const __m128i lo =
349
519
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
350
519
    const __m128i hi =
351
519
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
352
353
    // bytes from groups A, B and C are needed in separate 32-bit lanes
354
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
355
519
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
356
357
    // this part is well commented in encode.sse.cpp
358
359
519
    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
360
519
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
361
519
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
362
519
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
363
519
    const __m256i indices = _mm256_or_si256(t1, t3);
364
365
519
    if (use_lines) {
366
519
      if (line_length >= 32) { // fast path
367
276
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
368
276
                            lookup_pshufb_improved<isbase64url>(indices));
369
370
276
        if (offset + 32 > line_length) {
371
90
          size_t location_end = line_length - offset;
372
90
          size_t to_move = 32 - location_end;
373
90
          std::memmove(out + location_end + 1, out + location_end, to_move);
374
90
          out[location_end] = '\n';
375
90
          offset = to_move;
376
90
          out += 32 + 1;
377
186
        } else {
378
186
          offset += 32;
379
186
          out += 32;
380
186
        }
381
276
      } else { // slow path
382
        // could be optimized
383
243
        alignas(32) uint8_t buffer[32];
384
243
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
385
243
                            lookup_pshufb_improved<isbase64url>(indices));
386
243
        std::memcpy(out, buffer, 32);
387
243
        size_t out_pos = 0;
388
243
        size_t local_offset = offset;
389
8.01k
        for (size_t j = 0; j < 32;) {
390
7.77k
          if (local_offset == line_length) {
391
1.55k
            out[out_pos++] = '\n';
392
1.55k
            local_offset = 0;
393
1.55k
          }
394
7.77k
          out[out_pos++] = buffer[j++];
395
7.77k
          local_offset++;
396
7.77k
        }
397
243
        offset = local_offset;
398
243
        out += out_pos;
399
243
      }
400
519
    } else {
401
0
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
402
0
                          lookup_pshufb_improved<isbase64url>(indices));
403
404
0
      out += 32;
405
0
    }
406
519
  }
407
558
  return ((char *)out - (char *)dst) +
408
558
         scalar::base64::tail_encode_base64_impl<use_lines>(
409
558
             (char *)out, src + i, srclen - i, options, line_length, offset);
410
558
}
411
412
// Encodes `srclen` bytes at `src` into `dst` by forwarding to
// avx2_encode_base64_impl with its second template argument set to false and
// without passing a line length, so that parameter keeps its default value.
// NOTE(review): the second template argument appears to be `use_lines`
// (line-feed insertion) -- confirm against the declaration of the impl.
// Returns the implementation's result unchanged.
template <bool isbase64url>
413
size_t encode_base64(char *dst, const char *src, size_t srclen,
414
15.2k
                     base64_options options) {
415
15.2k
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
15.2k
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<true>(char*, char const*, unsigned long, simdutf::base64_options)
Line
Count
Source
414
558
                     base64_options options) {
415
558
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
558
}
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<false>(char*, char const*, unsigned long, simdutf::base64_options)
Line
Count
Source
414
14.6k
                     base64_options options) {
415
14.6k
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
416
14.6k
}
417
418
411k
// Compacts the 16 bytes of `data` according to `mask` and stores the result
// at `output`.
//
// A full 16-byte vector is always stored, whatever the mask is, so `output`
// must have 16 writable bytes.
// NOTE(review): set bits in `mask` appear to mark the bytes to drop (the
// 256-bit overload advances its output by the number of *clear* bits) --
// confirm against the tables in tables::base64.
static inline void compress(__m128i data, uint16_t mask, char *output) {
419
411k
  if (mask == 0) {
420
22.1k
    // Fast path: store the vector unchanged.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
421
22.1k
    return;
422
22.1k
  }
423
  // this particular implementation was inspired by work done by @animetosho
424
  // we do it in two steps, first 8 bytes and then second 8 bytes
425
389k
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
426
389k
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
427
  // next line just loads the 64-bit values thintable_epi8[mask1] and
428
  // thintable_epi8[mask2] into a 128-bit register, using only
429
  // two instructions on most compilers.
430
431
389k
  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
432
389k
                                    tables::base64::thintable_epi8[mask1]);
433
  // we increment by 0x08 the second half of the mask
  // (the upper 8 index bytes get +8 so that they address bytes 8..15 of
  // `data`; the lower 8 index bytes get +0)
434
389k
  shufmask =
435
389k
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
436
  // this is the version "nearly pruned"
437
389k
  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
438
  // we still need to put the two halves together.
439
  // we compute the popcount of the first half:
  // NOTE(review): the table name suggests the stored value is already the
  // popcount times two; together with the `* 8` below that would give a
  // 16-byte stride into pshufb_combine_table -- confirm.
440
389k
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
441
  // then load the corresponding mask, what it does is to write
442
  // only the first pop1 bytes from the first 8 bytes, and then
443
  // it fills in with the bytes from the second 8 bytes + some filling
444
  // at the end.
445
389k
  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
446
389k
      tables::base64::pshufb_combine_table + pop1 * 8));
447
389k
  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
448
449
389k
  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
450
389k
}
451
452
// --- decoding -----------------------------------------------
453
454
// 256-bit variant of compress(): handles the vector as two 128-bit halves.
//
// When `mask` is zero the 32 bytes are stored unchanged. Otherwise the low
// half is compressed to `output` using mask bits 0..15, and the high half is
// compressed using mask bits 16..31 to `output` advanced by the number of
// clear bits among mask bits 0..15.
// Each 128-bit call stores a full 16 bytes, so the second store can touch
// bytes beyond the compacted length.
template <typename = void>
455
217k
simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) {
456
217k
  if (mask == 0) {
457
12.0k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
458
12.0k
    return;
459
12.0k
  }
460
205k
  compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
461
205k
  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
462
205k
           // count of clear bits in the low 16 mask bits
           output + count_ones(~mask & 0xFFFF));
463
205k
}
464
465
// Packs the 32 bytes of `str` into 24 output bytes and stores them at `out`.
// NOTE(review): `str` is presumably already translated to 6-bit values
// (0..63), one per byte -- confirm against the callers.
//
// The two 16-byte stores below cover out[0..16) and out[12..28), so `out`
// needs 28 writable bytes; the last four written bytes are zero because the
// shuffle uses index -1 for them.
template <typename = void>
466
2.92M
simdutf_really_inline void base64_decode(char *out, __m256i str) {
467
  // credit: aqrit
468
2.92M
  // Per 128-bit lane: take bytes 2,1,0 of every 32-bit word (byte-reversing
  // each 3-byte group) and zero the remaining four bytes (-1).
  const __m256i pack_shuffle =
469
2.92M
      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
470
2.92M
                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
471
2.92M
  // Adjacent byte pairs -> 16-bit: byte[2k] * 0x40 + byte[2k+1].
  const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
472
2.92M
  // Adjacent 16-bit pairs -> 32-bit: word[2k] * 0x1000 + word[2k+1].
  const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
473
2.92M
  const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
474
475
  // Store the output:
476
2.92M
  _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
477
2.92M
  // The second store overlaps the four zero bytes written by the first one.
  _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
478
2.92M
}
479
480
// Decodes the 64 bytes at `src` (two unaligned 32-byte loads) with
// base64_decode, writing the first half at `out` and the second at `out + 24`.
// Because base64_decode stores 28 bytes per call, this writes up to
// out[0..52); base64_decode_block_safe goes through a local buffer instead.
template <typename = void>
481
61.4k
simdutf_really_inline void base64_decode_block(char *out, const char *src) {
482
61.4k
  base64_decode(out,
483
61.4k
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
484
61.4k
  base64_decode(out + 24, _mm256_loadu_si256(
485
61.4k
                              reinterpret_cast<const __m256i *>(src + 32)));
486
61.4k
}
487
488
// Same decoding as base64_decode_block(out, src), but the second half is
// decoded into a local 32-byte buffer and only 24 bytes of it are copied to
// `out + 24`. All writes to `out` therefore stay within out[0..48): the
// first call touches out[0..28) and the memcpy touches out[24..48).
template <typename = void>
489
simdutf_really_inline void base64_decode_block_safe(char *out,
490
108
                                                    const char *src) {
491
108
  base64_decode(out,
492
108
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
493
108
  alignas(32) char buffer[32]; // We enforce safety with a buffer.
494
108
  base64_decode(
495
108
      buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
496
108
  // Copy only the 24 meaningful bytes; the trailing bytes stay in `buffer`.
  std::memcpy(out + 24, buffer, 24);
497
108
}
498
499
// --- decoding - base64 class --------------------------------
500
501
class block64 {
502
  __m256i chunks[2];
503
504
public:
505
  // Loads 64 consecutive bytes from src into chunks[0] and chunks[1] using
  // two unaligned 32-byte loads.
  // The caller of this function is responsible to ensure that there are 64
506
  // bytes available from reading at src.
507
1.49M
  simdutf_really_inline block64(const char *src) {
508
1.49M
    chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
509
1.49M
    chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
510
1.49M
  }
511
512
  // Loads 64 char16_t code units from src and narrows them to 64 bytes held
  // in chunks[0] and chunks[1], preserving the input order.
  // The narrowing uses _mm256_packus_epi16, i.e. signed 16-bit to unsigned
  // 8-bit with saturation, so code units that do not fit in a byte are
  // clamped rather than truncated.
  // The caller of this function is responsible to ensure that there are 128
513
  // bytes available from reading at src.
514
63.3k
  simdutf_really_inline block64(const char16_t *src) {
515
63.3k
    const auto m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
516
63.3k
    const auto m2 =
517
63.3k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
518
63.3k
    const auto m3 =
519
63.3k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
520
63.3k
    const auto m4 =
521
63.3k
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
522
523
63.3k
    // packus works independently on each 128-bit lane, so regroup first:
    // 0x20 gathers the low lanes of both inputs, 0x31 the high lanes. After
    // packing, each result lane then holds 16 consecutive input units.
    const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
524
63.3k
    const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
525
63.3k
    const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
526
63.3k
    const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
527
528
63.3k
    chunks[0] = _mm256_packus_epi16(m1p, m2p);
529
63.3k
    chunks[1] = _mm256_packus_epi16(m3p, m4p);
530
63.3k
  }
531
532
15.8k
  // Stores the current contents of chunks[0] and chunks[1] (64 bytes in
  // total) to `output` with two unaligned 32-byte stores.
  simdutf_really_inline void copy_block(char *output) {
533
15.8k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]);
534
15.8k
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]);
535
15.8k
  }
536
537
  // decode 64 bytes and output 48 bytes
  // Runs base64_decode on chunks[0] (to `out`) and chunks[1] (to `out + 24`).
  // base64_decode's second store is 16 bytes wide at offset 12, so the last
  // store here reaches 4 bytes past out + 48; base64_decode_block_safe avoids
  // that by going through a local buffer.
538
1.39M
  simdutf_really_inline void base64_decode_block(char *out) {
539
1.39M
    base64_decode(out, chunks[0]);
540
1.39M
    base64_decode(out + 24, chunks[1]);
541
1.39M
  }
542
543
1.33k
  // Same decoding as base64_decode_block(out), but chunks[1] is decoded into
  // a local 32-byte buffer and only 24 bytes of it are copied to `out + 24`,
  // so nothing is written at or beyond out + 48.
  simdutf_really_inline void base64_decode_block_safe(char *out) {
544
1.33k
    base64_decode(out, chunks[0]);
545
1.33k
    alignas(32) char buffer[32]; // We enforce safety with a buffer.
546
1.33k
    base64_decode(buffer, chunks[1]);
547
1.33k
    std::memcpy(out + 24, buffer, 24);
548
1.33k
  }
549
550
  // Runs the per-vector to_base64_mask on chunks[0] and chunks[1] (both are
  // modified in place by that overload) and combines the two 32-bit results
  // into one 64-bit mask: chunks[0] in bits 0..31, chunks[1] in bits 32..63.
  // When ignore_garbage is false, *error receives the two per-vector error
  // words combined the same way; when it is true, *error is left untouched.
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
551
1.55M
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
552
1.55M
    // Zero-initialised because the per-vector overload only writes its error
    // word when it flags at least one byte.
    uint32_t err0 = 0;
553
1.55M
    uint32_t err1 = 0;
554
1.55M
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
1.55M
        &chunks[0], &err0);
556
1.55M
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
1.55M
        &chunks[1], &err1);
558
1.55M
    if (!ignore_garbage) {
559
1.55M
      *error = err0 | ((uint64_t)err1 << 32);
560
1.55M
    }
561
1.55M
    return m0 | (m1 << 32);
562
1.55M
  }
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(unsigned long*)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(unsigned long*)
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(unsigned long*)
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(unsigned long*)
Line
Count
Source
551
737k
  // Instantiation <true, false, false>: runs the per-vector to_base64_mask on
  // chunks[0] and chunks[1] and combines the two 32-bit results into one
  // 64-bit mask (chunks[0] in bits 0..31, chunks[1] in bits 32..63). *error
  // is written only when ignore_garbage is false.
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
552
737k
    uint32_t err0 = 0;
553
737k
    uint32_t err1 = 0;
554
737k
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
737k
        &chunks[0], &err0);
556
737k
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
737k
        &chunks[1], &err1);
558
737k
    if (!ignore_garbage) {
559
737k
      *error = err0 | ((uint64_t)err1 << 32);
560
737k
    }
561
737k
    return m0 | (m1 << 32);
562
737k
  }
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, false>(unsigned long*)
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, false>(unsigned long*)
Line
Count
Source
551
816k
  // Instantiation <false, false, false>: runs the per-vector to_base64_mask
  // on chunks[0] and chunks[1] and combines the two 32-bit results into one
  // 64-bit mask (chunks[0] in bits 0..31, chunks[1] in bits 32..63). *error
  // is written only when ignore_garbage is false.
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
552
816k
    uint32_t err0 = 0;
553
816k
    uint32_t err1 = 0;
554
816k
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
555
816k
        &chunks[0], &err0);
556
816k
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
557
816k
        &chunks[1], &err1);
558
816k
    if (!ignore_garbage) {
559
816k
      *error = err0 | ((uint64_t)err1 << 32);
560
816k
    }
561
816k
    return m0 | (m1 << 32);
562
816k
  }
563
564
  // Classifies and translates one 32-byte vector in place.
  //
  // The three template flags are compile-time constants, so every
  // `if (default_or_url) / else if (base64_url) / else` chain below selects
  // one set of lookup tables per instantiation.
  //
  // On return *src holds `out` (each input byte plus a looked-up delta, using
  // a saturating signed add) and the result is the byte mask of `chk`: one
  // bit per input byte, set where the check value has its top bit set.
  // NOTE(review): callers appear to treat set bits as "not a base64
  // character, to be removed" -- confirm against the decode loop.
  //
  // *error is written only when ignore_garbage is false and the mask is
  // non-zero; it then receives the mask XOR the mask of bytes equal to space,
  // \t, \n, \f or \r. Callers must initialise *error themselves (the 64-byte
  // overload passes zero-initialised locals).
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
565
3.10M
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
566
3.10M
    // Indexed by the low nibble of a byte: slots 0, 9, 0xa, 0xc and 0xd hold
    // 0x20, \t, \n, \f and \r, every other slot holds 0.
    const __m256i ascii_space_tbl =
567
3.10M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
3.10M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
3.10M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
3.10M
    // Per-nibble values that are averaged with (src >> 3) further down to
    // form the index into delta_values.
    __m256i delta_asso;
572
3.10M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
3.10M
    } else if (base64_url) {
578
1.47M
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
1.47M
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
1.47M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
1.47M
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.63M
    } else {
583
1.63M
      delta_asso = _mm256_setr_epi8(
584
1.63M
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
1.63M
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
1.63M
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
1.63M
    }
588
589
3.10M
    // Deltas that are added to the input bytes to produce the value stored
    // back into *src.
    __m256i delta_values;
590
3.10M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
3.10M
    } else if (base64_url) {
601
1.47M
      delta_values = _mm256_setr_epi8(
602
1.47M
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
1.47M
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
1.47M
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
1.47M
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
1.47M
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.63M
    } else {
608
1.63M
      delta_values = _mm256_setr_epi8(
609
1.63M
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
1.63M
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
1.63M
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
1.63M
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
1.63M
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
1.63M
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
1.63M
          int8_t(0xB9), int8_t(0xB9));
616
1.63M
    }
617
618
3.10M
    // Per-nibble values that are averaged with (src >> 3) further down to
    // form the index into check_values.
    __m256i check_asso;
619
3.10M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
3.10M
    } else if (base64_url) {
625
1.47M
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
1.47M
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
1.47M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
1.47M
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.63M
    } else {
630
1.63M
      check_asso = _mm256_setr_epi8(
631
1.63M
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
1.63M
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
1.63M
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
1.63M
    }
635
3.10M
    // Values that are added to the input bytes; the top bit of each sum
    // becomes the corresponding bit of the returned mask.
    __m256i check_values;
636
3.10M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
3.10M
    } else if (base64_url) {
647
1.47M
      check_values = _mm256_setr_epi8(
648
1.47M
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
1.47M
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
1.47M
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
1.47M
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
1.47M
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
1.47M
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
1.47M
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.63M
    } else {
656
1.63M
      check_values = _mm256_setr_epi8(
657
1.63M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
1.63M
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
1.63M
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
1.63M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
1.63M
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
1.63M
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
1.63M
          int8_t(0x91), int8_t(0x80));
664
1.63M
    }
665
3.10M
    // The shift is done on 32-bit elements, so the top three bits of each
    // byte come from the neighbouring higher byte of the same element.
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
3.10M
    __m256i delta_hash =
667
3.10M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
3.10M
    if (default_or_url) {
669
0
      // Keep only the low nibble so the hash is a plain 0..15 table index.
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
3.10M
    const __m256i check_hash =
672
3.10M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
673
3.10M
    const __m256i out =
674
3.10M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
3.10M
    const __m256i chk =
676
3.10M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
3.10M
    // One bit per byte: the top bit of the corresponding byte of chk.
    const int mask = _mm256_movemask_epi8(chk);
678
3.10M
    if (!ignore_garbage && mask) {
679
237k
      // A byte matches when it equals the table entry selected by its own
      // low nibble, i.e. when it is 0x20, \t, \n, \f or \r.
      __m256i ascii_space =
680
237k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
237k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
237k
    }
683
3.10M
    // Overwrite the caller's vector with the translated bytes.
    *src = out;
684
3.10M
    return (uint32_t)mask;
685
3.10M
  }
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(long long __vector(4)*, unsigned int*)
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(long long __vector(4)*, unsigned int*)
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(long long __vector(4)*, unsigned int*)
simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(long long __vector(4)*, unsigned int*)
Line
Count
Source
565
1.47M
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
566
1.47M
    const __m256i ascii_space_tbl =
567
1.47M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
1.47M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
1.47M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
1.47M
    __m256i delta_asso;
572
1.47M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
1.47M
    } else if (base64_url) {
578
1.47M
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
1.47M
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
1.47M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
1.47M
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.47M
    } else {
583
0
      delta_asso = _mm256_setr_epi8(
584
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
0
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
0
    }
588
589
1.47M
    __m256i delta_values;
590
1.47M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
1.47M
    } else if (base64_url) {
601
1.47M
      delta_values = _mm256_setr_epi8(
602
1.47M
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
1.47M
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
1.47M
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
1.47M
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
1.47M
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.47M
    } else {
608
0
      delta_values = _mm256_setr_epi8(
609
0
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
0
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
0
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
0
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
0
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
0
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
0
          int8_t(0xB9), int8_t(0xB9));
616
0
    }
617
618
1.47M
    __m256i check_asso;
619
1.47M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
1.47M
    } else if (base64_url) {
625
1.47M
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
1.47M
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
1.47M
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
1.47M
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.47M
    } else {
630
0
      check_asso = _mm256_setr_epi8(
631
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
0
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
0
    }
635
1.47M
    __m256i check_values;
636
1.47M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
1.47M
    } else if (base64_url) {
647
1.47M
      check_values = _mm256_setr_epi8(
648
1.47M
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
1.47M
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
1.47M
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
1.47M
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
1.47M
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
1.47M
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
1.47M
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.47M
    } else {
656
0
      check_values = _mm256_setr_epi8(
657
0
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
0
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
0
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
0
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
0
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
0
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
0
          int8_t(0x91), int8_t(0x80));
664
0
    }
665
1.47M
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
1.47M
    __m256i delta_hash =
667
1.47M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
1.47M
    if (default_or_url) {
669
0
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
1.47M
    const __m256i check_hash =
672
1.47M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
673
1.47M
    const __m256i out =
674
1.47M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
1.47M
    const __m256i chk =
676
1.47M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
1.47M
    const int mask = _mm256_movemask_epi8(chk);
678
1.47M
    if (!ignore_garbage && mask) {
679
78.7k
      __m256i ascii_space =
680
78.7k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
78.7k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
78.7k
    }
683
1.47M
    *src = out;
684
1.47M
    return (uint32_t)mask;
685
1.47M
  }
Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, false>(long long __vector(4)*, unsigned int*)
simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, false>(long long __vector(4)*, unsigned int*)
Line
Count
Source
565
1.63M
  // Translates the 32 bytes at *src in place through shuffle-table lookups and
  // returns a 32-bit mask of the bytes flagged by the check lookup.
  //
  // The compile-time flags default_or_url, base64_url and ignore_garbage are
  // declared outside this listing; they pick one table set per lookup and
  // decide whether *error is computed.
  //
  // src:   in/out. On return holds the input bytes plus a per-byte delta taken
  //        from delta_values (signed saturating add). Presumably this maps
  //        valid base64 characters to their 6-bit values -- confirm against
  //        the callers.
  // error: written only when ignore_garbage is false and the returned mask is
  //        non-zero; otherwise left untouched. Receives the mask XORed with
  //        the positions of bytes equal to ' ', '\t', '\n', '\f' or '\r'.
  // Returns the mask itself: bit i is the most significant bit of byte i of
  // the check vector.
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
566
1.63M
    // Table indexed by the low nibble of a byte (same contents in both 128-bit
    // lanes): 0x20, 0x09, 0x0a, 0x0c and 0x0d sit at the index equal to their
    // own low nibble, every other entry is zero.
    const __m256i ascii_space_tbl =
567
1.63M
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
568
1.63M
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
569
1.63M
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
570
    // credit: aqrit
571
1.63M
    __m256i delta_asso;
572
1.63M
    if (default_or_url) {
573
0
      delta_asso = _mm256_setr_epi8(
574
0
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
575
0
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
576
0
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
577
1.63M
    } else if (base64_url) {
578
0
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
579
0
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
580
0
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
581
0
                                    0x0, 0x0, 0xF, 0x0, 0xF);
582
1.63M
    } else {
583
1.63M
      delta_asso = _mm256_setr_epi8(
584
1.63M
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
585
1.63M
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
586
1.63M
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
587
1.63M
    }
588
589
1.63M
    __m256i delta_values;
590
1.63M
    if (default_or_url) {
591
0
      delta_values = _mm256_setr_epi8(
592
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
593
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
594
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
595
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
596
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
597
0
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
598
0
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
599
0
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
600
1.63M
    } else if (base64_url) {
601
0
      delta_values = _mm256_setr_epi8(
602
0
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
603
0
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
604
0
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
605
0
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
606
0
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
607
1.63M
    } else {
608
1.63M
      delta_values = _mm256_setr_epi8(
609
1.63M
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
610
1.63M
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
611
1.63M
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
612
1.63M
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
613
1.63M
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
614
1.63M
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
615
1.63M
          int8_t(0xB9), int8_t(0xB9));
616
1.63M
    }
617
618
1.63M
    __m256i check_asso;
619
1.63M
    if (default_or_url) {
620
0
      check_asso = _mm256_setr_epi8(
621
0
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
622
0
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
623
0
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
624
1.63M
    } else if (base64_url) {
625
0
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
626
0
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
627
0
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
628
0
                                    0x7, 0xB, 0xE, 0xB, 0x6);
629
1.63M
    } else {
630
1.63M
      check_asso = _mm256_setr_epi8(
631
1.63M
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
632
1.63M
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
633
1.63M
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
634
1.63M
    }
635
1.63M
    __m256i check_values;
636
1.63M
    if (default_or_url) {
637
0
      check_values = _mm256_setr_epi8(
638
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
639
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
640
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
641
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
642
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
643
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
644
0
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
645
0
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
646
1.63M
    } else if (base64_url) {
647
0
      check_values = _mm256_setr_epi8(
648
0
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
649
0
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
650
0
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
651
0
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
652
0
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
653
0
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
654
0
          uint8_t(0x80), 0x0, uint8_t(0x80));
655
1.63M
    } else {
656
1.63M
      check_values = _mm256_setr_epi8(
657
1.63M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
658
1.63M
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
659
1.63M
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
660
1.63M
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
661
1.63M
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
662
1.63M
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
663
1.63M
          int8_t(0x91), int8_t(0x80));
664
1.63M
    }
665
1.63M
    // Both hashes below average a table entry selected by the input byte with
    // this shifted copy of the input (the shift works on 32-bit elements).
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
666
1.63M
    __m256i delta_hash =
667
1.63M
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
668
1.63M
    // Only the default_or_url table set reduces the delta hash to its low
    // nibble before it is used as a shuffle index.
    if (default_or_url) {
669
0
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
670
0
    }
671
1.63M
    const __m256i check_hash =
672
1.63M
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
673
1.63M
    // out: translated bytes (input + delta); chk: input + check value, whose
    // per-byte top bit becomes the returned mask.
    const __m256i out =
674
1.63M
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
675
1.63M
    const __m256i chk =
676
1.63M
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
677
1.63M
    const int mask = _mm256_movemask_epi8(chk);
678
1.63M
    if (!ignore_garbage && mask) {
679
158k
      // A byte equals its table entry only when it is one of the five
      // whitespace values stored in ascii_space_tbl; XOR toggles those
      // positions in the flagged set to form *error.
      __m256i ascii_space =
680
158k
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
681
158k
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
682
158k
    }
683
1.63M
    *src = out;
684
1.63M
    return (uint32_t)mask;
685
1.63M
  }
686
687
137k
  // Writes the contents of chunks[0] and chunks[1] to output according to a
  // 64-bit mask and returns count_ones(~mask).
  //
  // mask:   low 32 bits are paired with chunks[0], high 32 bits with
  //         chunks[1]. In the single-bit fast path a set bit marks the byte
  //         that is removed; compress() is defined elsewhere and presumably
  //         follows the same convention -- confirm.
  // output: destination buffer; the single-bit path stores up to 64 bytes
  //         starting here.
  // Returns the count of zero bits in mask (63 on the single-bit path),
  // presumably the number of bytes kept.
  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
688
137k
    // NOTE(review): is_power_of_two is defined elsewhere; assumed true only
    // when exactly one bit of mask is set, as compress_block_single needs.
    if (is_power_of_two(mask)) {
689
29.0k
      return compress_block_single(mask, output);
690
29.0k
    }
691
692
108k
    uint64_t nmask = ~mask;
693
108k
    compress(chunks[0], uint32_t(mask), output);
694
108k
    // The second half is written at an offset equal to the number of zero
    // bits in the low 32 bits of mask.
    compress(chunks[1], uint32_t(mask >> 32),
695
108k
             output + count_ones(nmask & 0xFFFFFFFF));
696
108k
    return count_ones(nmask);
697
137k
  }
698
699
  // Fast path used by compress_block for a mask with one bit set: copies the
  // 64 bytes held in chunks[0] and chunks[1] to output with the byte at the
  // position of the lowest set bit of mask removed.
  //
  // mask:   only its lowest set bit is used (via trailing_zeroes).
  // output: must have room for 64 bytes; the last case stores 16 bytes at
  //         output + 48, so byte 63 can receive a filler value.
  // Always returns 63.
  simdutf_really_inline size_t compress_block_single(uint64_t mask,
700
29.0k
                                                     char *output) {
701
29.0k
    const size_t pos64 = trailing_zeroes(mask);
702
29.0k
    // pos: index of the removed byte inside its 16-byte lane;
    // pos64 >> 4 (switched on below): which of the four lanes holds it.
    const int8_t pos = pos64 & 0xf;
703
29.0k
    switch (pos64 >> 4) {
704
6.59k
    case 0b00: {
705
6.59k
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
706
6.59k
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
707
708
6.59k
      // Build shuffle indices that skip position pos: v2 is all-ones (-1) for
      // every index >= pos, so v1 - v2 adds one to those indices. The last
      // index becomes 16, which the byte shuffle reads as 0, so the final byte
      // of `compressed` is filler.
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
709
6.59k
      const __m128i v1 =
710
6.59k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
711
6.59k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
712
6.59k
      const __m128i sh = _mm_sub_epi8(v1, v2);
713
6.59k
      const __m128i compressed = _mm_shuffle_epi8(lane0, sh);
714
715
6.59k
      // The following stores start one byte early so they overwrite the filler
      // byte at the end of `compressed`.
      _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
716
6.59k
      _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1);
717
6.59k
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
718
6.59k
    } break;
719
7.67k
    case 0b01: {
720
7.67k
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
721
7.67k
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
722
7.67k
      _mm_storeu_si128((__m128i *)(output + 0 * 16), lane0);
723
724
7.67k
      // Same skip-one shuffle as in case 0b00, applied to lane1.
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
725
7.67k
      const __m128i v1 =
726
7.67k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
727
7.67k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
728
7.67k
      const __m128i sh = _mm_sub_epi8(v1, v2);
729
7.67k
      const __m128i compressed = _mm_shuffle_epi8(lane1, sh);
730
731
7.67k
      _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
732
7.67k
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
733
7.67k
    } break;
734
7.62k
    case 0b10: {
735
7.62k
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
736
7.62k
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
737
738
7.62k
      // chunks[0] is unaffected and copied as-is.
      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
739
740
7.62k
      // Same skip-one shuffle, applied to lane2.
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
741
7.62k
      const __m128i v1 =
742
7.62k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
743
7.62k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
744
7.62k
      const __m128i sh = _mm_sub_epi8(v1, v2);
745
7.62k
      const __m128i compressed = _mm_shuffle_epi8(lane2, sh);
746
747
7.62k
      _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
748
7.62k
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3);
749
7.62k
    } break;
750
7.15k
    case 0b11: {
751
7.15k
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
752
7.15k
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);
753
754
7.15k
      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
755
7.15k
      _mm_storeu_si128((__m128i *)(output + 2 * 16), lane2);
756
757
7.15k
      // Same skip-one shuffle, applied to lane3. No later store follows, so
      // the filler byte stays at output[63], outside the 63 returned bytes.
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
758
7.15k
      const __m128i v1 =
759
7.15k
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
760
7.15k
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
761
7.15k
      const __m128i sh = _mm_sub_epi8(v1, v2);
762
7.15k
      const __m128i compressed = _mm_shuffle_epi8(lane3, sh);
763
764
7.15k
      _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
765
7.15k
    } break;
766
29.0k
    }
767
768
29.0k
    return 63;
769
29.0k
  }
770
};