/src/simdutf/src/haswell/avx2_base64.cpp
Line | Count | Source |
1 | | /** |
2 | | * References and further reading: |
3 | | * |
4 | | * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the |
5 | | * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. |
6 | | * https://arxiv.org/abs/1910.05109 |
7 | | * |
8 | | * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 |
9 | | * Instructions, ACM Transactions on the Web 12 (3), 2018. |
10 | | * https://arxiv.org/abs/1704.00605 |
11 | | * |
12 | | * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. |
13 | | * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, |
14 | | * Request for Comments: 4648. |
15 | | * |
16 | | * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. |
17 | | * http://www.alfredklomp.com/programming/sse-base64/. (2014). |
18 | | * |
19 | | * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD |
20 | | * acceleration. https://github.com/aklomp/base64. (2014). |
21 | | * |
22 | | * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). |
23 | | * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ |
24 | | * |
25 | | * Nick Kopp. 2013. Base64 Encoding on a GPU. |
26 | | * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). |
27 | | */ |
28 | | |
29 | | template <bool base64_url> |
30 | 5.54M | simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { |
31 | | // Precomputed shuffle masks for K = 1 to 16 |
32 | | // credit: Wojciech Muła |
33 | 5.54M | __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); |
34 | 5.54M | const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); |
35 | 5.54M | result = |
36 | 5.54M | _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); |
37 | 5.54M | __m256i shift_LUT; |
38 | 5.54M | if (base64_url) { |
39 | 2.12M | shift_LUT = _mm256_setr_epi8( |
40 | 2.12M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
41 | 2.12M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, |
42 | | |
43 | 2.12M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
44 | 2.12M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); |
45 | 3.41M | } else { |
46 | 3.41M | shift_LUT = _mm256_setr_epi8( |
47 | 3.41M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
48 | 3.41M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, |
49 | | |
50 | 3.41M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, |
51 | 3.41M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); |
52 | 3.41M | } |
53 | | |
54 | 5.54M | result = _mm256_shuffle_epi8(shift_LUT, result); |
55 | 5.54M | return _mm256_add_epi8(result, input); |
56 | 5.54M | } simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<true>(long long __vector(4)) Line | Count | Source | 30 | 2.12M | simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { | 31 | | // Precomputed shuffle masks for K = 1 to 16 | 32 | | // credit: Wojciech Muła | 33 | 2.12M | __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); | 34 | 2.12M | const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); | 35 | 2.12M | result = | 36 | 2.12M | _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); | 37 | 2.12M | __m256i shift_LUT; | 38 | 2.12M | if (base64_url) { | 39 | 2.12M | shift_LUT = _mm256_setr_epi8( | 40 | 2.12M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 41 | 2.12M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, | 42 | | | 43 | 2.12M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 44 | 2.12M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); | 45 | 2.12M | } else { | 46 | 0 | shift_LUT = _mm256_setr_epi8( | 47 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 48 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, | 49 | |
| 50 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 51 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); | 52 | 0 | } | 53 | | | 54 | 2.12M | result = _mm256_shuffle_epi8(shift_LUT, result); | 55 | 2.12M | return _mm256_add_epi8(result, input); | 56 | 2.12M | } |
simdutf.cpp:long long __vector(4) simdutf::haswell::(anonymous namespace)::lookup_pshufb_improved<false>(long long __vector(4)) Line | Count | Source | 30 | 3.41M | simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { | 31 | | // Precomputed shuffle masks for K = 1 to 16 | 32 | | // credit: Wojciech Muła | 33 | 3.41M | __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); | 34 | 3.41M | const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); | 35 | 3.41M | result = | 36 | 3.41M | _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); | 37 | 3.41M | __m256i shift_LUT; | 38 | 3.41M | if (base64_url) { | 39 | 0 | shift_LUT = _mm256_setr_epi8( | 40 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 41 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, | 42 | |
| 43 | 0 | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 44 | 0 | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); | 45 | 3.41M | } else { | 46 | 3.41M | shift_LUT = _mm256_setr_epi8( | 47 | 3.41M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 48 | 3.41M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, | 49 | | | 50 | 3.41M | 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, | 51 | 3.41M | '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); | 52 | 3.41M | } | 53 | | | 54 | 3.41M | result = _mm256_shuffle_epi8(shift_LUT, result); | 55 | 3.41M | return _mm256_add_epi8(result, input); | 56 | 3.41M | } |
|
57 | | |
58 | 206k | simdutf_really_inline __m256i insert_line_feed32(__m256i input, int K) { |
59 | | |
60 | 206k | static const uint8_t low_table[16][32] = { |
61 | 206k | {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
62 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
63 | 206k | {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
64 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
65 | 206k | {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
66 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
67 | 206k | {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
68 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
69 | 206k | {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
70 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
71 | 206k | {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
72 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
73 | 206k | {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
74 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
75 | 206k | {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14, |
76 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
77 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14, |
78 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
79 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14, |
80 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
81 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14, |
82 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
83 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14, |
84 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
85 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14, |
86 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
87 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14, |
88 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
89 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14, |
90 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
91 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80, |
92 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; |
93 | 206k | static const uint8_t high_table[16][32] = { |
94 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
95 | 206k | 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
96 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
97 | 206k | 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
98 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
99 | 206k | 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
100 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
101 | 206k | 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
102 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
103 | 206k | 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
104 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
105 | 206k | 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
106 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
107 | 206k | 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14}, |
108 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
109 | 206k | 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14}, |
110 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
111 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14}, |
112 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
113 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14}, |
114 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
115 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14}, |
116 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
117 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14}, |
118 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
119 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14}, |
120 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
121 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14}, |
122 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
123 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14}, |
124 | 206k | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
125 | 206k | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}}; |
126 | | |
127 | 206k | __m256i line_feed_vector = _mm256_set1_epi8('\n'); |
128 | 206k | if (K >= 16) { |
129 | 99.1k | __m256i mask = _mm256_loadu_si256((const __m256i *)high_table[K - 16]); |
130 | 99.1k | __m256i lf_pos = |
131 | 99.1k | _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80))); |
132 | 99.1k | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
133 | 99.1k | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
134 | 99.1k | return result; |
135 | 99.1k | } |
136 | | // Shift input right by 1 byte |
137 | 107k | __m256i shift = _mm256_alignr_epi8( |
138 | 107k | input, _mm256_permute2x128_si256(input, input, 0x21), 15); |
139 | | |
140 | 107k | input = _mm256_blend_epi32(input, shift, 0xF0); |
141 | | |
142 | 107k | __m256i mask = _mm256_loadu_si256((const __m256i *)low_table[K]); |
143 | | |
144 | 107k | __m256i lf_pos = |
145 | 107k | _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80))); |
146 | 107k | __m256i shuffled = _mm256_shuffle_epi8(input, mask); |
147 | | |
148 | 107k | __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos); |
149 | 107k | return result; |
150 | 206k | } |
151 | | |
152 | | template <bool isbase64url, bool use_lines> |
153 | | size_t |
154 | | avx2_encode_base64_impl(char *dst, const char *src, size_t srclen, |
155 | | base64_options options, |
156 | 17.0k | size_t line_length = simdutf::default_line_length) { |
157 | 17.0k | size_t offset = 0; |
158 | | |
159 | 17.0k | if (line_length < 4) { |
160 | 0 | line_length = 4; // We do not support line_length less than 4 |
161 | 0 | } |
162 | | // credit: Wojciech Muła |
163 | 17.0k | const uint8_t *input = (const uint8_t *)src; |
164 | | |
165 | 17.0k | uint8_t *out = (uint8_t *)dst; |
166 | 17.0k | const __m256i shuf = |
167 | 17.0k | _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, |
168 | | |
169 | 17.0k | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); |
170 | 17.0k | size_t i = 0; |
171 | 1.40M | for (; i + 100 <= srclen; i += 96) { |
172 | 1.38M | const __m128i lo0 = _mm_loadu_si128( |
173 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); |
174 | 1.38M | const __m128i hi0 = _mm_loadu_si128( |
175 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); |
176 | 1.38M | const __m128i lo1 = _mm_loadu_si128( |
177 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); |
178 | 1.38M | const __m128i hi1 = _mm_loadu_si128( |
179 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); |
180 | 1.38M | const __m128i lo2 = _mm_loadu_si128( |
181 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); |
182 | 1.38M | const __m128i hi2 = _mm_loadu_si128( |
183 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); |
184 | 1.38M | const __m128i lo3 = _mm_loadu_si128( |
185 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); |
186 | 1.38M | const __m128i hi3 = _mm_loadu_si128( |
187 | 1.38M | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); |
188 | | |
189 | 1.38M | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); |
190 | 1.38M | __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); |
191 | 1.38M | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); |
192 | 1.38M | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); |
193 | | |
194 | 1.38M | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); |
195 | 1.38M | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); |
196 | 1.38M | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); |
197 | 1.38M | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); |
198 | | |
199 | 1.38M | const __m256i t1_0 = |
200 | 1.38M | _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); |
201 | 1.38M | const __m256i t1_1 = |
202 | 1.38M | _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); |
203 | 1.38M | const __m256i t1_2 = |
204 | 1.38M | _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); |
205 | 1.38M | const __m256i t1_3 = |
206 | 1.38M | _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); |
207 | | |
208 | 1.38M | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); |
209 | 1.38M | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); |
210 | 1.38M | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); |
211 | 1.38M | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); |
212 | | |
213 | 1.38M | const __m256i t3_0 = |
214 | 1.38M | _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); |
215 | 1.38M | const __m256i t3_1 = |
216 | 1.38M | _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); |
217 | 1.38M | const __m256i t3_2 = |
218 | 1.38M | _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); |
219 | 1.38M | const __m256i t3_3 = |
220 | 1.38M | _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); |
221 | | |
222 | 1.38M | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); |
223 | 1.38M | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); |
224 | 1.38M | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); |
225 | 1.38M | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); |
226 | | |
227 | 1.38M | if (use_lines) { |
228 | 691k | if (line_length >= 32) { // fast path |
229 | 113k | __m256i result; |
230 | 113k | result = lookup_pshufb_improved<isbase64url>(input0); |
231 | 113k | if (offset + 32 > line_length) { |
232 | 52.9k | size_t location_end = line_length - offset; |
233 | 52.9k | size_t to_move = 32 - location_end; |
234 | | // We could do this, or extract instead. |
235 | 52.9k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); |
236 | 52.9k | _mm256_storeu_si256( |
237 | 52.9k | reinterpret_cast<__m256i *>(out), |
238 | 52.9k | insert_line_feed32(result, static_cast<int>(location_end))); |
239 | 52.9k | offset = to_move; |
240 | 52.9k | out += 32 + 1; |
241 | 60.4k | } else { |
242 | 60.4k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); |
243 | 60.4k | offset += 32; |
244 | 60.4k | out += 32; |
245 | 60.4k | } |
246 | 113k | result = lookup_pshufb_improved<isbase64url>(input1); |
247 | | |
248 | 113k | if (offset + 32 > line_length) { |
249 | 50.1k | size_t location_end = line_length - offset; |
250 | 50.1k | size_t to_move = 32 - location_end; |
251 | | |
252 | | // We could do this, or extract instead. |
253 | 50.1k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); |
254 | 50.1k | _mm256_storeu_si256( |
255 | 50.1k | reinterpret_cast<__m256i *>(out), |
256 | 50.1k | insert_line_feed32(result, static_cast<int>(location_end))); |
257 | | // see above. |
258 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); |
259 | 50.1k | offset = to_move; |
260 | 50.1k | out += 32 + 1; |
261 | 63.2k | } else { |
262 | | |
263 | 63.2k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); |
264 | | |
265 | 63.2k | offset += 32; |
266 | 63.2k | out += 32; |
267 | 63.2k | } |
268 | 113k | result = lookup_pshufb_improved<isbase64url>(input2); |
269 | | |
270 | 113k | if (offset + 32 > line_length) { |
271 | 52.7k | size_t location_end = line_length - offset; |
272 | 52.7k | size_t to_move = 32 - location_end; |
273 | | |
274 | | // We could do this, or extract instead. |
275 | 52.7k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); |
276 | 52.7k | _mm256_storeu_si256( |
277 | 52.7k | reinterpret_cast<__m256i *>(out), |
278 | 52.7k | insert_line_feed32(result, static_cast<int>(location_end))); |
279 | | // see above. |
280 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); |
281 | 52.7k | offset = to_move; |
282 | 52.7k | out += 32 + 1; |
283 | 60.6k | } else { |
284 | 60.6k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); |
285 | 60.6k | offset += 32; |
286 | 60.6k | out += 32; |
287 | 60.6k | } |
288 | 113k | result = lookup_pshufb_improved<isbase64url>(input3); |
289 | | |
290 | 113k | if (offset + 32 > line_length) { |
291 | 50.4k | size_t location_end = line_length - offset; |
292 | 50.4k | size_t to_move = 32 - location_end; |
293 | | |
294 | | // We could do this, or extract instead. |
295 | 50.4k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); |
296 | 50.4k | _mm256_storeu_si256( |
297 | 50.4k | reinterpret_cast<__m256i *>(out), |
298 | 50.4k | insert_line_feed32(result, static_cast<int>(location_end))); |
299 | | // see above. |
300 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); |
301 | 50.4k | offset = to_move; |
302 | 50.4k | out += 32 + 1; |
303 | 63.0k | } else { |
304 | 63.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); |
305 | 63.0k | offset += 32; |
306 | 63.0k | out += 32; |
307 | 63.0k | } |
308 | 577k | } else { // slow path |
309 | | // could be optimized |
310 | 577k | uint8_t buffer[128]; |
311 | 577k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), |
312 | 577k | lookup_pshufb_improved<isbase64url>(input0)); |
313 | 577k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), |
314 | 577k | lookup_pshufb_improved<isbase64url>(input1)); |
315 | 577k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), |
316 | 577k | lookup_pshufb_improved<isbase64url>(input2)); |
317 | 577k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), |
318 | 577k | lookup_pshufb_improved<isbase64url>(input3)); |
319 | 577k | size_t out_pos = 0; |
320 | 577k | size_t local_offset = offset; |
321 | 74.5M | for (size_t j = 0; j < 128;) { |
322 | 73.9M | if (local_offset == line_length) { |
323 | 17.3M | out[out_pos++] = '\n'; |
324 | 17.3M | local_offset = 0; |
325 | 17.3M | } |
326 | 73.9M | out[out_pos++] = buffer[j++]; |
327 | 73.9M | local_offset++; |
328 | 73.9M | } |
329 | 577k | offset = local_offset; |
330 | 577k | out += out_pos; |
331 | 577k | } |
332 | 693k | } else { |
333 | 693k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), |
334 | 693k | lookup_pshufb_improved<isbase64url>(input0)); |
335 | 693k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), |
336 | 693k | lookup_pshufb_improved<isbase64url>(input1)); |
337 | 693k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), |
338 | 693k | lookup_pshufb_improved<isbase64url>(input2)); |
339 | 693k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), |
340 | 693k | lookup_pshufb_improved<isbase64url>(input3)); |
341 | | |
342 | 693k | out += 128; |
343 | 693k | } |
344 | 1.38M | } |
345 | 20.9k | for (; i + 28 <= srclen; i += 24) { |
346 | | // lo = [xxxx|DDDC|CCBB|BAAA] |
347 | | // hi = [xxxx|HHHG|GGFF|FEEE] |
348 | 3.93k | const __m128i lo = |
349 | 3.93k | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); |
350 | 3.93k | const __m128i hi = |
351 | 3.93k | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); |
352 | | |
353 | | // bytes from groups A, B and C are needed in separate 32-bit lanes |
354 | | // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] |
355 | 3.93k | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); |
356 | | |
357 | | // this part is well commented in encode.sse.cpp |
358 | | |
359 | 3.93k | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); |
360 | 3.93k | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); |
361 | 3.93k | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); |
362 | 3.93k | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); |
363 | 3.93k | const __m256i indices = _mm256_or_si256(t1, t3); |
364 | | |
365 | 3.93k | if (use_lines) { |
366 | 1.11k | if (line_length >= 32) { // fast path |
367 | 571 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), |
368 | 571 | lookup_pshufb_improved<isbase64url>(indices)); |
369 | | |
370 | 571 | if (offset + 32 > line_length) { |
371 | 184 | size_t location_end = line_length - offset; |
372 | 184 | size_t to_move = 32 - location_end; |
373 | 184 | std::memmove(out + location_end + 1, out + location_end, to_move); |
374 | 184 | out[location_end] = '\n'; |
375 | 184 | offset = to_move; |
376 | 184 | out += 32 + 1; |
377 | 387 | } else { |
378 | 387 | offset += 32; |
379 | 387 | out += 32; |
380 | 387 | } |
381 | 571 | } else { // slow path |
382 | | // could be optimized |
383 | 548 | alignas(32) uint8_t buffer[32]; |
384 | 548 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), |
385 | 548 | lookup_pshufb_improved<isbase64url>(indices)); |
386 | 548 | std::memcpy(out, buffer, 32); |
387 | 548 | size_t out_pos = 0; |
388 | 548 | size_t local_offset = offset; |
389 | 18.0k | for (size_t j = 0; j < 32;) { |
390 | 17.5k | if (local_offset == line_length) { |
391 | 3.34k | out[out_pos++] = '\n'; |
392 | 3.34k | local_offset = 0; |
393 | 3.34k | } |
394 | 17.5k | out[out_pos++] = buffer[j++]; |
395 | 17.5k | local_offset++; |
396 | 17.5k | } |
397 | 548 | offset = local_offset; |
398 | 548 | out += out_pos; |
399 | 548 | } |
400 | 2.81k | } else { |
401 | 2.81k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), |
402 | 2.81k | lookup_pshufb_improved<isbase64url>(indices)); |
403 | | |
404 | 2.81k | out += 32; |
405 | 2.81k | } |
406 | 3.93k | } |
407 | 17.0k | return ((char *)out - (char *)dst) + |
408 | 17.0k | scalar::base64::tail_encode_base64_impl<use_lines>( |
409 | 17.0k | (char *)out, src + i, srclen - i, options, line_length, offset); |
410 | 17.0k | } simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long) Line | Count | Source | 156 | 579 | size_t line_length = simdutf::default_line_length) { | 157 | 579 | size_t offset = 0; | 158 | | | 159 | 579 | if (line_length < 4) { | 160 | 0 | line_length = 4; // We do not support line_length less than 4 | 161 | 0 | } | 162 | | // credit: Wojciech Muła | 163 | 579 | const uint8_t *input = (const uint8_t *)src; | 164 | | | 165 | 579 | uint8_t *out = (uint8_t *)dst; | 166 | 579 | const __m256i shuf = | 167 | 579 | _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, | 168 | | | 169 | 579 | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); | 170 | 579 | size_t i = 0; | 171 | 266k | for (; i + 100 <= srclen; i += 96) { | 172 | 265k | const __m128i lo0 = _mm_loadu_si128( | 173 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); | 174 | 265k | const __m128i hi0 = _mm_loadu_si128( | 175 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); | 176 | 265k | const __m128i lo1 = _mm_loadu_si128( | 177 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); | 178 | 265k | const __m128i hi1 = _mm_loadu_si128( | 179 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); | 180 | 265k | const __m128i lo2 = _mm_loadu_si128( | 181 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); | 182 | 265k | const __m128i hi2 = _mm_loadu_si128( | 183 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); | 184 | 265k | const __m128i lo3 = _mm_loadu_si128( | 185 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); | 186 | 265k | const __m128i hi3 = _mm_loadu_si128( | 187 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); | 188 | | | 189 | 265k | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); | 190 | 265k | __m256i 
in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); | 191 | 265k | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); | 192 | 265k | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); | 193 | | | 194 | 265k | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); | 195 | 265k | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); | 196 | 265k | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); | 197 | 265k | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); | 198 | | | 199 | 265k | const __m256i t1_0 = | 200 | 265k | _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); | 201 | 265k | const __m256i t1_1 = | 202 | 265k | _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); | 203 | 265k | const __m256i t1_2 = | 204 | 265k | _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); | 205 | 265k | const __m256i t1_3 = | 206 | 265k | _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); | 207 | | | 208 | 265k | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); | 209 | 265k | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); | 210 | 265k | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); | 211 | 265k | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); | 212 | | | 213 | 265k | const __m256i t3_0 = | 214 | 265k | _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); | 215 | 265k | const __m256i t3_1 = | 216 | 265k | _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); | 217 | 265k | const __m256i t3_2 = | 218 | 265k | _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); | 219 | 265k | const __m256i t3_3 = | 220 | 265k | _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); | 221 | | | 222 | 265k | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); | 223 | 265k | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); | 
224 | 265k | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); | 225 | 265k | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); | 226 | | | 227 | 265k | if (use_lines) { | 228 | 0 | if (line_length >= 32) { // fast path | 229 | 0 | __m256i result; | 230 | 0 | result = lookup_pshufb_improved<isbase64url>(input0); | 231 | 0 | if (offset + 32 > line_length) { | 232 | 0 | size_t location_end = line_length - offset; | 233 | 0 | size_t to_move = 32 - location_end; | 234 | | // We could do this, or extract instead. | 235 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 236 | 0 | _mm256_storeu_si256( | 237 | 0 | reinterpret_cast<__m256i *>(out), | 238 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 239 | 0 | offset = to_move; | 240 | 0 | out += 32 + 1; | 241 | 0 | } else { | 242 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 243 | 0 | offset += 32; | 244 | 0 | out += 32; | 245 | 0 | } | 246 | 0 | result = lookup_pshufb_improved<isbase64url>(input1); | 247 | |
| 248 | 0 | if (offset + 32 > line_length) { | 249 | 0 | size_t location_end = line_length - offset; | 250 | 0 | size_t to_move = 32 - location_end; | 251 | | | 252 | | // We could do this, or extract instead. | 253 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 254 | 0 | _mm256_storeu_si256( | 255 | 0 | reinterpret_cast<__m256i *>(out), | 256 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 257 | | // see above. | 258 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 259 | 0 | offset = to_move; | 260 | 0 | out += 32 + 1; | 261 | 0 | } else { | 262 | |
| 263 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 264 | |
| 265 | 0 | offset += 32; | 266 | 0 | out += 32; | 267 | 0 | } | 268 | 0 | result = lookup_pshufb_improved<isbase64url>(input2); | 269 | |
| 270 | 0 | if (offset + 32 > line_length) { | 271 | 0 | size_t location_end = line_length - offset; | 272 | 0 | size_t to_move = 32 - location_end; | 273 | | | 274 | | // We could do this, or extract instead. | 275 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 276 | 0 | _mm256_storeu_si256( | 277 | 0 | reinterpret_cast<__m256i *>(out), | 278 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 279 | | // see above. | 280 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 281 | 0 | offset = to_move; | 282 | 0 | out += 32 + 1; | 283 | 0 | } else { | 284 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 285 | 0 | offset += 32; | 286 | 0 | out += 32; | 287 | 0 | } | 288 | 0 | result = lookup_pshufb_improved<isbase64url>(input3); | 289 | |
| 290 | 0 | if (offset + 32 > line_length) { | 291 | 0 | size_t location_end = line_length - offset; | 292 | 0 | size_t to_move = 32 - location_end; | 293 | | | 294 | | // We could do this, or extract instead. | 295 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 296 | 0 | _mm256_storeu_si256( | 297 | 0 | reinterpret_cast<__m256i *>(out), | 298 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 299 | | // see above. | 300 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 301 | 0 | offset = to_move; | 302 | 0 | out += 32 + 1; | 303 | 0 | } else { | 304 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 305 | 0 | offset += 32; | 306 | 0 | out += 32; | 307 | 0 | } | 308 | 0 | } else { // slow path | 309 | | // could be optimized | 310 | 0 | uint8_t buffer[128]; | 311 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 312 | 0 | lookup_pshufb_improved<isbase64url>(input0)); | 313 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), | 314 | 0 | lookup_pshufb_improved<isbase64url>(input1)); | 315 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), | 316 | 0 | lookup_pshufb_improved<isbase64url>(input2)); | 317 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), | 318 | 0 | lookup_pshufb_improved<isbase64url>(input3)); | 319 | 0 | size_t out_pos = 0; | 320 | 0 | size_t local_offset = offset; | 321 | 0 | for (size_t j = 0; j < 128;) { | 322 | 0 | if (local_offset == line_length) { | 323 | 0 | out[out_pos++] = '\n'; | 324 | 0 | local_offset = 0; | 325 | 0 | } | 326 | 0 | out[out_pos++] = buffer[j++]; | 327 | 0 | local_offset++; | 328 | 0 | } | 329 | 0 | offset = local_offset; | 330 | 0 | out += out_pos; | 331 | 0 | } | 332 | 265k | } else { | 333 | 265k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 334 | 265k | lookup_pshufb_improved<isbase64url>(input0)); | 335 | 265k | _mm256_storeu_si256(reinterpret_cast<__m256i 
*>(out + 32), | 336 | 265k | lookup_pshufb_improved<isbase64url>(input1)); | 337 | 265k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), | 338 | 265k | lookup_pshufb_improved<isbase64url>(input2)); | 339 | 265k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), | 340 | 265k | lookup_pshufb_improved<isbase64url>(input3)); | 341 | | | 342 | 265k | out += 128; | 343 | 265k | } | 344 | 265k | } | 345 | 1.14k | for (; i + 28 <= srclen; i += 24) { | 346 | | // lo = [xxxx|DDDC|CCBB|BAAA] | 347 | | // hi = [xxxx|HHHG|GGFF|FEEE] | 348 | 565 | const __m128i lo = | 349 | 565 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); | 350 | 565 | const __m128i hi = | 351 | 565 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); | 352 | | | 353 | | // bytes from groups A, B and C are needed in separate 32-bit lanes | 354 | | // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] | 355 | 565 | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); | 356 | | | 357 | | // this part is well commented in encode.sse.cpp | 358 | | | 359 | 565 | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); | 360 | 565 | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); | 361 | 565 | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); | 362 | 565 | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); | 363 | 565 | const __m256i indices = _mm256_or_si256(t1, t3); | 364 | | | 365 | 565 | if (use_lines) { | 366 | 0 | if (line_length >= 32) { // fast path | 367 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 368 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 369 | |
| 370 | 0 | if (offset + 32 > line_length) { | 371 | 0 | size_t location_end = line_length - offset; | 372 | 0 | size_t to_move = 32 - location_end; | 373 | 0 | std::memmove(out + location_end + 1, out + location_end, to_move); | 374 | 0 | out[location_end] = '\n'; | 375 | 0 | offset = to_move; | 376 | 0 | out += 32 + 1; | 377 | 0 | } else { | 378 | 0 | offset += 32; | 379 | 0 | out += 32; | 380 | 0 | } | 381 | 0 | } else { // slow path | 382 | | // could be optimized | 383 | 0 | alignas(32) uint8_t buffer[32]; | 384 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 385 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 386 | 0 | std::memcpy(out, buffer, 32); | 387 | 0 | size_t out_pos = 0; | 388 | 0 | size_t local_offset = offset; | 389 | 0 | for (size_t j = 0; j < 32;) { | 390 | 0 | if (local_offset == line_length) { | 391 | 0 | out[out_pos++] = '\n'; | 392 | 0 | local_offset = 0; | 393 | 0 | } | 394 | 0 | out[out_pos++] = buffer[j++]; | 395 | 0 | local_offset++; | 396 | 0 | } | 397 | 0 | offset = local_offset; | 398 | 0 | out += out_pos; | 399 | 0 | } | 400 | 565 | } else { | 401 | 565 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 402 | 565 | lookup_pshufb_improved<isbase64url>(indices)); | 403 | | | 404 | 565 | out += 32; | 405 | 565 | } | 406 | 565 | } | 407 | 579 | return ((char *)out - (char *)dst) + | 408 | 579 | scalar::base64::tail_encode_base64_impl<use_lines>( | 409 | 579 | (char *)out, src + i, srclen - i, options, line_length, offset); | 410 | 579 | } |
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, false>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long) Line | Count | Source | 156 | 15.2k | size_t line_length = simdutf::default_line_length) { | 157 | 15.2k | size_t offset = 0; | 158 | | | 159 | 15.2k | if (line_length < 4) { | 160 | 0 | line_length = 4; // We do not support line_length less than 4 | 161 | 0 | } | 162 | | // credit: Wojciech Muła | 163 | 15.2k | const uint8_t *input = (const uint8_t *)src; | 164 | | | 165 | 15.2k | uint8_t *out = (uint8_t *)dst; | 166 | 15.2k | const __m256i shuf = | 167 | 15.2k | _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, | 168 | | | 169 | 15.2k | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); | 170 | 15.2k | size_t i = 0; | 171 | 442k | for (; i + 100 <= srclen; i += 96) { | 172 | 427k | const __m128i lo0 = _mm_loadu_si128( | 173 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); | 174 | 427k | const __m128i hi0 = _mm_loadu_si128( | 175 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); | 176 | 427k | const __m128i lo1 = _mm_loadu_si128( | 177 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); | 178 | 427k | const __m128i hi1 = _mm_loadu_si128( | 179 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); | 180 | 427k | const __m128i lo2 = _mm_loadu_si128( | 181 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); | 182 | 427k | const __m128i hi2 = _mm_loadu_si128( | 183 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); | 184 | 427k | const __m128i lo3 = _mm_loadu_si128( | 185 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); | 186 | 427k | const __m128i hi3 = _mm_loadu_si128( | 187 | 427k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); | 188 | | | 189 | 427k | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); | 190 | 427k | 
__m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); | 191 | 427k | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); | 192 | 427k | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); | 193 | | | 194 | 427k | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); | 195 | 427k | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); | 196 | 427k | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); | 197 | 427k | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); | 198 | | | 199 | 427k | const __m256i t1_0 = | 200 | 427k | _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); | 201 | 427k | const __m256i t1_1 = | 202 | 427k | _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); | 203 | 427k | const __m256i t1_2 = | 204 | 427k | _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); | 205 | 427k | const __m256i t1_3 = | 206 | 427k | _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); | 207 | | | 208 | 427k | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); | 209 | 427k | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); | 210 | 427k | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); | 211 | 427k | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); | 212 | | | 213 | 427k | const __m256i t3_0 = | 214 | 427k | _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); | 215 | 427k | const __m256i t3_1 = | 216 | 427k | _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); | 217 | 427k | const __m256i t3_2 = | 218 | 427k | _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); | 219 | 427k | const __m256i t3_3 = | 220 | 427k | _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); | 221 | | | 222 | 427k | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); | 223 | 427k | const __m256i input1 = _mm256_or_si256(t1_1, 
t3_1); | 224 | 427k | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); | 225 | 427k | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); | 226 | | | 227 | 427k | if (use_lines) { | 228 | 0 | if (line_length >= 32) { // fast path | 229 | 0 | __m256i result; | 230 | 0 | result = lookup_pshufb_improved<isbase64url>(input0); | 231 | 0 | if (offset + 32 > line_length) { | 232 | 0 | size_t location_end = line_length - offset; | 233 | 0 | size_t to_move = 32 - location_end; | 234 | | // We could do this, or extract instead. | 235 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 236 | 0 | _mm256_storeu_si256( | 237 | 0 | reinterpret_cast<__m256i *>(out), | 238 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 239 | 0 | offset = to_move; | 240 | 0 | out += 32 + 1; | 241 | 0 | } else { | 242 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 243 | 0 | offset += 32; | 244 | 0 | out += 32; | 245 | 0 | } | 246 | 0 | result = lookup_pshufb_improved<isbase64url>(input1); | 247 | |
| 248 | 0 | if (offset + 32 > line_length) { | 249 | 0 | size_t location_end = line_length - offset; | 250 | 0 | size_t to_move = 32 - location_end; | 251 | | | 252 | | // We could do this, or extract instead. | 253 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 254 | 0 | _mm256_storeu_si256( | 255 | 0 | reinterpret_cast<__m256i *>(out), | 256 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 257 | | // see above. | 258 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 259 | 0 | offset = to_move; | 260 | 0 | out += 32 + 1; | 261 | 0 | } else { | 262 | |
| 263 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 264 | |
| 265 | 0 | offset += 32; | 266 | 0 | out += 32; | 267 | 0 | } | 268 | 0 | result = lookup_pshufb_improved<isbase64url>(input2); | 269 | |
| 270 | 0 | if (offset + 32 > line_length) { | 271 | 0 | size_t location_end = line_length - offset; | 272 | 0 | size_t to_move = 32 - location_end; | 273 | | | 274 | | // We could do this, or extract instead. | 275 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 276 | 0 | _mm256_storeu_si256( | 277 | 0 | reinterpret_cast<__m256i *>(out), | 278 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 279 | | // see above. | 280 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 281 | 0 | offset = to_move; | 282 | 0 | out += 32 + 1; | 283 | 0 | } else { | 284 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 285 | 0 | offset += 32; | 286 | 0 | out += 32; | 287 | 0 | } | 288 | 0 | result = lookup_pshufb_improved<isbase64url>(input3); | 289 | |
| 290 | 0 | if (offset + 32 > line_length) { | 291 | 0 | size_t location_end = line_length - offset; | 292 | 0 | size_t to_move = 32 - location_end; | 293 | | | 294 | | // We could do this, or extract instead. | 295 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 296 | 0 | _mm256_storeu_si256( | 297 | 0 | reinterpret_cast<__m256i *>(out), | 298 | 0 | insert_line_feed32(result, static_cast<int>(location_end))); | 299 | | // see above. | 300 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 301 | 0 | offset = to_move; | 302 | 0 | out += 32 + 1; | 303 | 0 | } else { | 304 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 305 | 0 | offset += 32; | 306 | 0 | out += 32; | 307 | 0 | } | 308 | 0 | } else { // slow path | 309 | | // could be optimized | 310 | 0 | uint8_t buffer[128]; | 311 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 312 | 0 | lookup_pshufb_improved<isbase64url>(input0)); | 313 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), | 314 | 0 | lookup_pshufb_improved<isbase64url>(input1)); | 315 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), | 316 | 0 | lookup_pshufb_improved<isbase64url>(input2)); | 317 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), | 318 | 0 | lookup_pshufb_improved<isbase64url>(input3)); | 319 | 0 | size_t out_pos = 0; | 320 | 0 | size_t local_offset = offset; | 321 | 0 | for (size_t j = 0; j < 128;) { | 322 | 0 | if (local_offset == line_length) { | 323 | 0 | out[out_pos++] = '\n'; | 324 | 0 | local_offset = 0; | 325 | 0 | } | 326 | 0 | out[out_pos++] = buffer[j++]; | 327 | 0 | local_offset++; | 328 | 0 | } | 329 | 0 | offset = local_offset; | 330 | 0 | out += out_pos; | 331 | 0 | } | 332 | 427k | } else { | 333 | 427k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 334 | 427k | lookup_pshufb_improved<isbase64url>(input0)); | 335 | 427k | _mm256_storeu_si256(reinterpret_cast<__m256i 
*>(out + 32), | 336 | 427k | lookup_pshufb_improved<isbase64url>(input1)); | 337 | 427k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), | 338 | 427k | lookup_pshufb_improved<isbase64url>(input2)); | 339 | 427k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), | 340 | 427k | lookup_pshufb_improved<isbase64url>(input3)); | 341 | | | 342 | 427k | out += 128; | 343 | 427k | } | 344 | 427k | } | 345 | 17.5k | for (; i + 28 <= srclen; i += 24) { | 346 | | // lo = [xxxx|DDDC|CCBB|BAAA] | 347 | | // hi = [xxxx|HHHG|GGFF|FEEE] | 348 | 2.24k | const __m128i lo = | 349 | 2.24k | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); | 350 | 2.24k | const __m128i hi = | 351 | 2.24k | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); | 352 | | | 353 | | // bytes from groups A, B and C are needed in separate 32-bit lanes | 354 | | // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] | 355 | 2.24k | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); | 356 | | | 357 | | // this part is well commented in encode.sse.cpp | 358 | | | 359 | 2.24k | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); | 360 | 2.24k | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); | 361 | 2.24k | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); | 362 | 2.24k | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); | 363 | 2.24k | const __m256i indices = _mm256_or_si256(t1, t3); | 364 | | | 365 | 2.24k | if (use_lines) { | 366 | 0 | if (line_length >= 32) { // fast path | 367 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 368 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 369 | |
| 370 | 0 | if (offset + 32 > line_length) { | 371 | 0 | size_t location_end = line_length - offset; | 372 | 0 | size_t to_move = 32 - location_end; | 373 | 0 | std::memmove(out + location_end + 1, out + location_end, to_move); | 374 | 0 | out[location_end] = '\n'; | 375 | 0 | offset = to_move; | 376 | 0 | out += 32 + 1; | 377 | 0 | } else { | 378 | 0 | offset += 32; | 379 | 0 | out += 32; | 380 | 0 | } | 381 | 0 | } else { // slow path | 382 | | // could be optimized | 383 | 0 | alignas(32) uint8_t buffer[32]; | 384 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 385 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 386 | 0 | std::memcpy(out, buffer, 32); | 387 | 0 | size_t out_pos = 0; | 388 | 0 | size_t local_offset = offset; | 389 | 0 | for (size_t j = 0; j < 32;) { | 390 | 0 | if (local_offset == line_length) { | 391 | 0 | out[out_pos++] = '\n'; | 392 | 0 | local_offset = 0; | 393 | 0 | } | 394 | 0 | out[out_pos++] = buffer[j++]; | 395 | 0 | local_offset++; | 396 | 0 | } | 397 | 0 | offset = local_offset; | 398 | 0 | out += out_pos; | 399 | 0 | } | 400 | 2.24k | } else { | 401 | 2.24k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 402 | 2.24k | lookup_pshufb_improved<isbase64url>(indices)); | 403 | | | 404 | 2.24k | out += 32; | 405 | 2.24k | } | 406 | 2.24k | } | 407 | 15.2k | return ((char *)out - (char *)dst) + | 408 | 15.2k | scalar::base64::tail_encode_base64_impl<use_lines>( | 409 | 15.2k | (char *)out, src + i, srclen - i, options, line_length, offset); | 410 | 15.2k | } |
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<true, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long) Line | Count | Source | 156 | 579 | size_t line_length = simdutf::default_line_length) { | 157 | 579 | size_t offset = 0; | 158 | | | 159 | 579 | if (line_length < 4) { | 160 | 0 | line_length = 4; // We do not support line_length less than 4 | 161 | 0 | } | 162 | | // credit: Wojciech Muła | 163 | 579 | const uint8_t *input = (const uint8_t *)src; | 164 | | | 165 | 579 | uint8_t *out = (uint8_t *)dst; | 166 | 579 | const __m256i shuf = | 167 | 579 | _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, | 168 | | | 169 | 579 | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); | 170 | 579 | size_t i = 0; | 171 | 266k | for (; i + 100 <= srclen; i += 96) { | 172 | 265k | const __m128i lo0 = _mm_loadu_si128( | 173 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); | 174 | 265k | const __m128i hi0 = _mm_loadu_si128( | 175 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); | 176 | 265k | const __m128i lo1 = _mm_loadu_si128( | 177 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); | 178 | 265k | const __m128i hi1 = _mm_loadu_si128( | 179 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); | 180 | 265k | const __m128i lo2 = _mm_loadu_si128( | 181 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); | 182 | 265k | const __m128i hi2 = _mm_loadu_si128( | 183 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); | 184 | 265k | const __m128i lo3 = _mm_loadu_si128( | 185 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); | 186 | 265k | const __m128i hi3 = _mm_loadu_si128( | 187 | 265k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); | 188 | | | 189 | 265k | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); | 190 | 265k | __m256i in1 = 
_mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); | 191 | 265k | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); | 192 | 265k | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); | 193 | | | 194 | 265k | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); | 195 | 265k | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); | 196 | 265k | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); | 197 | 265k | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); | 198 | | | 199 | 265k | const __m256i t1_0 = | 200 | 265k | _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); | 201 | 265k | const __m256i t1_1 = | 202 | 265k | _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); | 203 | 265k | const __m256i t1_2 = | 204 | 265k | _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); | 205 | 265k | const __m256i t1_3 = | 206 | 265k | _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); | 207 | | | 208 | 265k | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); | 209 | 265k | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); | 210 | 265k | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); | 211 | 265k | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); | 212 | | | 213 | 265k | const __m256i t3_0 = | 214 | 265k | _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); | 215 | 265k | const __m256i t3_1 = | 216 | 265k | _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); | 217 | 265k | const __m256i t3_2 = | 218 | 265k | _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); | 219 | 265k | const __m256i t3_3 = | 220 | 265k | _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); | 221 | | | 222 | 265k | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); | 223 | 265k | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); | 224 | 
265k | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); | 225 | 265k | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); | 226 | | | 227 | 265k | if (use_lines) { | 228 | 265k | if (line_length >= 32) { // fast path | 229 | 36.2k | __m256i result; | 230 | 36.2k | result = lookup_pshufb_improved<isbase64url>(input0); | 231 | 36.2k | if (offset + 32 > line_length) { | 232 | 17.7k | size_t location_end = line_length - offset; | 233 | 17.7k | size_t to_move = 32 - location_end; | 234 | | // We could do this, or extract instead. | 235 | 17.7k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 236 | 17.7k | _mm256_storeu_si256( | 237 | 17.7k | reinterpret_cast<__m256i *>(out), | 238 | 17.7k | insert_line_feed32(result, static_cast<int>(location_end))); | 239 | 17.7k | offset = to_move; | 240 | 17.7k | out += 32 + 1; | 241 | 18.4k | } else { | 242 | 18.4k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 243 | 18.4k | offset += 32; | 244 | 18.4k | out += 32; | 245 | 18.4k | } | 246 | 36.2k | result = lookup_pshufb_improved<isbase64url>(input1); | 247 | | | 248 | 36.2k | if (offset + 32 > line_length) { | 249 | 16.0k | size_t location_end = line_length - offset; | 250 | 16.0k | size_t to_move = 32 - location_end; | 251 | | | 252 | | // We could do this, or extract instead. | 253 | 16.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 254 | 16.0k | _mm256_storeu_si256( | 255 | 16.0k | reinterpret_cast<__m256i *>(out), | 256 | 16.0k | insert_line_feed32(result, static_cast<int>(location_end))); | 257 | | // see above. 
| 258 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 259 | 16.0k | offset = to_move; | 260 | 16.0k | out += 32 + 1; | 261 | 20.2k | } else { | 262 | | | 263 | 20.2k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 264 | | | 265 | 20.2k | offset += 32; | 266 | 20.2k | out += 32; | 267 | 20.2k | } | 268 | 36.2k | result = lookup_pshufb_improved<isbase64url>(input2); | 269 | | | 270 | 36.2k | if (offset + 32 > line_length) { | 271 | 17.6k | size_t location_end = line_length - offset; | 272 | 17.6k | size_t to_move = 32 - location_end; | 273 | | | 274 | | // We could do this, or extract instead. | 275 | 17.6k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 276 | 17.6k | _mm256_storeu_si256( | 277 | 17.6k | reinterpret_cast<__m256i *>(out), | 278 | 17.6k | insert_line_feed32(result, static_cast<int>(location_end))); | 279 | | // see above. | 280 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 281 | 17.6k | offset = to_move; | 282 | 17.6k | out += 32 + 1; | 283 | 18.5k | } else { | 284 | 18.5k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 285 | 18.5k | offset += 32; | 286 | 18.5k | out += 32; | 287 | 18.5k | } | 288 | 36.2k | result = lookup_pshufb_improved<isbase64url>(input3); | 289 | | | 290 | 36.2k | if (offset + 32 > line_length) { | 291 | 16.1k | size_t location_end = line_length - offset; | 292 | 16.1k | size_t to_move = 32 - location_end; | 293 | | | 294 | | // We could do this, or extract instead. | 295 | 16.1k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 296 | 16.1k | _mm256_storeu_si256( | 297 | 16.1k | reinterpret_cast<__m256i *>(out), | 298 | 16.1k | insert_line_feed32(result, static_cast<int>(location_end))); | 299 | | // see above. 
| 300 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 301 | 16.1k | offset = to_move; | 302 | 16.1k | out += 32 + 1; | 303 | 20.0k | } else { | 304 | 20.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 305 | 20.0k | offset += 32; | 306 | 20.0k | out += 32; | 307 | 20.0k | } | 308 | 229k | } else { // slow path | 309 | | // could be optimized | 310 | 229k | uint8_t buffer[128]; | 311 | 229k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 312 | 229k | lookup_pshufb_improved<isbase64url>(input0)); | 313 | 229k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), | 314 | 229k | lookup_pshufb_improved<isbase64url>(input1)); | 315 | 229k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), | 316 | 229k | lookup_pshufb_improved<isbase64url>(input2)); | 317 | 229k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), | 318 | 229k | lookup_pshufb_improved<isbase64url>(input3)); | 319 | 229k | size_t out_pos = 0; | 320 | 229k | size_t local_offset = offset; | 321 | 29.6M | for (size_t j = 0; j < 128;) { | 322 | 29.3M | if (local_offset == line_length) { | 323 | 6.86M | out[out_pos++] = '\n'; | 324 | 6.86M | local_offset = 0; | 325 | 6.86M | } | 326 | 29.3M | out[out_pos++] = buffer[j++]; | 327 | 29.3M | local_offset++; | 328 | 29.3M | } | 329 | 229k | offset = local_offset; | 330 | 229k | out += out_pos; | 331 | 229k | } | 332 | 265k | } else { | 333 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 334 | 0 | lookup_pshufb_improved<isbase64url>(input0)); | 335 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), | 336 | 0 | lookup_pshufb_improved<isbase64url>(input1)); | 337 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), | 338 | 0 | lookup_pshufb_improved<isbase64url>(input2)); | 339 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), | 340 | 0 | lookup_pshufb_improved<isbase64url>(input3)); | 341 | |
| 342 | 0 | out += 128; | 343 | 0 | } | 344 | 265k | } | 345 | 1.14k | for (; i + 28 <= srclen; i += 24) { | 346 | | // lo = [xxxx|DDDC|CCBB|BAAA] | 347 | | // hi = [xxxx|HHHG|GGFF|FEEE] | 348 | 565 | const __m128i lo = | 349 | 565 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); | 350 | 565 | const __m128i hi = | 351 | 565 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); | 352 | | | 353 | | // bytes from groups A, B and C are needed in separate 32-bit lanes | 354 | | // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] | 355 | 565 | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); | 356 | | | 357 | | // this part is well commented in encode.sse.cpp | 358 | | | 359 | 565 | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); | 360 | 565 | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); | 361 | 565 | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); | 362 | 565 | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); | 363 | 565 | const __m256i indices = _mm256_or_si256(t1, t3); | 364 | | | 365 | 565 | if (use_lines) { | 366 | 565 | if (line_length >= 32) { // fast path | 367 | 310 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 368 | 310 | lookup_pshufb_improved<isbase64url>(indices)); | 369 | | | 370 | 310 | if (offset + 32 > line_length) { | 371 | 103 | size_t location_end = line_length - offset; | 372 | 103 | size_t to_move = 32 - location_end; | 373 | 103 | std::memmove(out + location_end + 1, out + location_end, to_move); | 374 | 103 | out[location_end] = '\n'; | 375 | 103 | offset = to_move; | 376 | 103 | out += 32 + 1; | 377 | 207 | } else { | 378 | 207 | offset += 32; | 379 | 207 | out += 32; | 380 | 207 | } | 381 | 310 | } else { // slow path | 382 | | // could be optimized | 383 | 255 | alignas(32) uint8_t buffer[32]; | 384 | 255 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 385 | 255 | 
lookup_pshufb_improved<isbase64url>(indices)); | 386 | 255 | std::memcpy(out, buffer, 32); | 387 | 255 | size_t out_pos = 0; | 388 | 255 | size_t local_offset = offset; | 389 | 8.41k | for (size_t j = 0; j < 32;) { | 390 | 8.16k | if (local_offset == line_length) { | 391 | 1.49k | out[out_pos++] = '\n'; | 392 | 1.49k | local_offset = 0; | 393 | 1.49k | } | 394 | 8.16k | out[out_pos++] = buffer[j++]; | 395 | 8.16k | local_offset++; | 396 | 8.16k | } | 397 | 255 | offset = local_offset; | 398 | 255 | out += out_pos; | 399 | 255 | } | 400 | 565 | } else { | 401 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 402 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 403 | |
| 404 | 0 | out += 32; | 405 | 0 | } | 406 | 565 | } | 407 | 579 | return ((char *)out - (char *)dst) + | 408 | 579 | scalar::base64::tail_encode_base64_impl<use_lines>( | 409 | 579 | (char *)out, src + i, srclen - i, options, line_length, offset); | 410 | 579 | } |
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::avx2_encode_base64_impl<false, true>(char*, char const*, unsigned long, simdutf::base64_options, unsigned long) Line | Count | Source | 156 | 564 | size_t line_length = simdutf::default_line_length) { | 157 | 564 | size_t offset = 0; | 158 | | | 159 | 564 | if (line_length < 4) { | 160 | 0 | line_length = 4; // We do not support line_length less than 4 | 161 | 0 | } | 162 | | // credit: Wojciech Muła | 163 | 564 | const uint8_t *input = (const uint8_t *)src; | 164 | | | 165 | 564 | uint8_t *out = (uint8_t *)dst; | 166 | 564 | const __m256i shuf = | 167 | 564 | _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, | 168 | | | 169 | 564 | 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); | 170 | 564 | size_t i = 0; | 171 | 426k | for (; i + 100 <= srclen; i += 96) { | 172 | 425k | const __m128i lo0 = _mm_loadu_si128( | 173 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); | 174 | 425k | const __m128i hi0 = _mm_loadu_si128( | 175 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); | 176 | 425k | const __m128i lo1 = _mm_loadu_si128( | 177 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); | 178 | 425k | const __m128i hi1 = _mm_loadu_si128( | 179 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); | 180 | 425k | const __m128i lo2 = _mm_loadu_si128( | 181 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); | 182 | 425k | const __m128i hi2 = _mm_loadu_si128( | 183 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); | 184 | 425k | const __m128i lo3 = _mm_loadu_si128( | 185 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); | 186 | 425k | const __m128i hi3 = _mm_loadu_si128( | 187 | 425k | reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); | 188 | | | 189 | 425k | __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); | 190 | 425k | __m256i in1 = 
_mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); | 191 | 425k | __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); | 192 | 425k | __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); | 193 | | | 194 | 425k | const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); | 195 | 425k | const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); | 196 | 425k | const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); | 197 | 425k | const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); | 198 | | | 199 | 425k | const __m256i t1_0 = | 200 | 425k | _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); | 201 | 425k | const __m256i t1_1 = | 202 | 425k | _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); | 203 | 425k | const __m256i t1_2 = | 204 | 425k | _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); | 205 | 425k | const __m256i t1_3 = | 206 | 425k | _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); | 207 | | | 208 | 425k | const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); | 209 | 425k | const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); | 210 | 425k | const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); | 211 | 425k | const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); | 212 | | | 213 | 425k | const __m256i t3_0 = | 214 | 425k | _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); | 215 | 425k | const __m256i t3_1 = | 216 | 425k | _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); | 217 | 425k | const __m256i t3_2 = | 218 | 425k | _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); | 219 | 425k | const __m256i t3_3 = | 220 | 425k | _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); | 221 | | | 222 | 425k | const __m256i input0 = _mm256_or_si256(t1_0, t3_0); | 223 | 425k | const __m256i input1 = _mm256_or_si256(t1_1, t3_1); | 224 | 
425k | const __m256i input2 = _mm256_or_si256(t1_2, t3_2); | 225 | 425k | const __m256i input3 = _mm256_or_si256(t1_3, t3_3); | 226 | | | 227 | 425k | if (use_lines) { | 228 | 425k | if (line_length >= 32) { // fast path | 229 | 77.2k | __m256i result; | 230 | 77.2k | result = lookup_pshufb_improved<isbase64url>(input0); | 231 | 77.2k | if (offset + 32 > line_length) { | 232 | 35.1k | size_t location_end = line_length - offset; | 233 | 35.1k | size_t to_move = 32 - location_end; | 234 | | // We could do this, or extract instead. | 235 | 35.1k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 236 | 35.1k | _mm256_storeu_si256( | 237 | 35.1k | reinterpret_cast<__m256i *>(out), | 238 | 35.1k | insert_line_feed32(result, static_cast<int>(location_end))); | 239 | 35.1k | offset = to_move; | 240 | 35.1k | out += 32 + 1; | 241 | 42.0k | } else { | 242 | 42.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 243 | 42.0k | offset += 32; | 244 | 42.0k | out += 32; | 245 | 42.0k | } | 246 | 77.2k | result = lookup_pshufb_improved<isbase64url>(input1); | 247 | | | 248 | 77.2k | if (offset + 32 > line_length) { | 249 | 34.1k | size_t location_end = line_length - offset; | 250 | 34.1k | size_t to_move = 32 - location_end; | 251 | | | 252 | | // We could do this, or extract instead. | 253 | 34.1k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 254 | 34.1k | _mm256_storeu_si256( | 255 | 34.1k | reinterpret_cast<__m256i *>(out), | 256 | 34.1k | insert_line_feed32(result, static_cast<int>(location_end))); | 257 | | // see above. 
| 258 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 259 | 34.1k | offset = to_move; | 260 | 34.1k | out += 32 + 1; | 261 | 43.0k | } else { | 262 | | | 263 | 43.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 264 | | | 265 | 43.0k | offset += 32; | 266 | 43.0k | out += 32; | 267 | 43.0k | } | 268 | 77.2k | result = lookup_pshufb_improved<isbase64url>(input2); | 269 | | | 270 | 77.2k | if (offset + 32 > line_length) { | 271 | 35.0k | size_t location_end = line_length - offset; | 272 | 35.0k | size_t to_move = 32 - location_end; | 273 | | | 274 | | // We could do this, or extract instead. | 275 | 35.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 276 | 35.0k | _mm256_storeu_si256( | 277 | 35.0k | reinterpret_cast<__m256i *>(out), | 278 | 35.0k | insert_line_feed32(result, static_cast<int>(location_end))); | 279 | | // see above. | 280 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 281 | 35.0k | offset = to_move; | 282 | 35.0k | out += 32 + 1; | 283 | 42.1k | } else { | 284 | 42.1k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 285 | 42.1k | offset += 32; | 286 | 42.1k | out += 32; | 287 | 42.1k | } | 288 | 77.2k | result = lookup_pshufb_improved<isbase64url>(input3); | 289 | | | 290 | 77.2k | if (offset + 32 > line_length) { | 291 | 34.2k | size_t location_end = line_length - offset; | 292 | 34.2k | size_t to_move = 32 - location_end; | 293 | | | 294 | | // We could do this, or extract instead. | 295 | 34.2k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result); | 296 | 34.2k | _mm256_storeu_si256( | 297 | 34.2k | reinterpret_cast<__m256i *>(out), | 298 | 34.2k | insert_line_feed32(result, static_cast<int>(location_end))); | 299 | | // see above. 
| 300 | | // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31)); | 301 | 34.2k | offset = to_move; | 302 | 34.2k | out += 32 + 1; | 303 | 43.0k | } else { | 304 | 43.0k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result); | 305 | 43.0k | offset += 32; | 306 | 43.0k | out += 32; | 307 | 43.0k | } | 308 | 348k | } else { // slow path | 309 | | // could be optimized | 310 | 348k | uint8_t buffer[128]; | 311 | 348k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 312 | 348k | lookup_pshufb_improved<isbase64url>(input0)); | 313 | 348k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32), | 314 | 348k | lookup_pshufb_improved<isbase64url>(input1)); | 315 | 348k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64), | 316 | 348k | lookup_pshufb_improved<isbase64url>(input2)); | 317 | 348k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96), | 318 | 348k | lookup_pshufb_improved<isbase64url>(input3)); | 319 | 348k | size_t out_pos = 0; | 320 | 348k | size_t local_offset = offset; | 321 | 44.9M | for (size_t j = 0; j < 128;) { | 322 | 44.5M | if (local_offset == line_length) { | 323 | 10.4M | out[out_pos++] = '\n'; | 324 | 10.4M | local_offset = 0; | 325 | 10.4M | } | 326 | 44.5M | out[out_pos++] = buffer[j++]; | 327 | 44.5M | local_offset++; | 328 | 44.5M | } | 329 | 348k | offset = local_offset; | 330 | 348k | out += out_pos; | 331 | 348k | } | 332 | 425k | } else { | 333 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 334 | 0 | lookup_pshufb_improved<isbase64url>(input0)); | 335 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), | 336 | 0 | lookup_pshufb_improved<isbase64url>(input1)); | 337 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64), | 338 | 0 | lookup_pshufb_improved<isbase64url>(input2)); | 339 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96), | 340 | 0 | lookup_pshufb_improved<isbase64url>(input3)); | 341 | |
| 342 | 0 | out += 128; | 343 | 0 | } | 344 | 425k | } | 345 | 1.11k | for (; i + 28 <= srclen; i += 24) { | 346 | | // lo = [xxxx|DDDC|CCBB|BAAA] | 347 | | // hi = [xxxx|HHHG|GGFF|FEEE] | 348 | 554 | const __m128i lo = | 349 | 554 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); | 350 | 554 | const __m128i hi = | 351 | 554 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); | 352 | | | 353 | | // bytes from groups A, B and C are needed in separate 32-bit lanes | 354 | | // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] | 355 | 554 | __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); | 356 | | | 357 | | // this part is well commented in encode.sse.cpp | 358 | | | 359 | 554 | const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); | 360 | 554 | const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); | 361 | 554 | const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); | 362 | 554 | const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); | 363 | 554 | const __m256i indices = _mm256_or_si256(t1, t3); | 364 | | | 365 | 554 | if (use_lines) { | 366 | 554 | if (line_length >= 32) { // fast path | 367 | 261 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 368 | 261 | lookup_pshufb_improved<isbase64url>(indices)); | 369 | | | 370 | 261 | if (offset + 32 > line_length) { | 371 | 81 | size_t location_end = line_length - offset; | 372 | 81 | size_t to_move = 32 - location_end; | 373 | 81 | std::memmove(out + location_end + 1, out + location_end, to_move); | 374 | 81 | out[location_end] = '\n'; | 375 | 81 | offset = to_move; | 376 | 81 | out += 32 + 1; | 377 | 180 | } else { | 378 | 180 | offset += 32; | 379 | 180 | out += 32; | 380 | 180 | } | 381 | 293 | } else { // slow path | 382 | | // could be optimized | 383 | 293 | alignas(32) uint8_t buffer[32]; | 384 | 293 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer), | 385 | 293 | 
lookup_pshufb_improved<isbase64url>(indices)); | 386 | 293 | std::memcpy(out, buffer, 32); | 387 | 293 | size_t out_pos = 0; | 388 | 293 | size_t local_offset = offset; | 389 | 9.66k | for (size_t j = 0; j < 32;) { | 390 | 9.37k | if (local_offset == line_length) { | 391 | 1.84k | out[out_pos++] = '\n'; | 392 | 1.84k | local_offset = 0; | 393 | 1.84k | } | 394 | 9.37k | out[out_pos++] = buffer[j++]; | 395 | 9.37k | local_offset++; | 396 | 9.37k | } | 397 | 293 | offset = local_offset; | 398 | 293 | out += out_pos; | 399 | 293 | } | 400 | 554 | } else { | 401 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), | 402 | 0 | lookup_pshufb_improved<isbase64url>(indices)); | 403 | |
| 404 | 0 | out += 32; | 405 | 0 | } | 406 | 554 | } | 407 | 564 | return ((char *)out - (char *)dst) + | 408 | 564 | scalar::base64::tail_encode_base64_impl<use_lines>( | 409 | 564 | (char *)out, src + i, srclen - i, options, line_length, offset); | 410 | 564 | } |
|
411 | | |
412 | | template <bool isbase64url> |
413 | | size_t encode_base64(char *dst, const char *src, size_t srclen, |
414 | 15.8k | base64_options options) { |
415 | 15.8k | return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options); |
416 | 15.8k | } simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<true>(char*, char const*, unsigned long, simdutf::base64_options) Line | Count | Source | 414 | 579 | base64_options options) { | 415 | 579 | return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options); | 416 | 579 | } |
simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::encode_base64<false>(char*, char const*, unsigned long, simdutf::base64_options) Line | Count | Source | 414 | 15.2k | base64_options options) { | 415 | 15.2k | return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options); | 416 | 15.2k | } |
|
417 | | |
418 | 380k | static inline void compress(__m128i data, uint16_t mask, char *output) { |
419 | 380k | if (mask == 0) { |
420 | 22.7k | _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); |
421 | 22.7k | return; |
422 | 22.7k | } |
423 | | // this particular implementation was inspired by work done by @animetosho |
424 | | // we do it in two steps, first 8 bytes and then second 8 bytes |
425 | 358k | uint8_t mask1 = uint8_t(mask); // least significant 8 bits |
426 | 358k | uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits |
427 | | // next line just loads the 64-bit values thintable_epi8[mask1] and |
428 | | // thintable_epi8[mask2] into a 128-bit register, using only |
429 | | // two instructions on most compilers. |
430 | | |
431 | 358k | __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], |
432 | 358k | tables::base64::thintable_epi8[mask1]); |
433 | | // we increment by 0x08 the second half of the mask |
434 | 358k | shufmask = |
435 | 358k | _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); |
436 | | // this is the version "nearly pruned" |
437 | 358k | __m128i pruned = _mm_shuffle_epi8(data, shufmask); |
438 | | // we still need to put the two halves together. |
439 | | // we compute the popcount of the first half: |
440 | 358k | int pop1 = tables::base64::BitsSetTable256mul2[mask1]; |
441 | | // then load the corresponding mask, what it does is to write |
442 | | // only the first pop1 bytes from the first 8 bytes, and then |
443 | | // it fills in with the bytes from the second 8 bytes + some filling |
444 | | // at the end. |
445 | 358k | __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>( |
446 | 358k | tables::base64::pshufb_combine_table + pop1 * 8)); |
447 | 358k | __m128i answer = _mm_shuffle_epi8(pruned, compactmask); |
448 | | |
449 | 358k | _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); |
450 | 358k | } |
451 | | |
452 | | // --- decoding ----------------------------------------------- |
453 | | |
454 | | template <typename = void> |
455 | 199k | simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) { |
456 | 199k | if (mask == 0) { |
457 | 8.73k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); |
458 | 8.73k | return; |
459 | 8.73k | } |
460 | 190k | compress(_mm256_castsi256_si128(data), uint16_t(mask), output); |
461 | 190k | compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), |
462 | 190k | output + count_ones(~mask & 0xFFFF)); |
463 | 190k | } |
464 | | |
465 | | template <typename = void> |
466 | 2.91M | simdutf_really_inline void base64_decode(char *out, __m256i str) { |
467 | | // credit: aqrit |
468 | 2.91M | const __m256i pack_shuffle = |
469 | 2.91M | _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, |
470 | 2.91M | 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); |
471 | 2.91M | const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); |
472 | 2.91M | const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); |
473 | 2.91M | const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); |
474 | | |
475 | | // Store the output: |
476 | 2.91M | _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); |
477 | 2.91M | _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); |
478 | 2.91M | } |
479 | | |
480 | | template <typename = void> |
481 | 64.5k | simdutf_really_inline void base64_decode_block(char *out, const char *src) { |
482 | 64.5k | base64_decode(out, |
483 | 64.5k | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src))); |
484 | 64.5k | base64_decode(out + 24, _mm256_loadu_si256( |
485 | 64.5k | reinterpret_cast<const __m256i *>(src + 32))); |
486 | 64.5k | } |
487 | | |
488 | | template <typename = void> |
489 | | simdutf_really_inline void base64_decode_block_safe(char *out, |
490 | 120 | const char *src) { |
491 | 120 | base64_decode(out, |
492 | 120 | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src))); |
493 | 120 | alignas(32) char buffer[32]; // We enforce safety with a buffer. |
494 | 120 | base64_decode( |
495 | 120 | buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32))); |
496 | 120 | std::memcpy(out + 24, buffer, 24); |
497 | 120 | } |
498 | | |
499 | | // --- decoding - base64 class -------------------------------- |
500 | | |
501 | | class block64 { |
502 | | __m256i chunks[2]; |
503 | | |
504 | | public: |
505 | | // The caller of this function is responsible to ensure that there are 64 |
506 | | // bytes available from reading at src. |
507 | 1.49M | simdutf_really_inline block64(const char *src) { |
508 | 1.49M | chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); |
509 | 1.49M | chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)); |
510 | 1.49M | } |
511 | | |
512 | | // The caller of this function is responsible to ensure that there are 128 |
513 | | // bytes available from reading at src. |
514 | 50.6k | simdutf_really_inline block64(const char16_t *src) { |
515 | 50.6k | const auto m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); |
516 | 50.6k | const auto m2 = |
517 | 50.6k | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16)); |
518 | 50.6k | const auto m3 = |
519 | 50.6k | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)); |
520 | 50.6k | const auto m4 = |
521 | 50.6k | _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48)); |
522 | | |
523 | 50.6k | const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20); |
524 | 50.6k | const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31); |
525 | 50.6k | const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20); |
526 | 50.6k | const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31); |
527 | | |
528 | 50.6k | chunks[0] = _mm256_packus_epi16(m1p, m2p); |
529 | 50.6k | chunks[1] = _mm256_packus_epi16(m3p, m4p); |
530 | 50.6k | } |
531 | | |
532 | 16.3k | simdutf_really_inline void copy_block(char *output) { |
533 | 16.3k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]); |
534 | 16.3k | _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]); |
535 | 16.3k | } |
536 | | |
537 | | // decode 64 bytes and output 48 bytes |
538 | 1.39M | simdutf_really_inline void base64_decode_block(char *out) { |
539 | 1.39M | base64_decode(out, chunks[0]); |
540 | 1.39M | base64_decode(out + 24, chunks[1]); |
541 | 1.39M | } |
542 | | |
543 | 1.33k | simdutf_really_inline void base64_decode_block_safe(char *out) { |
544 | 1.33k | base64_decode(out, chunks[0]); |
545 | 1.33k | alignas(32) char buffer[32]; // We enforce safety with a buffer. |
546 | 1.33k | base64_decode(buffer, chunks[1]); |
547 | 1.33k | std::memcpy(out + 24, buffer, 24); |
548 | 1.33k | } |
549 | | |
550 | | template <bool base64_url, bool ignore_garbage, bool default_or_url> |
551 | 1.54M | simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { |
552 | 1.54M | uint32_t err0 = 0; |
553 | 1.54M | uint32_t err1 = 0; |
554 | 1.54M | uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( |
555 | 1.54M | &chunks[0], &err0); |
556 | 1.54M | uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( |
557 | 1.54M | &chunks[1], &err1); |
558 | 1.54M | if (!ignore_garbage) { |
559 | 1.54M | *error = err0 | ((uint64_t)err1 << 32); |
560 | 1.54M | } |
561 | 1.54M | return m0 | (m1 << 32); |
562 | 1.54M | } Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(unsigned long*) Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(unsigned long*) Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(unsigned long*) simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(unsigned long*) Line | Count | Source | 551 | 593k | simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { | 552 | 593k | uint32_t err0 = 0; | 553 | 593k | uint32_t err1 = 0; | 554 | 593k | uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( | 555 | 593k | &chunks[0], &err0); | 556 | 593k | uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( | 557 | 593k | &chunks[1], &err1); | 558 | 593k | if (!ignore_garbage) { | 559 | 593k | *error = err0 | ((uint64_t)err1 << 32); | 560 | 593k | } | 561 | 593k | return m0 | (m1 << 32); | 562 | 593k | } |
Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, false>(unsigned long*) simdutf.cpp:unsigned long simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, false>(unsigned long*) Line | Count | Source | 551 | 951k | simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { | 552 | 951k | uint32_t err0 = 0; | 553 | 951k | uint32_t err1 = 0; | 554 | 951k | uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( | 555 | 951k | &chunks[0], &err0); | 556 | 951k | uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>( | 557 | 951k | &chunks[1], &err1); | 558 | 951k | if (!ignore_garbage) { | 559 | 951k | *error = err0 | ((uint64_t)err1 << 32); | 560 | 951k | } | 561 | 951k | return m0 | (m1 << 32); | 562 | 951k | } |
|
563 | | |
564 | | template <bool base64_url, bool ignore_garbage, bool default_or_url> |
565 | 3.09M | simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { |
566 | 3.09M | const __m256i ascii_space_tbl = |
567 | 3.09M | _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, |
568 | 3.09M | 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, |
569 | 3.09M | 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); |
570 | | // credit: aqrit |
571 | 3.09M | __m256i delta_asso; |
572 | 3.09M | if (default_or_url) { |
573 | 0 | delta_asso = _mm256_setr_epi8( |
574 | 0 | 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, |
575 | 0 | 0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, |
576 | 0 | 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); |
577 | 3.09M | } else if (base64_url) { |
578 | 1.18M | delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, |
579 | 1.18M | 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, |
580 | 1.18M | 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, |
581 | 1.18M | 0x0, 0x0, 0xF, 0x0, 0xF); |
582 | 1.90M | } else { |
583 | 1.90M | delta_asso = _mm256_setr_epi8( |
584 | 1.90M | 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, |
585 | 1.90M | 0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, |
586 | 1.90M | 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); |
587 | 1.90M | } |
588 | | |
589 | 3.09M | __m256i delta_values; |
590 | 3.09M | if (default_or_url) { |
591 | 0 | delta_values = _mm256_setr_epi8( |
592 | 0 | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), |
593 | 0 | uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), |
594 | 0 | uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), |
595 | 0 | uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9), |
596 | 0 | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), |
597 | 0 | uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), |
598 | 0 | uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), |
599 | 0 | uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9)); |
600 | 3.09M | } else if (base64_url) { |
601 | 1.18M | delta_values = _mm256_setr_epi8( |
602 | 1.18M | 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), |
603 | 1.18M | uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), |
604 | 1.18M | uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), |
605 | 1.18M | uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), |
606 | 1.18M | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); |
607 | 1.90M | } else { |
608 | 1.90M | delta_values = _mm256_setr_epi8( |
609 | 1.90M | int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), |
610 | 1.90M | int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), |
611 | 1.90M | int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), |
612 | 1.90M | int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), |
613 | 1.90M | int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), |
614 | 1.90M | int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), |
615 | 1.90M | int8_t(0xB9), int8_t(0xB9)); |
616 | 1.90M | } |
617 | | |
618 | 3.09M | __m256i check_asso; |
619 | 3.09M | if (default_or_url) { |
620 | 0 | check_asso = _mm256_setr_epi8( |
621 | 0 | 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, |
622 | 0 | 0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, |
623 | 0 | 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06); |
624 | 3.09M | } else if (base64_url) { |
625 | 1.18M | check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, |
626 | 1.18M | 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, |
627 | 1.18M | 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, |
628 | 1.18M | 0x7, 0xB, 0xE, 0xB, 0x6); |
629 | 1.90M | } else { |
630 | 1.90M | check_asso = _mm256_setr_epi8( |
631 | 1.90M | 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, |
632 | 1.90M | 0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, |
633 | 1.90M | 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); |
634 | 1.90M | } |
635 | 3.09M | __m256i check_values; |
636 | 3.09M | if (default_or_url) { |
637 | 0 | check_values = _mm256_setr_epi8( |
638 | 0 | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), |
639 | 0 | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), |
640 | 0 | uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), |
641 | 0 | uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80), |
642 | 0 | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), |
643 | 0 | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), |
644 | 0 | uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), |
645 | 0 | uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80)); |
646 | 3.09M | } else if (base64_url) { |
647 | 1.18M | check_values = _mm256_setr_epi8( |
648 | 1.18M | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), |
649 | 1.18M | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), |
650 | 1.18M | uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), |
651 | 1.18M | 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), |
652 | 1.18M | uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), |
653 | 1.18M | uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, |
654 | 1.18M | uint8_t(0x80), 0x0, uint8_t(0x80)); |
655 | 1.90M | } else { |
656 | 1.90M | check_values = _mm256_setr_epi8( |
657 | 1.90M | int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), |
658 | 1.90M | int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), |
659 | 1.90M | int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), |
660 | 1.90M | int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), |
661 | 1.90M | int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), |
662 | 1.90M | int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), |
663 | 1.90M | int8_t(0x91), int8_t(0x80)); |
664 | 1.90M | } |
665 | 3.09M | const __m256i shifted = _mm256_srli_epi32(*src, 3); |
666 | 3.09M | __m256i delta_hash = |
667 | 3.09M | _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); |
668 | 3.09M | if (default_or_url) { |
669 | 0 | delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf)); |
670 | 0 | } |
671 | 3.09M | const __m256i check_hash = |
672 | 3.09M | _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); |
673 | 3.09M | const __m256i out = |
674 | 3.09M | _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); |
675 | 3.09M | const __m256i chk = |
676 | 3.09M | _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); |
677 | 3.09M | const int mask = _mm256_movemask_epi8(chk); |
678 | 3.09M | if (!ignore_garbage && mask) { |
679 | 226k | __m256i ascii_space = |
680 | 226k | _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); |
681 | 226k | *error = (mask ^ _mm256_movemask_epi8(ascii_space)); |
682 | 226k | } |
683 | 3.09M | *src = out; |
684 | 3.09M | return (uint32_t)mask; |
685 | 3.09M | } Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, true, true>(long long __vector(4)*, unsigned int*) Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<false, false, true>(long long __vector(4)*, unsigned int*) Unexecuted instantiation: simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, true, false>(long long __vector(4)*, unsigned int*) simdutf.cpp:unsigned int simdutf::haswell::(anonymous namespace)::block64::to_base64_mask<true, false, false>(long long __vector(4)*, unsigned int*) Line | Count | Source | 565 | 1.18M | simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { | 566 | 1.18M | const __m256i ascii_space_tbl = | 567 | 1.18M | _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, | 568 | 1.18M | 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, | 569 | 1.18M | 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); | 570 | | // credit: aqrit | 571 | 1.18M | __m256i delta_asso; | 572 | 1.18M | if (default_or_url) { | 573 | 0 | delta_asso = _mm256_setr_epi8( | 574 | 0 | 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, | 575 | 0 | 0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, | 576 | 0 | 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); | 577 | 1.18M | } else if (base64_url) { | 578 | 1.18M | delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, | 579 | 1.18M | 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, | 580 | 1.18M | 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, | 581 | 1.18M | 0x0, 0x0, 0xF, 0x0, 0xF); | 582 | 1.18M | } else { | 583 | 0 | delta_asso = _mm256_setr_epi8( | 584 | 0 | 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, | 585 | 0 | 0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, | 586 | 0 | 0x01, 0x01, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); | 587 | 0 | } | 588 | | | 589 | 1.18M | __m256i delta_values; | 590 | 1.18M | if (default_or_url) { | 591 | 0 | delta_values = _mm256_setr_epi8( | 592 | 0 | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), | 593 | 0 | uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), | 594 | 0 | uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), | 595 | 0 | uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9), | 596 | 0 | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), | 597 | 0 | uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), | 598 | 0 | uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), | 599 | 0 | uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9)); | 600 | 1.18M | } else if (base64_url) { | 601 | 1.18M | delta_values = _mm256_setr_epi8( | 602 | 1.18M | 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), | 603 | 1.18M | uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), | 604 | 1.18M | uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), | 605 | 1.18M | uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), | 606 | 1.18M | uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); | 607 | 1.18M | } else { | 608 | 0 | delta_values = _mm256_setr_epi8( | 609 | 0 | int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), | 610 | 0 | int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), | 611 | 0 | int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), | 612 | 0 | int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), | 613 | 0 | int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), | 614 | 0 | int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), | 615 | 0 | int8_t(0xB9), int8_t(0xB9)); | 616 | 0 | } | 617 | | | 618 | 1.18M | __m256i check_asso; | 619 | 1.18M | if (default_or_url) { | 620 | 0 | 
check_asso = _mm256_setr_epi8( | 621 | 0 | 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, | 622 | 0 | 0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, | 623 | 0 | 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06); | 624 | 1.18M | } else if (base64_url) { | 625 | 1.18M | check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, | 626 | 1.18M | 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, | 627 | 1.18M | 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, | 628 | 1.18M | 0x7, 0xB, 0xE, 0xB, 0x6); | 629 | 1.18M | } else { | 630 | 0 | check_asso = _mm256_setr_epi8( | 631 | 0 | 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, | 632 | 0 | 0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, | 633 | 0 | 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); | 634 | 0 | } | 635 | 1.18M | __m256i check_values; | 636 | 1.18M | if (default_or_url) { | 637 | 0 | check_values = _mm256_setr_epi8( | 638 | 0 | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), | 639 | 0 | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), | 640 | 0 | uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), | 641 | 0 | uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80), | 642 | 0 | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), | 643 | 0 | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), | 644 | 0 | uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), | 645 | 0 | uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80)); | 646 | 1.18M | } else if (base64_url) { | 647 | 1.18M | check_values = _mm256_setr_epi8( | 648 | 1.18M | uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), | 649 | 1.18M | uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), | 650 | 1.18M | uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), | 651 | 1.18M | 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), | 652 | 1.18M | 
uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), | 653 | 1.18M | uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, | 654 | 1.18M | uint8_t(0x80), 0x0, uint8_t(0x80)); | 655 | 1.18M | } else { | 656 | 0 | check_values = _mm256_setr_epi8( | 657 | 0 | int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), | 658 | 0 | int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), | 659 | 0 | int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), | 660 | 0 | int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), | 661 | 0 | int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), | 662 | 0 | int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), | 663 | 0 | int8_t(0x91), int8_t(0x80)); | 664 | 0 | } | 665 | 1.18M | const __m256i shifted = _mm256_srli_epi32(*src, 3); | 666 | 1.18M | __m256i delta_hash = | 667 | 1.18M | _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); | 668 | 1.18M | if (default_or_url) { | 669 | 0 | delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf)); | 670 | 0 | } | 671 | 1.18M | const __m256i check_hash = | 672 | 1.18M | _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); | 673 | 1.18M | const __m256i out = | 674 | 1.18M | _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); | 675 | 1.18M | const __m256i chk = | 676 | 1.18M | _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); | 677 | 1.18M | const int mask = _mm256_movemask_epi8(chk); | 678 | 1.18M | if (!ignore_garbage && mask) { | 679 | 85.5k | __m256i ascii_space = | 680 | 85.5k | _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); | 681 | 85.5k | *error = (mask ^ _mm256_movemask_epi8(ascii_space)); | 682 | 85.5k | } | 683 | 1.18M | *src = out; | 684 | 1.18M | return (uint32_t)mask; | 685 | 1.18M | } |
// Map 32 base64 ASCII bytes in *src to their 6-bit values and report which
// bytes are not part of the selected alphabet.
//
// Template flags (declared on the enclosing block64, outside this view —
// TODO confirm exact declaration order):
//   base64_url     — use the URL-safe alphabet ('-'/'_').
//   default_or_url — accept both the standard and URL-safe alphabets.
//   ignore_garbage — skip the whitespace/error classification step.
//
// Returns a 32-bit mask, one bit per input byte, set for bytes that fail the
// alphabet check (this includes whitespace). When !ignore_garbage and the
// mask is non-zero, *error receives the mask with the bits for ASCII
// whitespace cleared — presumably so the caller can tolerate spaces while
// rejecting hard errors (confirm against caller). On return *src holds the
// translated values.
simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
  // ASCII whitespace (0x20, \t, \n, \f, \r) keyed by low nibble; used below
  // to separate whitespace from hard errors.
  const __m256i ascii_space_tbl =
      _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                       0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
                       0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
  // credit: aqrit
  // delta_asso: per-low-nibble association constants that, averaged with
  // (byte >> 3), hash each byte into an index for delta_values below.
  __m256i delta_asso;
  if (default_or_url) { // accept both alphabets
    delta_asso = _mm256_setr_epi8(
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
  } else if (base64_url) { // URL-safe alphabet only
    delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
                                  0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
                                  0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
                                  0x0, 0x0, 0xF, 0x0, 0xF);
  } else { // standard alphabet only
    delta_asso = _mm256_setr_epi8(
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
  }

  // delta_values: per-character-class offsets added (saturating) to the
  // input byte to produce its 6-bit code.
  __m256i delta_values;
  if (default_or_url) {
    delta_values = _mm256_setr_epi8(
        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
        uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
        uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
        uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
        uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
        uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
        uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
  } else if (base64_url) {
    delta_values = _mm256_setr_epi8(
        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
  } else {
    delta_values = _mm256_setr_epi8(
        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
        int8_t(0xB9), int8_t(0xB9));
  }

  // check_asso / check_values: the same hash scheme, but driving a validity
  // check instead of a translation.
  __m256i check_asso;
  if (default_or_url) {
    check_asso = _mm256_setr_epi8(
        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
        0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
  } else if (base64_url) {
    check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                                  0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
                                  0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
                                  0x7, 0xB, 0xE, 0xB, 0x6);
  } else {
    check_asso = _mm256_setr_epi8(
        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
        0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
  }
  __m256i check_values;
  if (default_or_url) {
    check_values = _mm256_setr_epi8(
        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
        uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
        uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
        uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
        uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
  } else if (base64_url) {
    check_values = _mm256_setr_epi8(
        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
        uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
        uint8_t(0x80), 0x0, uint8_t(0x80));
  } else {
    check_values = _mm256_setr_epi8(
        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
        int8_t(0x91), int8_t(0x80));
  }
  // Hash each byte: pshufb LUT lookup on the byte, averaged with byte >> 3.
  const __m256i shifted = _mm256_srli_epi32(*src, 3);
  __m256i delta_hash =
      _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
  if (default_or_url) {
    // Combined-alphabet tables only use 16 slots; keep the hash in range.
    delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
  }
  const __m256i check_hash =
      _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
  // out: input byte plus its class delta (saturating) — the translated value.
  const __m256i out =
      _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
  // chk: saturating add pushes the sign bit high for out-of-alphabet bytes;
  // movemask gathers one bit per byte.
  const __m256i chk =
      _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
  const int mask = _mm256_movemask_epi8(chk);
  if (!ignore_garbage && mask) {
    // XOR clears the mask bits of bytes that exactly match an ASCII
    // whitespace character, leaving only hard errors in *error.
    __m256i ascii_space =
        _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
    *error = (mask ^ _mm256_movemask_epi8(ascii_space));
  }
  *src = out;
  return (uint32_t)mask;
}
|
686 | | |
687 | 132k | simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) { |
688 | 132k | if (is_power_of_two(mask)) { |
689 | 33.0k | return compress_block_single(mask, output); |
690 | 33.0k | } |
691 | | |
692 | 99.5k | uint64_t nmask = ~mask; |
693 | 99.5k | compress(chunks[0], uint32_t(mask), output); |
694 | 99.5k | compress(chunks[1], uint32_t(mask >> 32), |
695 | 99.5k | output + count_ones(nmask & 0xFFFFFFFF)); |
696 | 99.5k | return count_ones(nmask); |
697 | 132k | } |
698 | | |
699 | | simdutf_really_inline size_t compress_block_single(uint64_t mask, |
700 | 33.0k | char *output) { |
701 | 33.0k | const size_t pos64 = trailing_zeroes(mask); |
702 | 33.0k | const int8_t pos = pos64 & 0xf; |
703 | 33.0k | switch (pos64 >> 4) { |
704 | 7.94k | case 0b00: { |
705 | 7.94k | const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0); |
706 | 7.94k | const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1); |
707 | | |
708 | 7.94k | const __m128i v0 = _mm_set1_epi8(char(pos - 1)); |
709 | 7.94k | const __m128i v1 = |
710 | 7.94k | _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
711 | 7.94k | const __m128i v2 = _mm_cmpgt_epi8(v1, v0); |
712 | 7.94k | const __m128i sh = _mm_sub_epi8(v1, v2); |
713 | 7.94k | const __m128i compressed = _mm_shuffle_epi8(lane0, sh); |
714 | | |
715 | 7.94k | _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed); |
716 | 7.94k | _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1); |
717 | 7.94k | _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]); |
718 | 7.94k | } break; |
719 | 8.71k | case 0b01: { |
720 | 8.71k | const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0); |
721 | 8.71k | const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1); |
722 | 8.71k | _mm_storeu_si128((__m128i *)(output + 0 * 16), lane0); |
723 | | |
724 | 8.71k | const __m128i v0 = _mm_set1_epi8(char(pos - 1)); |
725 | 8.71k | const __m128i v1 = |
726 | 8.71k | _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
727 | 8.71k | const __m128i v2 = _mm_cmpgt_epi8(v1, v0); |
728 | 8.71k | const __m128i sh = _mm_sub_epi8(v1, v2); |
729 | 8.71k | const __m128i compressed = _mm_shuffle_epi8(lane1, sh); |
730 | | |
731 | 8.71k | _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed); |
732 | 8.71k | _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]); |
733 | 8.71k | } break; |
734 | 8.68k | case 0b10: { |
735 | 8.68k | const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0); |
736 | 8.68k | const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1); |
737 | | |
738 | 8.68k | _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]); |
739 | | |
740 | 8.68k | const __m128i v0 = _mm_set1_epi8(char(pos - 1)); |
741 | 8.68k | const __m128i v1 = |
742 | 8.68k | _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
743 | 8.68k | const __m128i v2 = _mm_cmpgt_epi8(v1, v0); |
744 | 8.68k | const __m128i sh = _mm_sub_epi8(v1, v2); |
745 | 8.68k | const __m128i compressed = _mm_shuffle_epi8(lane2, sh); |
746 | | |
747 | 8.68k | _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed); |
748 | 8.68k | _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3); |
749 | 8.68k | } break; |
750 | 7.75k | case 0b11: { |
751 | 7.75k | const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0); |
752 | 7.75k | const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1); |
753 | | |
754 | 7.75k | _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]); |
755 | 7.75k | _mm_storeu_si128((__m128i *)(output + 2 * 16), lane2); |
756 | | |
757 | 7.75k | const __m128i v0 = _mm_set1_epi8(char(pos - 1)); |
758 | 7.75k | const __m128i v1 = |
759 | 7.75k | _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
760 | 7.75k | const __m128i v2 = _mm_cmpgt_epi8(v1, v0); |
761 | 7.75k | const __m128i sh = _mm_sub_epi8(v1, v2); |
762 | 7.75k | const __m128i compressed = _mm_shuffle_epi8(lane3, sh); |
763 | | |
764 | 7.75k | _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed); |
765 | 7.75k | } break; |
766 | 33.0k | } |
767 | | |
768 | 33.0k | return 63; |
769 | 33.0k | } |
770 | | }; |