/src/simdutf/src/icelake/icelake_utf8_common.inl.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Common procedures for both validating and non-validating conversions from |
2 | | // UTF-8. |
// Compile-time tag selecting how a block routine treats its input:
// SIMDUTF_FULL  -- the full 64-byte input block is valid and may be consumed;
// SIMDUTF_TAIL  -- only the first `gap` bytes are valid (end-of-buffer case).
enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL };

// Result pairs carrying (advanced input pointer, advanced output pointer).
using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;
7 | | |
8 | | /* |
9 | | process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 |
10 | | to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) |
11 | | might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which |
12 | | indicates how many input bytes are relevant. |
13 | | |
14 | | Returns true when the result is correct, otherwise it returns false. |
15 | | |
16 | | The provided in and out pointers are advanced according to how many input |
17 | | bytes have been processed, upon success. |
18 | | */ |
19 | | template <block_processing_mode tail, endianness big_endian> |
20 | | simdutf_really_inline bool |
21 | 0 | process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { |
22 | | // constants |
23 | 0 | __m512i mask_identity = _mm512_set_epi8( |
24 | 0 | 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, |
25 | 0 | 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, |
26 | 0 | 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, |
27 | 0 | 8, 7, 6, 5, 4, 3, 2, 1, 0); |
28 | 0 | __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); |
29 | 0 | __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); |
30 | 0 | __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); |
31 | 0 | __m512i mask_dfdfdfdf_tail = _mm512_set_epi64( |
32 | 0 | 0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, |
33 | 0 | 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, |
34 | 0 | 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); |
35 | 0 | __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); |
36 | 0 | __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); |
37 | 0 | __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); |
38 | 0 | __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); |
39 | 0 | __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, |
40 | 0 | 0x0607040502030001, 0x0e0f0c0d0a0b0809, |
41 | 0 | 0x0607040502030001, 0x0e0f0c0d0a0b0809, |
42 | 0 | 0x0607040502030001, 0x0e0f0c0d0a0b0809); |
43 | | // Note that 'tail' is a compile-time constant ! |
44 | 0 | __mmask64 b = |
45 | 0 | (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; |
46 | 0 | __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) |
47 | 0 | : _mm512_maskz_loadu_epi8(b, in); |
48 | 0 | __mmask64 m1 = (tail == SIMDUTF_FULL) |
49 | 0 | ? _mm512_cmplt_epu8_mask(input, mask_80808080) |
50 | 0 | : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); |
51 | 0 | if (_ktestc_mask64_u8(m1, |
52 | 0 | b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII |
53 | | // alternatively, we could do 'if (m1 == b) { ' |
54 | 0 | if (tail == SIMDUTF_FULL) { |
55 | 0 | in += 64; // consumed 64 bytes |
56 | | // we convert a full 64-byte block, writing 128 bytes. |
57 | 0 | __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); |
58 | 0 | if (big_endian) { |
59 | 0 | input1 = _mm512_shuffle_epi8(input1, byteflip); |
60 | 0 | } |
61 | 0 | _mm512_storeu_si512(out, input1); |
62 | 0 | out += 32; |
63 | 0 | __m512i input2 = |
64 | 0 | _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); |
65 | 0 | if (big_endian) { |
66 | 0 | input2 = _mm512_shuffle_epi8(input2, byteflip); |
67 | 0 | } |
68 | 0 | _mm512_storeu_si512(out, input2); |
69 | 0 | out += 32; |
70 | 0 | return true; // we are done |
71 | 0 | } else { |
72 | 0 | in += gap; |
73 | 0 | if (gap <= 32) { |
74 | 0 | __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); |
75 | 0 | if (big_endian) { |
76 | 0 | input1 = _mm512_shuffle_epi8(input1, byteflip); |
77 | 0 | } |
78 | 0 | _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), |
79 | 0 | input1); |
80 | 0 | out += gap; |
81 | 0 | } else { |
82 | 0 | __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); |
83 | 0 | if (big_endian) { |
84 | 0 | input1 = _mm512_shuffle_epi8(input1, byteflip); |
85 | 0 | } |
86 | 0 | _mm512_storeu_si512(out, input1); |
87 | 0 | out += 32; |
88 | 0 | __m512i input2 = |
89 | 0 | _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); |
90 | 0 | if (big_endian) { |
91 | 0 | input2 = _mm512_shuffle_epi8(input2, byteflip); |
92 | 0 | } |
93 | 0 | _mm512_mask_storeu_epi16( |
94 | 0 | out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); |
95 | 0 | out += gap - 32; |
96 | 0 | } |
97 | 0 | return true; // we are done |
98 | 0 | } |
99 | 0 | } |
100 | | // classify characters further |
101 | 0 | __mmask64 m234 = _mm512_cmp_epu8_mask( |
102 | 0 | mask_c0c0c0c0, input, |
103 | 0 | _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte |
104 | 0 | __mmask64 m34 = |
105 | 0 | _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, |
106 | 0 | _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte |
107 | |
|
108 | 0 | __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask( |
109 | 0 | m234, input, mask_c2c2c2c2, |
110 | 0 | _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) |
111 | | // Overlong 2-byte sequence |
112 | 0 | if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { |
113 | | // Overlong 2-byte sequence |
114 | 0 | return false; |
115 | 0 | } |
116 | 0 | if (_ktestz_mask64_u8(m34, m34) == 0) { |
117 | | // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a |
118 | | // 4-byte sequence! |
119 | 0 | __mmask64 m4 = _mm512_cmp_epu8_mask( |
120 | 0 | input, mask_f0f0f0f0, |
121 | 0 | _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) |
122 | |
|
123 | 0 | __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) |
124 | 0 | ? _knot_mask64(m1) |
125 | 0 | : _kand_mask64(_knot_mask64(m1), b); |
126 | |
|
127 | 0 | __mmask64 mp1 = _kshiftli_mask64(m234, 1); |
128 | 0 | __mmask64 mp2 = _kshiftli_mask64(m34, 2); |
129 | | // We could do it as follows... |
130 | | // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit |
131 | | // masks a and b and return 1 if all zeroes but GCC generates better code |
132 | | // when we do: |
133 | 0 | if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and |
134 | | // return 1 if all zeroes |
135 | | // Fast path with 1,2,3 bytes |
136 | 0 | __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes |
137 | 0 | __mmask64 m1234 = _kor_mask64(m1, m234); |
138 | | // mismatched continuation bytes: |
139 | 0 | if (tail == SIMDUTF_FULL) { |
140 | 0 | __mmask64 xnormcm1234 = _kxnor_mask64( |
141 | 0 | mc, |
142 | 0 | m1234); // XNOR of mc and m1234 should be all zero if they differ |
143 | | // the presence of a 1 bit indicates that they overlap. |
144 | | // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return |
145 | | // 1 if all zeroes. |
146 | 0 | if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { |
147 | 0 | return false; |
148 | 0 | } |
149 | 0 | } else { |
150 | 0 | __mmask64 bxorm1234 = _kxor_mask64(b, m1234); |
151 | 0 | if (mc != bxorm1234) { |
152 | 0 | return false; |
153 | 0 | } |
154 | 0 | } |
155 | | // mend: identifying the last bytes of each sequence to be decoded |
156 | 0 | __mmask64 mend = _kshiftri_mask64(m1234, 1); |
157 | 0 | if (tail != SIMDUTF_FULL) { |
158 | 0 | mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); |
159 | 0 | } |
160 | |
|
161 | 0 | __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); |
162 | 0 | __m512i last_and_thirdu16 = |
163 | 0 | _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); |
164 | |
|
165 | 0 | __m512i nonasciitags = _mm512_maskz_mov_epi8( |
166 | 0 | mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 |
167 | 0 | __m512i clearedbytes = _mm512_andnot_si512( |
168 | 0 | nonasciitags, input); // high two bits cleared where not ASCII |
169 | 0 | __m512i lastbytes = _mm512_maskz_permutexvar_epi8( |
170 | 0 | 0x5555555555555555, last_and_thirdu16, |
171 | 0 | clearedbytes); // the last byte of each character |
172 | |
|
173 | 0 | __mmask64 mask_before_non_ascii = _kshiftri_mask64( |
174 | 0 | mask_not_ascii, 1); // bytes that precede non-ASCII bytes |
175 | 0 | __m512i indexofsecondlastbytes = _mm512_add_epi16( |
176 | 0 | mask_ffffffff, last_and_thirdu16); // indices of the second last bytes |
177 | 0 | __m512i beforeasciibytes = |
178 | 0 | _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); |
179 | 0 | __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( |
180 | 0 | 0x5555555555555555, indexofsecondlastbytes, |
181 | 0 | beforeasciibytes); // the second last bytes (of two, three byte seq, |
182 | | // surrogates) |
183 | 0 | secondlastbytes = |
184 | 0 | _mm512_slli_epi16(secondlastbytes, 6); // shifted into position |
185 | |
|
186 | 0 | __m512i indexofthirdlastbytes = _mm512_add_epi16( |
187 | 0 | mask_ffffffff, |
188 | 0 | indexofsecondlastbytes); // indices of the second last bytes |
189 | 0 | __m512i thirdlastbyte = |
190 | 0 | _mm512_maskz_mov_epi8(m34, |
191 | 0 | clearedbytes); // only those that are the third |
192 | | // last byte of a sequence |
193 | 0 | __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( |
194 | 0 | 0x5555555555555555, indexofthirdlastbytes, |
195 | 0 | thirdlastbyte); // the third last bytes (of three byte sequences, hi |
196 | | // surrogate) |
197 | 0 | thirdlastbytes = |
198 | 0 | _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position |
199 | 0 | __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, |
200 | 0 | thirdlastbytes, 254); |
201 | | // the elements of Wout excluding the last element if it happens to be a |
202 | | // high surrogate: |
203 | |
|
204 | 0 | __mmask64 mprocessed = |
205 | 0 | (tail == SIMDUTF_FULL) |
206 | 0 | ? _pdep_u64(0xFFFFFFFF, mend) |
207 | 0 | : _pdep_u64( |
208 | 0 | 0xFFFFFFFF, |
209 | 0 | _kand_mask64( |
210 | 0 | mend, b)); // we adjust mend at the end of the output. |
211 | | |
212 | | // Encodings out of range... |
213 | 0 | { |
214 | | // the location of 3-byte sequence start bytes in the input |
215 | 0 | __mmask64 m3 = m34 & (b ^ m4); |
216 | | // code units in Wout corresponding to 3-byte sequences. |
217 | 0 | __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); |
218 | 0 | __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); |
219 | 0 | __mmask32 Msmall800 = |
220 | 0 | _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); |
221 | 0 | __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); |
222 | 0 | __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); |
223 | 0 | __mmask32 M3s = |
224 | 0 | _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); |
225 | 0 | if (_kor_mask32(Msmall800, M3s)) { |
226 | 0 | return false; |
227 | 0 | } |
228 | 0 | } |
229 | 0 | int64_t nout = _mm_popcnt_u64(mprocessed); |
230 | 0 | in += 64 - _lzcnt_u64(mprocessed); |
231 | 0 | if (big_endian) { |
232 | 0 | Wout = _mm512_shuffle_epi8(Wout, byteflip); |
233 | 0 | } |
234 | 0 | _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); |
235 | 0 | out += nout; |
236 | 0 | return true; // ok |
237 | 0 | } |
238 | | // |
239 | | // We have a 4-byte sequence, this is the general case. |
240 | | // Slow! |
241 | 0 | __mmask64 mp3 = _kshiftli_mask64(m4, 3); |
242 | 0 | __mmask64 mc = |
243 | 0 | _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes |
244 | 0 | __mmask64 m1234 = _kor_mask64(m1, m234); |
245 | | |
246 | | // mend: identifying the last bytes of each sequence to be decoded |
247 | 0 | __mmask64 mend = |
248 | 0 | _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); |
249 | 0 | if (tail != SIMDUTF_FULL) { |
250 | 0 | mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); |
251 | 0 | } |
252 | 0 | __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); |
253 | 0 | __m512i last_and_thirdu16 = |
254 | 0 | _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); |
255 | |
|
256 | 0 | __m512i nonasciitags = _mm512_maskz_mov_epi8( |
257 | 0 | mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 |
258 | 0 | __m512i clearedbytes = _mm512_andnot_si512( |
259 | 0 | nonasciitags, input); // high two bits cleared where not ASCII |
260 | 0 | __m512i lastbytes = _mm512_maskz_permutexvar_epi8( |
261 | 0 | 0x5555555555555555, last_and_thirdu16, |
262 | 0 | clearedbytes); // the last byte of each character |
263 | |
|
264 | 0 | __mmask64 mask_before_non_ascii = _kshiftri_mask64( |
265 | 0 | mask_not_ascii, 1); // bytes that precede non-ASCII bytes |
266 | 0 | __m512i indexofsecondlastbytes = _mm512_add_epi16( |
267 | 0 | mask_ffffffff, last_and_thirdu16); // indices of the second last bytes |
268 | 0 | __m512i beforeasciibytes = |
269 | 0 | _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); |
270 | 0 | __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( |
271 | 0 | 0x5555555555555555, indexofsecondlastbytes, |
272 | 0 | beforeasciibytes); // the second last bytes (of two, three byte seq, |
273 | | // surrogates) |
274 | 0 | secondlastbytes = |
275 | 0 | _mm512_slli_epi16(secondlastbytes, 6); // shifted into position |
276 | |
|
277 | 0 | __m512i indexofthirdlastbytes = _mm512_add_epi16( |
278 | 0 | mask_ffffffff, |
279 | 0 | indexofsecondlastbytes); // indices of the second last bytes |
280 | 0 | __m512i thirdlastbyte = _mm512_maskz_mov_epi8( |
281 | 0 | m34, |
282 | 0 | clearedbytes); // only those that are the third last byte of a sequence |
283 | 0 | __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( |
284 | 0 | 0x5555555555555555, indexofthirdlastbytes, |
285 | 0 | thirdlastbyte); // the third last bytes (of three byte sequences, hi |
286 | | // surrogate) |
287 | 0 | thirdlastbytes = |
288 | 0 | _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position |
289 | 0 | __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32( |
290 | 0 | lastbytes, secondlastbytes, thirdlastbytes, 254); |
291 | 0 | uint64_t Mlo_uint64 = _pext_u64(mp3, mend); |
292 | 0 | __mmask32 Mlo = __mmask32(Mlo_uint64); |
293 | 0 | __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); |
294 | 0 | __m512i lo_surr_mask = _mm512_maskz_mov_epi16( |
295 | 0 | Mlo, |
296 | 0 | mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 |
297 | 0 | __m512i shifted4_thirdsecondandlastbytes = |
298 | 0 | _mm512_srli_epi16(thirdsecondandlastbytes, |
299 | 0 | 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 |
300 | 0 | __m512i tagged_lo_surrogates = _mm512_or_si512( |
301 | 0 | thirdsecondandlastbytes, |
302 | 0 | lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged |
303 | 0 | __m512i Wout = _mm512_mask_add_epi16( |
304 | 0 | tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, |
305 | 0 | mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged |
306 | | // the elements of Wout excluding the last element if it happens to be a |
307 | | // high surrogate: |
308 | 0 | __mmask32 Mout = ~(Mhi & 0x80000000); |
309 | 0 | __mmask64 mprocessed = |
310 | 0 | (tail == SIMDUTF_FULL) |
311 | 0 | ? _pdep_u64(Mout, mend) |
312 | 0 | : _pdep_u64( |
313 | 0 | Mout, |
314 | 0 | _kand_mask64(mend, |
315 | 0 | b)); // we adjust mend at the end of the output. |
316 | | |
317 | | // mismatched continuation bytes: |
318 | 0 | if (tail == SIMDUTF_FULL) { |
319 | 0 | __mmask64 xnormcm1234 = _kxnor_mask64( |
320 | 0 | mc, m1234); // XNOR of mc and m1234 should be all zero if they differ |
321 | | // the presence of a 1 bit indicates that they overlap. |
322 | | // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 |
323 | | // if all zeroes. |
324 | 0 | if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { |
325 | 0 | return false; |
326 | 0 | } |
327 | 0 | } else { |
328 | 0 | __mmask64 bxorm1234 = _kxor_mask64(b, m1234); |
329 | 0 | if (mc != bxorm1234) { |
330 | 0 | return false; |
331 | 0 | } |
332 | 0 | } |
333 | | // Encodings out of range... |
334 | 0 | { |
335 | | // the location of 3-byte sequence start bytes in the input |
336 | 0 | __mmask64 m3 = m34 & (b ^ m4); |
337 | | // code units in Wout corresponding to 3-byte sequences. |
338 | 0 | __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); |
339 | 0 | __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); |
340 | 0 | __mmask32 Msmall800 = |
341 | 0 | _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); |
342 | 0 | __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); |
343 | 0 | __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); |
344 | 0 | __mmask32 M3s = |
345 | 0 | _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); |
346 | 0 | __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); |
347 | 0 | __mmask32 M4s = |
348 | 0 | _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); |
349 | 0 | if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { |
350 | 0 | return false; |
351 | 0 | } |
352 | 0 | } |
353 | 0 | in += 64 - _lzcnt_u64(mprocessed); |
354 | 0 | int64_t nout = _mm_popcnt_u64(mprocessed); |
355 | 0 | if (big_endian) { |
356 | 0 | Wout = _mm512_shuffle_epi8(Wout, byteflip); |
357 | 0 | } |
358 | 0 | _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); |
359 | 0 | out += nout; |
360 | 0 | return true; // ok |
361 | 0 | } |
362 | | // Fast path 2: all ASCII or 2 byte |
363 | 0 | __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) |
364 | 0 | ? _knot_mask64(m234) |
365 | 0 | : _kand_mask64(_knot_mask64(m234), b); |
366 | | // on top of -0xc0 we subtract -2 which we get back later of the |
367 | | // continuation byte tags |
368 | 0 | __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); |
369 | 0 | __mmask64 leading = tail == (tail == SIMDUTF_FULL) |
370 | 0 | ? _kor_mask64(m1, m234) |
371 | 0 | : _kand_mask64(_kor_mask64(m1, m234), |
372 | 0 | b); // first bytes of each sequence |
373 | 0 | if (tail == SIMDUTF_FULL) { |
374 | 0 | __mmask64 xnor234leading = |
375 | 0 | _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); |
376 | 0 | if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { |
377 | 0 | return false; |
378 | 0 | } |
379 | 0 | } else { |
380 | 0 | __mmask64 bxorleading = _kxor_mask64(b, leading); |
381 | 0 | if (_kshiftli_mask64(m234, 1) != bxorleading) { |
382 | 0 | return false; |
383 | 0 | } |
384 | 0 | } |
385 | | // |
386 | 0 | if (tail == SIMDUTF_FULL) { |
387 | | // In the two-byte/ASCII scenario, we are easily latency bound, so we want |
388 | | // to increment the input buffer as quickly as possible. |
389 | | // We process 32 bytes unless the byte at index 32 is a continuation byte, |
390 | | // in which case we include it as well for a total of 33 bytes. |
391 | | // Note that if x is an ASCII byte, then the following is false: |
392 | | // int8_t(x) <= int8_t(0xc0) under two's complement. |
393 | 0 | in += 32; |
394 | 0 | if (int8_t(*in) <= int8_t(0xc0)) |
395 | 0 | in++; |
396 | | // The alternative is to do |
397 | | // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); |
398 | | // but it requires loading the input, doing the mask computation, and |
399 | | // converting back the mask to a general register. It just takes too long, |
400 | | // leaving the processor likely to be idle. |
401 | 0 | } else { |
402 | 0 | in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); |
403 | 0 | } |
404 | 0 | __m512i lead = _mm512_maskz_compress_epi8( |
405 | 0 | leading, leading2byte); // will contain zero for ascii, and the data |
406 | 0 | lead = _mm512_cvtepu8_epi16( |
407 | 0 | _mm512_castsi512_si256(lead)); // ... zero extended into code units |
408 | 0 | __m512i follow = _mm512_maskz_compress_epi8( |
409 | 0 | continuation_or_ascii, input); // the last bytes of each sequence |
410 | 0 | follow = _mm512_cvtepu8_epi16( |
411 | 0 | _mm512_castsi512_si256(follow)); // ... zero extended into code units |
412 | 0 | lead = _mm512_slli_epi16(lead, 6); // shifted into position |
413 | 0 | __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow |
414 | |
|
415 | 0 | if (big_endian) { |
416 | 0 | final = _mm512_shuffle_epi8(final, byteflip); |
417 | 0 | } |
418 | 0 | if (tail == SIMDUTF_FULL) { |
419 | | // Next part is UTF-16 specific and can be generalized to UTF-32. |
420 | 0 | int nout = _mm_popcnt_u32(uint32_t(leading)); |
421 | 0 | _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); |
422 | 0 | out += nout; // UTF-8 to UTF-16 is only expansionary in this case. |
423 | 0 | } else { |
424 | 0 | int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); |
425 | 0 | _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); |
426 | 0 | out += nout; // UTF-8 to UTF-16 is only expansionary in this case. |
427 | 0 | } |
428 | |
|
429 | 0 | return true; // we are fine. |
430 | 0 | } Unexecuted instantiation: simdutf.cpp:bool simdutf::icelake::(anonymous namespace)::process_block_utf8_to_utf16<(simdutf::icelake::(anonymous namespace)::block_processing_mode)0, (simdutf::endianness)0>(char const*&, char16_t*&, unsigned long) Unexecuted instantiation: simdutf.cpp:bool simdutf::icelake::(anonymous namespace)::process_block_utf8_to_utf16<(simdutf::icelake::(anonymous namespace)::block_processing_mode)1, (simdutf::endianness)0>(char const*&, char16_t*&, unsigned long) Unexecuted instantiation: simdutf.cpp:bool simdutf::icelake::(anonymous namespace)::process_block_utf8_to_utf16<(simdutf::icelake::(anonymous namespace)::block_processing_mode)0, (simdutf::endianness)1>(char const*&, char16_t*&, unsigned long) Unexecuted instantiation: simdutf.cpp:bool simdutf::icelake::(anonymous namespace)::process_block_utf8_to_utf16<(simdutf::icelake::(anonymous namespace)::block_processing_mode)1, (simdutf::endianness)1>(char const*&, char16_t*&, unsigned long) |
431 | | |
432 | | /* |
433 | | utf32_to_utf16_masked converts `count` lower UTF-32 code units |
434 | | from input `utf32` into UTF-16. It differs from utf32_to_utf16 |
435 | | in that it 'masks' the writes. |
436 | | |
437 | | Returns how many 16-bit code units were stored. |
438 | | |
439 | | byteflip is used for flipping 16-bit code units, and it should be |
440 | | __m512i byteflip = _mm512_setr_epi64( |
441 | | 0x0607040502030001, |
442 | | 0x0e0f0c0d0a0b0809, |
443 | | 0x0607040502030001, |
444 | | 0x0e0f0c0d0a0b0809, |
445 | | 0x0607040502030001, |
446 | | 0x0e0f0c0d0a0b0809, |
447 | | 0x0607040502030001, |
448 | | 0x0e0f0c0d0a0b0809 |
449 | | ); |
450 | | We pass it to the (always inlined) function to encourage the compiler to |
451 | | keep the value in a (constant) register. |
452 | | */ |
453 | | template <endianness big_endian> |
454 | | simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, |
455 | | __m512i utf32, |
456 | | unsigned int count, |
457 | 0 | char16_t *output) { |
458 | |
|
459 | 0 | const __mmask16 valid = uint16_t((1 << count) - 1); |
460 | | // 1. check if we have any surrogate pairs |
461 | 0 | const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); |
462 | 0 | const __mmask16 sp_mask = |
463 | 0 | _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); |
464 | |
|
465 | 0 | if (sp_mask == 0) { |
466 | 0 | if (big_endian) { |
467 | 0 | _mm256_mask_storeu_epi16( |
468 | 0 | (__m256i *)output, valid, |
469 | 0 | _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), |
470 | 0 | _mm512_castsi512_si256(byteflip))); |
471 | |
|
472 | 0 | } else { |
473 | 0 | _mm256_mask_storeu_epi16((__m256i *)output, valid, |
474 | 0 | _mm512_cvtepi32_epi16(utf32)); |
475 | 0 | } |
476 | 0 | return count; |
477 | 0 | } |
478 | | |
479 | 0 | { |
480 | | // build surrogate pair code units in 32-bit lanes |
481 | | |
482 | | // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] |
483 | 0 | const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); |
484 | 0 | const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); |
485 | | |
486 | | // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] |
487 | 0 | const __m512i t1 = _mm512_slli_epi32(t0, 6); |
488 | | |
489 | | // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 |
490 | | // to t0 |
491 | | // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) |
492 | 0 | const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); |
493 | 0 | const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); |
494 | | |
495 | | // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 |
496 | | // to t0 |
497 | | // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 |
498 | 0 | const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); |
499 | 0 | const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); |
500 | 0 | const __m512i t3 = |
501 | 0 | _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); |
502 | 0 | const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); |
503 | 0 | __m512i t5 = _mm512_ror_epi32(t4, 16); |
504 | | // Here we want to trim all of the upper 16-bit code units from the 2-byte |
505 | | // characters represented as 4-byte values. We can compute it from |
506 | | // sp_mask or the following... It can be more optimized! |
507 | 0 | const __mmask32 nonzero = _kor_mask32( |
508 | 0 | 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); |
509 | 0 | const __mmask32 nonzero_masked = |
510 | 0 | _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1)); |
511 | 0 | if (big_endian) { |
512 | 0 | t5 = _mm512_shuffle_epi8(t5, byteflip); |
513 | 0 | } |
514 | | // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability |
515 | | // (AMD Zen4 has terrible performance with it, it is effectively broken) |
516 | 0 | __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); |
517 | 0 | _mm512_mask_storeu_epi16( |
518 | 0 | output, _bzhi_u32(0xFFFFFFFF, count + _mm_popcnt_u32(sp_mask)), |
519 | 0 | compressed); |
520 | | //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); |
521 | 0 | } |
522 | |
|
523 | 0 | return count + static_cast<unsigned int>(count_ones(sp_mask)); |
524 | 0 | } Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::utf32_to_utf16_masked<(simdutf::endianness)0>(long long __vector(8), long long __vector(8), unsigned int, char16_t*) Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::utf32_to_utf16_masked<(simdutf::endianness)1>(long long __vector(8), long long __vector(8), unsigned int, char16_t*) |
525 | | |
526 | | /* |
527 | | utf32_to_utf16 converts `count` lower UTF-32 code units |
528 | | from input `utf32` into UTF-16. It may overflow. |
529 | | |
530 | | Returns how many 16-bit code units were stored. |
531 | | |
532 | | byteflip is used for flipping 16-bit code units, and it should be |
533 | | __m512i byteflip = _mm512_setr_epi64( |
534 | | 0x0607040502030001, |
535 | | 0x0e0f0c0d0a0b0809, |
536 | | 0x0607040502030001, |
537 | | 0x0e0f0c0d0a0b0809, |
538 | | 0x0607040502030001, |
539 | | 0x0e0f0c0d0a0b0809, |
540 | | 0x0607040502030001, |
541 | | 0x0e0f0c0d0a0b0809 |
542 | | ); |
543 | | We pass it to the (always inlined) function to encourage the compiler to |
544 | | keep the value in a (constant) register. |
545 | | */ |
546 | | template <endianness big_endian> |
547 | | simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, |
548 | | __m512i utf32, unsigned int count, |
549 | 0 | char16_t *output) { |
550 | 0 | // check if we have any surrogate pairs |
551 | 0 | const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); |
552 | 0 | const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); |
553 | 0 |
|
554 | 0 | if (sp_mask == 0) { |
555 | 0 | // technically, it should be _mm256_storeu_epi16 |
556 | 0 | if (big_endian) { |
557 | 0 | _mm256_storeu_si256( |
558 | 0 | (__m256i *)output, |
559 | 0 | _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), |
560 | 0 | _mm512_castsi512_si256(byteflip))); |
561 | 0 | } else { |
562 | 0 | _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32)); |
563 | 0 | } |
564 | 0 | return count; |
565 | 0 | } |
566 | 0 |
|
567 | 0 | { |
568 | 0 | // build surrogate pair code units in 32-bit lanes |
569 | 0 |
|
570 | 0 | // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] |
571 | 0 | const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); |
572 | 0 | const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); |
573 | 0 |
|
574 | 0 | // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] |
575 | 0 | const __m512i t1 = _mm512_slli_epi32(t0, 6); |
576 | 0 |
|
577 | 0 | // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 |
578 | 0 | // to t0 |
579 | 0 | // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) |
580 | 0 | const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); |
581 | 0 | const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); |
582 | 0 |
|
583 | 0 | // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 |
584 | 0 | // to t0 |
585 | 0 | // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 |
586 | 0 | const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); |
587 | 0 | const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); |
588 | 0 | const __m512i t3 = |
589 | 0 | _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); |
590 | 0 | const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); |
591 | 0 | __m512i t5 = _mm512_ror_epi32(t4, 16); |
592 | 0 | const __mmask32 nonzero = _kor_mask32( |
593 | 0 | 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); |
594 | 0 | if (big_endian) { |
595 | 0 | t5 = _mm512_shuffle_epi8(t5, byteflip); |
596 | 0 | } |
597 | 0 | // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability |
598 | 0 | // (zen4) |
599 | 0 | __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); |
600 | 0 | _mm512_mask_storeu_epi16( |
601 | 0 | output, |
602 | 0 | (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, |
603 | 0 | compressed); |
604 | 0 | //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); |
605 | 0 | } |
606 | 0 |
|
607 | 0 | return count + static_cast<unsigned int>(count_ones(sp_mask)); |
608 | 0 | } Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::utf32_to_utf16<(simdutf::endianness)0>(long long __vector(8), long long __vector(8), unsigned int, char16_t*) Unexecuted instantiation: simdutf.cpp:unsigned long simdutf::icelake::(anonymous namespace)::utf32_to_utf16<(simdutf::endianness)1>(long long __vector(8), long long __vector(8), unsigned int, char16_t*) |
609 | | |
610 | | /* |
611 | | expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) |
612 | | stored at separate 32-bit lanes. |
613 | | |
614 | | For each lane we have also a character class (`char_class), given in form |
615 | | 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets |
616 | | corresponding bytes during pshufb. |
617 | | */ |
simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
                                                     __m512i utf8) {
  /*
      Input:
      - utf8: bytes stored at separate 32-bit code units
      - valid: which code units have valid UTF-8 characters

      Bit layout of single word. We show 4 cases for each possible
      UTF-8 character encoding. The `?` denotes bits we must not
      assume their value.

      |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
      |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
      |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
      |????.????|????.????|????.????|0aaa.aaaa| ASCII char
        byte 3    byte 2    byte 1     byte 0
  */

  /* 1. Reset control bits of continuation bytes and the MSB
        of the leading byte; this makes all bytes unsigned (and
        does not alter ASCII char).

      |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
      |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
      |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
      |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
       ^^        ^^        ^^        ^
  */
  // Mask 0x3f3f3f7f keeps 6 payload bits of bytes 1..3 and 7 bits of byte 0.
  __m512i values;
  const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
  values = _mm512_and_si512(utf8, v_3f3f_3f7f);

  /* 2. Swap and join fields A-B and C-D

      |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
      |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
      |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
      |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
  // vpmaddubsw with the per-word multiplier bytes (0x40, 0x01) computes
  // byte0*0x40 + byte1 (and byte2*0x40 + byte3), i.e. it shifts each even
  // byte left by 6 and merges in the following byte's 6 payload bits.
  const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
  values = _mm512_maddubs_epi16(values, v_0140_0140);

  /* 3. Swap and join fields AB & CD

      |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
      |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
      |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
      |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
  // vpmaddwd with words (0x1000, 0x0001) computes low_word << 12 + high_word,
  // joining the two 16-bit halves into one 32-bit value.
  const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
  values = _mm512_madd_epi16(values, v_0001_1000);

  /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
      |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
      |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
      |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
      |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
  {
    /** pshufb

    continuation = 0
    ascii    = 7
    _2_bytes = 9
    _3_bytes = 10
    _4_bytes = 11

    shift_left_v3 = 4 * [
        ascii, # 0000
        ascii, # 0001
        ascii, # 0010
        ascii, # 0011
        ascii, # 0100
        ascii, # 0101
        ascii, # 0110
        ascii, # 0111
        continuation, # 1000
        continuation, # 1001
        continuation, # 1010
        continuation, # 1011
        _2_bytes, # 1100
        _2_bytes, # 1101
        _3_bytes, # 1110
        _4_bytes, # 1111
    ] */
    // vpshufb shuffles within 128-bit lanes, hence the 16-entry table is
    // replicated four times. char_class is 0x8080800N per lane: the 0x80
    // bytes zero their output bytes, and N (the leading byte's high nibble)
    // selects the shift amount from the table above.
    const __m512i shift_left_v3 = _mm512_setr_epi64(
        0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
        0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
        0x0707070707070707, 0x0b0a090900000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
    values = _mm512_sllv_epi32(values, shift);
  }

  /* 5. Shift right the values by variable amounts to reset lowest bits
      |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
      |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
      |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
      |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
  {
    // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
    // Same per-class dispatch as step 4; left-then-right shifting clears
    // the `?` garbage bits above and below each code point.
    const __m512i shift_right = _mm512_setr_epi64(
        0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
        0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
        0x1919191919191919, 0x0b10151500000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
    values = _mm512_srlv_epi32(values, shift);
  }

  return values;
}
727 | | |
// Expands packed UTF-8 bytes into one 4-byte window per 32-bit unit, keeps
// only the windows anchored at a leading byte (compressed to the low end),
// and reports their number via `count`.
simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
                                                  int &count) {
  // Overlay 32-bit word #12 of lane1 onto lane0 (mask 0x1000 = bit 12).
  // NOTE(review): the wrap-around indices in the top 128-bit lane of the
  // shuffle below read these spliced-in bytes; this assumes the caller
  // arranges lane1 so its word #12 carries the bytes that follow lane0's
  // data -- confirm against call sites.
  const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
  // Build overlapping 4-byte windows: e.g. 0x0403020103020100 yields bytes
  // [0..3] and [1..4]. vpshufb indexes within each 128-bit lane, so indices
  // wrap at 16-byte boundaries (see the 0x00,0x01,0x02 entries at the top).
  const __m512i expand_ver2 = _mm512_setr_epi64(
      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
  const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
  // A window starts a character iff its first byte is not a continuation
  // byte, i.e. (byte0 & 0xc0) != 0x80. The masks touch only byte 0 of each
  // 32-bit unit.
  const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
  const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
  const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
  const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
  // One leading byte per character: popcount gives the character count.
  count = static_cast<int>(count_ones(leading_bytes));
  // Pack the leading-byte windows toward element 0; unused tail units are
  // zeroed via the zero source operand.
  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes,
                                    input);
}
744 | | |
745 | 0 | simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { |
746 | 0 | __m512i char_class = _mm512_srli_epi32(input, 4); |
747 | | /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ |
748 | 0 | const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); |
749 | 0 | const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); |
750 | 0 | char_class = |
751 | 0 | _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); |
752 | 0 | return expanded_utf8_to_utf32(char_class, input); |
753 | 0 | } |