/src/simdutf/src/haswell/implementation.cpp
Line | Count | Source |
1 | | #include "simdutf/haswell/begin.h" |
2 | | |
3 | | namespace simdutf { |
4 | | namespace SIMDUTF_IMPLEMENTATION { |
5 | | namespace { |
6 | | #ifndef SIMDUTF_HASWELL_H |
7 | | #error "haswell.h must be included" |
8 | | #endif |
9 | | using namespace simd; |
10 | | |
11 | | #if SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || \ |
12 | | SIMDUTF_FEATURE_UTF8 |
// Tests a 64-byte block for ASCII-ness: the block is collapsed with
// reduce_or() and the reduced vector is queried with is_ascii().
// NOTE(review): presumably true exactly when no byte of the block has its
// high bit set - confirm against simd8x64::reduce_or / simd8::is_ascii.
13 | 0 | simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) { |
14 | 0 | return input.reduce_or().is_ascii(); |
15 | 0 | } |
16 | | #endif // SIMDUTF_FEATURE_ASCII || SIMDUTF_FEATURE_DETECT_ENCODING || |
17 | | // SIMDUTF_FEATURE_UTF8 |
18 | | |
19 | | #if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
// Builds a per-byte mask from prev2 and prev3. The saturating subtractions
// leave the high bit set only where prev2 >= 0xe0 or prev3 >= 0xf0; the two
// results are OR-ed and converted to simd8<bool>.
// NOTE(review): prev2 / prev3 are presumably the input delayed by two and
// three bytes, so the mask marks where a continuation byte is required after
// a 3-byte or 4-byte lead - confirm against the caller.
20 | | simdutf_really_inline simd8<bool> |
21 | | must_be_2_3_continuation(const simd8<uint8_t> prev2, |
22 | 0 | const simd8<uint8_t> prev3) { |
23 | 0 | simd8<uint8_t> is_third_byte = |
24 | 0 | prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 |
25 | 0 | simd8<uint8_t> is_fourth_byte = |
26 | 0 | prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 |
27 | 0 | return simd8<bool>(is_third_byte | is_fourth_byte); |
28 | 0 | } |
29 | | #endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
30 | | |
31 | | #if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
32 | | namespace utf16 { |
33 | | #include "haswell/avx2_validate_utf16.cpp" |
34 | | } |
35 | | #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
36 | | |
37 | | #if SIMDUTF_FEATURE_UTF16 |
38 | | #include "haswell/avx2_utf16fix.cpp" |
39 | | #endif // SIMDUTF_FEATURE_UTF16 |
40 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
41 | | #include "haswell/avx2_convert_latin1_to_utf8.cpp" |
42 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
43 | | |
44 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
45 | | #include "haswell/avx2_convert_latin1_to_utf16.cpp" |
46 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
47 | | |
48 | | #if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
49 | | #include "haswell/avx2_convert_latin1_to_utf32.cpp" |
50 | | #endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
51 | | |
52 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
53 | | #include "haswell/avx2_convert_utf8_to_utf16.cpp" |
54 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
55 | | |
56 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
57 | | #include "haswell/avx2_convert_utf8_to_utf32.cpp" |
58 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
59 | | |
60 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
61 | | #include "haswell/avx2_convert_utf16_to_latin1.cpp" |
62 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
63 | | |
64 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
65 | | #include "haswell/avx2_convert_utf16_to_utf8.cpp" |
66 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
67 | | |
68 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
69 | | #include "haswell/avx2_convert_utf16_to_utf32.cpp" |
70 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
71 | | |
72 | | #if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
73 | | #include "haswell/avx2_convert_utf32_to_latin1.cpp" |
74 | | #endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
75 | | |
76 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
77 | | #include "haswell/avx2_convert_utf32_to_utf8.cpp" |
78 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
79 | | |
80 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
81 | | #include "haswell/avx2_convert_utf32_to_utf16.cpp" |
82 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
83 | | |
84 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
85 | | #include "haswell/avx2_convert_utf8_to_latin1.cpp" |
86 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
87 | | |
88 | | #if SIMDUTF_FEATURE_BASE64 |
89 | | #include "haswell/avx2_base64.cpp" |
90 | | #endif // SIMDUTF_FEATURE_BASE64 |
91 | | |
92 | | } // unnamed namespace |
93 | | } // namespace SIMDUTF_IMPLEMENTATION |
94 | | } // namespace simdutf |
95 | | |
96 | | #include "generic/buf_block_reader.h" |
97 | | #if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
98 | | #include "generic/utf8_validation/utf8_lookup4_algorithm.h" |
99 | | #include "generic/utf8_validation/utf8_validator.h" |
100 | | #endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
101 | | |
102 | | #if SIMDUTF_FEATURE_ASCII |
103 | | #include "generic/ascii_validation.h" |
104 | | #endif // SIMDUTF_FEATURE_ASCII |
105 | | |
106 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
107 | | // transcoding from UTF-8 to UTF-16 |
108 | | #include "generic/utf8_to_utf16/valid_utf8_to_utf16.h" |
109 | | #include "generic/utf8_to_utf16/utf8_to_utf16.h" |
110 | | #include "generic/utf8/utf16_length_from_utf8_bytemask.h" |
111 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
112 | | |
113 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
114 | | // transcoding from UTF-8 to UTF-32 |
115 | | #include "generic/utf8_to_utf32/valid_utf8_to_utf32.h" |
116 | | #include "generic/utf8_to_utf32/utf8_to_utf32.h" |
117 | | #include "generic/utf32.h" |
118 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
119 | | |
120 | | // other functions |
121 | | #if SIMDUTF_FEATURE_UTF8 |
122 | | #include "generic/utf8.h" |
123 | | #endif // SIMDUTF_FEATURE_UTF8 |
124 | | |
125 | | #if SIMDUTF_FEATURE_UTF16 |
126 | | #include "generic/utf16.h" |
127 | | #include "generic/utf16/utf8_length_from_utf16_bytemask.h" |
128 | | #endif // SIMDUTF_FEATURE_UTF16 |
129 | | #if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
130 | | #include "generic/validate_utf16.h" |
131 | | #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
132 | | |
133 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
134 | | // transcoding from UTF-8 to Latin 1 |
135 | | #include "generic/utf8_to_latin1/utf8_to_latin1.h" |
136 | | #include "generic/utf8_to_latin1/valid_utf8_to_latin1.h" |
137 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
138 | | |
139 | | #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING |
140 | | #include "generic/validate_utf32.h" |
141 | | #endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING |
142 | | |
143 | | #if SIMDUTF_FEATURE_BASE64 |
144 | | #include "generic/base64.h" |
145 | | #include "generic/find.h" |
146 | | #endif // SIMDUTF_FEATURE_BASE64 |
147 | | |
148 | | namespace simdutf { |
149 | | namespace SIMDUTF_IMPLEMENTATION { |
150 | | |
151 | | #if SIMDUTF_FEATURE_DETECT_ENCODING |
// Guesses which encodings the byte buffer could be valid in.
// Returns the encoding reported by BOM::check_bom when it is not
// unspecified; otherwise a bitwise OR of encoding_type::UTF8, UTF16_LE and
// UTF32_LE for every check that recorded no error (0 when none pass).
// The input is walked once in 64-byte blocks and each block feeds all three
// checks; the remaining tail is copied into a zero-initialised 64-byte
// buffer and run through the same checks.
152 | | simdutf_warn_unused int |
153 | | implementation::detect_encodings(const char *input, |
154 | 0 | size_t length) const noexcept { |
155 | | // If there is a BOM, then we trust it. |
156 | 0 | auto bom_encoding = simdutf::BOM::check_bom(input, length); |
157 | 0 | if (bom_encoding != encoding_type::unspecified) { |
158 | 0 | return bom_encoding; |
159 | 0 | } |
160 | | |
161 | 0 | int out = 0; |
// A byte length that is not a multiple of the code unit size (2 or 4 bytes)
// starts the corresponding error flag as non-zero.
162 | 0 | uint32_t utf16_err = (length % 2); |
163 | 0 | uint32_t utf32_err = (length % 4); |
// Carries a high surrogate seen in the last 16-bit unit of one block into
// the pairing check of the next block.
164 | 0 | uint32_t ends_with_high = 0; |
165 | 0 | const auto v_d8 = simd8<uint8_t>::splat(0xd8); |
166 | 0 | const auto v_f8 = simd8<uint8_t>::splat(0xf8); |
167 | 0 | const auto v_fc = simd8<uint8_t>::splat(0xfc); |
168 | 0 | const auto v_dc = simd8<uint8_t>::splat(0xdc); |
// UTF-32 limits: values above 0x10ffff are out of range. Adding 0xffff2000
// (mod 2^32) maps exactly 0xd800..0xdfff above 0xfffff7ff, which is how
// surrogate code points are spotted.
169 | 0 | const __m256i standardmax = _mm256_set1_epi32(0x10ffff); |
170 | 0 | const __m256i offset = _mm256_set1_epi32(0xffff2000); |
171 | 0 | const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); |
172 | 0 | __m256i currentmax = _mm256_setzero_si256(); |
173 | 0 | __m256i currentoffsetmax = _mm256_setzero_si256(); |
174 |

175 | 0 | utf8_checker c{}; |
176 | 0 | buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length); |
177 | 0 | while (reader.has_full_block()) { |
178 | 0 | simd::simd8x64<uint8_t> in(reader.full_block()); |
179 | | // utf8 checks |
180 | 0 | c.check_next_input(in); |
181 | | |
182 | | // utf16le checks |
// Keep the high byte of every 16-bit unit (shift right by 8, then pack to
// bytes). (b & 0xf8) == 0xd8 marks any surrogate, (b & 0xfc) == 0xdc marks a
// low surrogate (L); H is the remaining high surrogates. Pairing is valid
// when every high is immediately followed by a low and every low is preceded
// by a high, i.e. ((H << 1) | carry) == L.
183 | 0 | auto in0 = simd16<uint16_t>(in.chunks[0]); |
184 | 0 | auto in1 = simd16<uint16_t>(in.chunks[1]); |
185 | 0 | const auto t0 = in0.shr<8>(); |
186 | 0 | const auto t1 = in1.shr<8>(); |
187 | 0 | const auto in2 = simd16<uint16_t>::pack(t0, t1); |
188 | 0 | const auto surrogates_wordmask = (in2 & v_f8) == v_d8; |
189 | 0 | const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); |
190 | 0 | const auto vL = (in2 & v_fc) == v_dc; |
191 | 0 | const uint32_t L = vL.to_bitmask(); |
192 | 0 | const uint32_t H = L ^ surrogates_bitmask; |
193 | 0 | utf16_err |= (((H << 1) | ends_with_high) != L); |
194 | 0 | ends_with_high = (H & 0x80000000) != 0; |
195 | | |
196 | | // utf32le checks |
// Only running unsigned maxima are tracked here; they are compared against
// the limits once, after the last block.
197 | 0 | currentmax = _mm256_max_epu32(in.chunks[0], currentmax); |
198 | 0 | currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset), |
199 | 0 | currentoffsetmax); |
200 | 0 | currentmax = _mm256_max_epu32(in.chunks[1], currentmax); |
201 | 0 | currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset), |
202 | 0 | currentoffsetmax); |
203 |

204 | 0 | reader.advance(); |
205 | 0 | } |
206 |

// Tail: copy the remaining length - idx bytes into a zeroed 64-byte buffer
// and run the same three checks on it.
207 | 0 | uint8_t block[64]{}; |
208 | 0 | size_t idx = reader.block_index(); |
209 | 0 | std::memcpy(block, &input[idx], length - idx); |
210 | 0 | simd::simd8x64<uint8_t> in(block); |
211 | 0 | c.check_next_input(in); |
212 | | |
213 | | // utf16le last block check |
214 | 0 | auto in0 = simd16<uint16_t>(in.chunks[0]); |
215 | 0 | auto in1 = simd16<uint16_t>(in.chunks[1]); |
216 | 0 | const auto t0 = in0.shr<8>(); |
217 | 0 | const auto t1 = in1.shr<8>(); |
218 | 0 | const auto in2 = simd16<uint16_t>::pack(t0, t1); |
219 | 0 | const auto surrogates_wordmask = (in2 & v_f8) == v_d8; |
220 | 0 | const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); |
221 | 0 | const auto vL = (in2 & v_fc) == v_dc; |
222 | 0 | const uint32_t L = vL.to_bitmask(); |
223 | 0 | const uint32_t H = L ^ surrogates_bitmask; |
224 | 0 | utf16_err |= (((H << 1) | ends_with_high) != L); |
225 | | // this is required to check for last byte ending in high and end of input |
226 | | // is reached |
227 | 0 | ends_with_high = (H & 0x80000000) != 0; |
228 | 0 | utf16_err |= ends_with_high; |
229 | | |
230 | | // utf32le last block check |
231 | 0 | currentmax = _mm256_max_epu32(in.chunks[0], currentmax); |
232 | 0 | currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[0], offset), |
233 | 0 | currentoffsetmax); |
234 | 0 | currentmax = _mm256_max_epu32(in.chunks[1], currentmax); |
235 | 0 | currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in.chunks[1], offset), |
236 | 0 | currentoffsetmax); |
237 |

238 | 0 | reader.advance(); |
239 |

240 | 0 | c.check_eof(); |
241 | 0 | bool is_valid_utf8 = !c.errors(); |
// max(x, limit) ^ limit is non-zero only in lanes where x exceeded limit;
// _mm256_testz_si256(v, v) == 0 means at least one bit of v is set.
242 | 0 | __m256i is_zero = |
243 | 0 | _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); |
244 | 0 | utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0); |
245 |

246 | 0 | is_zero = _mm256_xor_si256( |
247 | 0 | _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); |
248 | 0 | utf32_err |= (_mm256_testz_si256(is_zero, is_zero) == 0); |
249 | 0 | if (is_valid_utf8) { |
250 | 0 | out |= encoding_type::UTF8; |
251 | 0 | } |
252 | 0 | if (utf16_err == 0) { |
253 | 0 | out |= encoding_type::UTF16_LE; |
254 | 0 | } |
255 | 0 | if (utf32_err == 0) { |
256 | 0 | out |= encoding_type::UTF32_LE; |
257 | 0 | } |
258 | 0 | return out; |
259 | 0 | } |
260 | | #endif // SIMDUTF_FEATURE_DETECT_ENCODING |
261 | | |
262 | | #if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
// Validates the len bytes at buf as UTF-8 by forwarding to
// haswell::utf8_validation::generic_validate_utf8 and returning its verdict.
263 | | simdutf_warn_unused bool |
264 | 0 | implementation::validate_utf8(const char *buf, size_t len) const noexcept { |
265 | 0 | return haswell::utf8_validation::generic_validate_utf8(buf, len); |
266 | 0 | } |
267 | | #endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING |
268 | | |
269 | | #if SIMDUTF_FEATURE_UTF8 |
// UTF-8 validation that reports a result object instead of a bool; forwards
// unchanged to haswell::utf8_validation::generic_validate_utf8_with_errors.
270 | | simdutf_warn_unused result implementation::validate_utf8_with_errors( |
271 | 0 | const char *buf, size_t len) const noexcept { |
272 | 0 | return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len); |
273 | 0 | } |
274 | | #endif // SIMDUTF_FEATURE_UTF8 |
275 | | |
276 | | #if SIMDUTF_FEATURE_ASCII |
// Validates the len bytes at buf as ASCII by forwarding to
// haswell::ascii_validation::generic_validate_ascii.
277 | | simdutf_warn_unused bool |
278 | 0 | implementation::validate_ascii(const char *buf, size_t len) const noexcept { |
279 | 0 | return haswell::ascii_validation::generic_validate_ascii(buf, len); |
280 | 0 | } |
281 | | |
// ASCII validation that reports a result object instead of a bool; forwards
// unchanged to haswell::ascii_validation::generic_validate_ascii_with_errors.
282 | | simdutf_warn_unused result implementation::validate_ascii_with_errors( |
283 | 0 | const char *buf, size_t len) const noexcept { |
284 | 0 | return haswell::ascii_validation::generic_validate_ascii_with_errors(buf, |
285 | 0 | len); |
286 | 0 | } |
287 | | #endif // SIMDUTF_FEATURE_ASCII |
288 | | |
289 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII |
// Returns true when validate_utf16_as_ascii_with_errors, instantiated for
// little-endian input, reports SUCCESS for the len code units at buf.
290 | | simdutf_warn_unused bool |
291 | | implementation::validate_utf16le_as_ascii(const char16_t *buf, |
292 | 0 | size_t len) const noexcept { |
293 | 0 | return haswell::utf16::validate_utf16_as_ascii_with_errors< |
294 | 0 | endianness::LITTLE>(buf, len) |
295 | 0 | .error == SUCCESS; |
296 | 0 | } |
297 | | |
// Returns true when validate_utf16_as_ascii_with_errors, instantiated for
// big-endian input, reports SUCCESS for the len code units at buf.
298 | | simdutf_warn_unused bool |
299 | | implementation::validate_utf16be_as_ascii(const char16_t *buf, |
300 | 0 | size_t len) const noexcept { |
301 | 0 | return haswell::utf16::validate_utf16_as_ascii_with_errors<endianness::BIG>( |
302 | 0 | buf, len) |
303 | 0 | .error == SUCCESS; |
304 | 0 | } |
305 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII |
306 | | |
307 | | #if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
// Validates len little-endian UTF-16 code units at buf.
// Empty input is accepted immediately. Otherwise the SIMD validator runs
// first: an error makes the result false, a count equal to len makes it true,
// and any other count hands the remaining len - res.count code units
// (starting at buf + res.count) to the scalar validator.
308 | | simdutf_warn_unused bool |
309 | | implementation::validate_utf16le(const char16_t *buf, |
310 | 0 | size_t len) const noexcept { |
311 | 0 | if (simdutf_unlikely(len == 0)) { |
312 | | // empty input is valid UTF-16. protect the implementation from |
313 | | // handling nullptr |
314 | 0 | return true; |
315 | 0 | } |
316 | 0 | const auto res = |
317 | 0 | haswell::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len); |
318 | 0 | if (res.is_err()) { |
319 | 0 | return false; |
320 | 0 | } |
321 | | |
322 | 0 | if (res.count == len) { |
323 | 0 | return true; |
324 | 0 | } |
325 | | |
326 | 0 | return scalar::utf16::validate<endianness::LITTLE>(buf + res.count, |
327 | 0 | len - res.count); |
328 | 0 | } |
329 | | #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING |
330 | | |
331 | | #if SIMDUTF_FEATURE_UTF16 |
// Validates len big-endian UTF-16 code units at buf.
// Empty input is accepted immediately. Otherwise the SIMD validator runs
// first: an error makes the result false, a count equal to len makes it true,
// and any other count hands the remaining len - res.count code units
// (starting at buf + res.count) to the scalar validator.
332 | | simdutf_warn_unused bool |
333 | | implementation::validate_utf16be(const char16_t *buf, |
334 | 0 | size_t len) const noexcept { |
335 | 0 | if (simdutf_unlikely(len == 0)) { |
336 | | // empty input is valid UTF-16. protect the implementation from |
337 | | // handling nullptr |
338 | 0 | return true; |
339 | 0 | } |
340 | 0 | const auto res = |
341 | 0 | haswell::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len); |
342 | 0 | if (res.is_err()) { |
343 | 0 | return false; |
344 | 0 | } |
345 | | |
346 | 0 | if (res.count == len) { |
347 | 0 | return true; |
348 | 0 | } |
349 | | |
350 | 0 | return scalar::utf16::validate<endianness::BIG>(buf + res.count, |
351 | 0 | len - res.count); |
352 | 0 | } |
353 | | |
// Little-endian UTF-16 validation returning a result (error code + count).
// The SIMD validator runs first. When its count differs from len, the scalar
// validator is run from buf + res.count over the remaining code units and its
// error code is returned with the count shifted by res.count so that it is
// relative to buf. Otherwise the SIMD result is returned as is.
// NOTE(review): unlike validate_utf16le there is no early return for
// len == 0 here - confirm the callee tolerates an empty or null buffer.
354 | | simdutf_warn_unused result implementation::validate_utf16le_with_errors( |
355 | 0 | const char16_t *buf, size_t len) const noexcept { |
356 |

357 | 0 | const result res = |
358 | 0 | haswell::utf16::validate_utf16_with_errors<endianness::LITTLE>(buf, len); |
359 | 0 | if (res.count != len) { |
360 | 0 | const result scalar_res = |
361 | 0 | scalar::utf16::validate_with_errors<endianness::LITTLE>( |
362 | 0 | buf + res.count, len - res.count); |
363 | 0 | return result(scalar_res.error, res.count + scalar_res.count); |
364 | 0 | } else { |
365 | 0 | return res; |
366 | 0 | } |
367 | 0 | } |
368 | | |
// Big-endian UTF-16 validation returning a result (error code + count).
// The SIMD validator runs first. When its count differs from len, the scalar
// validator is run from buf + res.count over the remaining code units and its
// error code is returned with the count shifted by res.count so that it is
// relative to buf. Otherwise the SIMD result is returned as is.
// NOTE(review): unlike validate_utf16be there is no early return for
// len == 0 here - confirm the callee tolerates an empty or null buffer.
369 | | simdutf_warn_unused result implementation::validate_utf16be_with_errors( |
370 | 0 | const char16_t *buf, size_t len) const noexcept { |
371 | 0 | const result res = |
372 | 0 | haswell::utf16::validate_utf16_with_errors<endianness::BIG>(buf, len); |
373 | 0 | if (res.count != len) { |
374 | 0 | const result scalar_res = |
375 | 0 | scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, |
376 | 0 | len - res.count); |
377 | 0 | return result(scalar_res.error, res.count + scalar_res.count); |
378 | 0 | } else { |
379 | 0 | return res; |
380 | 0 | } |
381 | 0 | } |
382 | | |
// Forwards input, len and output to utf16fix_avx instantiated for
// little-endian data.
// NOTE(review): judging by the name, output receives a well-formed copy of
// the input; the repair policy lives in utf16fix_avx, not visible here.
383 | | void implementation::to_well_formed_utf16le(const char16_t *input, size_t len, |
384 | 0 | char16_t *output) const noexcept { |
385 | 0 | return utf16fix_avx<endianness::LITTLE>(input, len, output); |
386 | 0 | } |
387 | | |
// Forwards input, len and output to utf16fix_avx instantiated for
// big-endian data.
// NOTE(review): judging by the name, output receives a well-formed copy of
// the input; the repair policy lives in utf16fix_avx, not visible here.
388 | | void implementation::to_well_formed_utf16be(const char16_t *input, size_t len, |
389 | 0 | char16_t *output) const noexcept { |
390 | 0 | return utf16fix_avx<endianness::BIG>(input, len, output); |
391 | 0 | } |
392 | | #endif // SIMDUTF_FEATURE_UTF16 |
393 | | |
394 | | #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING |
// Validates len UTF-32 code units at buf by forwarding to utf32::validate.
395 | | simdutf_warn_unused bool |
396 | 0 | implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { |
397 | 0 | return utf32::validate(buf, len); |
398 | 0 | } |
399 | | #endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING |
400 | | |
401 | | #if SIMDUTF_FEATURE_UTF32 |
// UTF-32 validation that reports a result object instead of a bool; forwards
// unchanged to utf32::validate_with_errors.
402 | | simdutf_warn_unused result implementation::validate_utf32_with_errors( |
403 | 0 | const char32_t *buf, size_t len) const noexcept { |
404 | 0 | return utf32::validate_with_errors(buf, len); |
405 | 0 | } |
406 | | #endif // SIMDUTF_FEATURE_UTF32 |
407 | | |
408 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
// Converts len Latin-1 bytes at buf to UTF-8 in utf8_output and returns the
// number of bytes written.
// The AVX2 routine returns a pair (next unread input, next free output). If
// it stopped before buf + len, the scalar converter finishes the remainder
// and its byte count is added to the total.
409 | | simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( |
410 | 0 | const char *buf, size_t len, char *utf8_output) const noexcept { |
411 | 0 | std::pair<const char *, char *> ret = |
412 | 0 | avx2_convert_latin1_to_utf8(buf, len, utf8_output); |
413 | 0 | size_t converted_chars = ret.second - utf8_output; |
414 |

415 | 0 | if (ret.first != buf + len) { |
416 | 0 | const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( |
417 | 0 | ret.first, len - (ret.first - buf), ret.second); |
418 | 0 | converted_chars += scalar_converted_chars; |
419 | 0 | } |
420 |

421 | 0 | return converted_chars; |
422 | 0 | } |
423 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
424 | | |
425 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
// Converts len Latin-1 bytes at buf to little-endian UTF-16 in utf16_output
// and returns the number of code units written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
426 | | simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( |
427 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
428 | 0 | std::pair<const char *, char16_t *> ret = |
429 | 0 | avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output); |
430 | 0 | if (ret.first == nullptr) { |
431 | 0 | return 0; |
432 | 0 | } |
433 | 0 | size_t converted_chars = ret.second - utf16_output; |
434 | 0 | if (ret.first != buf + len) { |
435 | 0 | const size_t scalar_converted_chars = |
436 | 0 | scalar::latin1_to_utf16::convert<endianness::LITTLE>( |
437 | 0 | ret.first, len - (ret.first - buf), ret.second); |
438 | 0 | if (scalar_converted_chars == 0) { |
439 | 0 | return 0; |
440 | 0 | } |
441 | 0 | converted_chars += scalar_converted_chars; |
442 | 0 | } |
443 | 0 | return converted_chars; |
444 | 0 | } |
445 | | |
// Converts len Latin-1 bytes at buf to big-endian UTF-16 in utf16_output and
// returns the number of code units written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
446 | | simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( |
447 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
448 | 0 | std::pair<const char *, char16_t *> ret = |
449 | 0 | avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output); |
450 | 0 | if (ret.first == nullptr) { |
451 | 0 | return 0; |
452 | 0 | } |
453 | 0 | size_t converted_chars = ret.second - utf16_output; |
454 | 0 | if (ret.first != buf + len) { |
455 | 0 | const size_t scalar_converted_chars = |
456 | 0 | scalar::latin1_to_utf16::convert<endianness::BIG>( |
457 | 0 | ret.first, len - (ret.first - buf), ret.second); |
458 | 0 | if (scalar_converted_chars == 0) { |
459 | 0 | return 0; |
460 | 0 | } |
461 | 0 | converted_chars += scalar_converted_chars; |
462 | 0 | } |
463 | 0 | return converted_chars; |
464 | 0 | } |
465 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
466 | | |
467 | | #if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
// Converts len Latin-1 bytes at buf to UTF-32 in utf32_output and returns the
// number of code units written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
468 | | simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( |
469 | 0 | const char *buf, size_t len, char32_t *utf32_output) const noexcept { |
470 | 0 | std::pair<const char *, char32_t *> ret = |
471 | 0 | avx2_convert_latin1_to_utf32(buf, len, utf32_output); |
472 | 0 | if (ret.first == nullptr) { |
473 | 0 | return 0; |
474 | 0 | } |
475 | 0 | size_t converted_chars = ret.second - utf32_output; |
476 | 0 | if (ret.first != buf + len) { |
477 | 0 | const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( |
478 | 0 | ret.first, len - (ret.first - buf), ret.second); |
479 | 0 | if (scalar_converted_chars == 0) { |
480 | 0 | return 0; |
481 | 0 | } |
482 | 0 | converted_chars += scalar_converted_chars; |
483 | 0 | } |
484 | 0 | return converted_chars; |
485 | 0 | } |
486 | | #endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
487 | | |
488 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
// Converts len UTF-8 bytes at buf to Latin-1 in latin1_output using a fresh
// utf8_to_latin1::validating_transcoder; returns whatever convert() returns.
489 | | simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( |
490 | 0 | const char *buf, size_t len, char *latin1_output) const noexcept { |
491 | 0 | utf8_to_latin1::validating_transcoder converter; |
492 | 0 | return converter.convert(buf, len, latin1_output); |
493 | 0 | } |
494 | | |
// Same conversion as convert_utf8_to_latin1 but returns the result object
// produced by validating_transcoder::convert_with_errors.
495 | | simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( |
496 | 0 | const char *buf, size_t len, char *latin1_output) const noexcept { |
497 | 0 | utf8_to_latin1::validating_transcoder converter; |
498 | 0 | return converter.convert_with_errors(buf, len, latin1_output); |
499 | 0 | } |
500 | | |
// Forwards to utf8_to_latin1::convert_valid.
// NOTE(review): the name implies the caller guarantees valid input and no
// validation is performed - confirm in the callee.
501 | | simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( |
502 | 0 | const char *input, size_t size, char *latin1_output) const noexcept { |
503 | 0 | return utf8_to_latin1::convert_valid(input, size, latin1_output); |
504 | 0 | } |
505 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
506 | | |
507 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
// Converts len UTF-8 bytes at buf to little-endian UTF-16 in utf16_output via
// a fresh utf8_to_utf16::validating_transcoder; returns what convert() returns.
508 | | simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( |
509 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
510 | 0 | utf8_to_utf16::validating_transcoder converter; |
511 | 0 | return converter.convert<endianness::LITTLE>(buf, len, utf16_output); |
512 | 0 | } |
513 | | |
// Converts len UTF-8 bytes at buf to big-endian UTF-16 in utf16_output via a
// fresh utf8_to_utf16::validating_transcoder; returns what convert() returns.
514 | | simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( |
515 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
516 | 0 | utf8_to_utf16::validating_transcoder converter; |
517 | 0 | return converter.convert<endianness::BIG>(buf, len, utf16_output); |
518 | 0 | } |
519 | | |
// UTF-8 to little-endian UTF-16 conversion returning the result object
// produced by validating_transcoder::convert_with_errors.
520 | | simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( |
521 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
522 | 0 | utf8_to_utf16::validating_transcoder converter; |
523 | 0 | return converter.convert_with_errors<endianness::LITTLE>(buf, len, |
524 | 0 | utf16_output); |
525 | 0 | } |
526 | | |
// UTF-8 to big-endian UTF-16 conversion returning the result object produced
// by validating_transcoder::convert_with_errors.
527 | | simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( |
528 | 0 | const char *buf, size_t len, char16_t *utf16_output) const noexcept { |
529 | 0 | utf8_to_utf16::validating_transcoder converter; |
530 | 0 | return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output); |
531 | 0 | } |
532 | | |
// Forwards to utf8_to_utf16::convert_valid instantiated for little-endian
// output.
// NOTE(review): the name implies the caller guarantees valid UTF-8 - confirm.
533 | | simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( |
534 | 0 | const char *input, size_t size, char16_t *utf16_output) const noexcept { |
535 | 0 | return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, |
536 | 0 | utf16_output); |
537 | 0 | } |
538 | | |
// Forwards to utf8_to_utf16::convert_valid instantiated for big-endian
// output.
// NOTE(review): the name implies the caller guarantees valid UTF-8 - confirm.
539 | | simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( |
540 | 0 | const char *input, size_t size, char16_t *utf16_output) const noexcept { |
541 | 0 | return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, |
542 | 0 | utf16_output); |
543 | 0 | } |
544 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
545 | | |
546 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
// Converts len UTF-8 bytes at buf to UTF-32 in utf32_output via a fresh
// utf8_to_utf32::validating_transcoder; returns what convert() returns.
547 | | simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( |
548 | 0 | const char *buf, size_t len, char32_t *utf32_output) const noexcept { |
549 | 0 | utf8_to_utf32::validating_transcoder converter; |
550 | 0 | return converter.convert(buf, len, utf32_output); |
551 | 0 | } |
552 | | |
// UTF-8 to UTF-32 conversion returning the result object produced by
// validating_transcoder::convert_with_errors.
553 | | simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( |
554 | 0 | const char *buf, size_t len, char32_t *utf32_output) const noexcept { |
555 | 0 | utf8_to_utf32::validating_transcoder converter; |
556 | 0 | return converter.convert_with_errors(buf, len, utf32_output); |
557 | 0 | } |
558 | | |
// Forwards to utf8_to_utf32::convert_valid.
// NOTE(review): the name implies the caller guarantees valid UTF-8 - confirm.
559 | | simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( |
560 | 0 | const char *input, size_t size, char32_t *utf32_output) const noexcept { |
561 | 0 | return utf8_to_utf32::convert_valid(input, size, utf32_output); |
562 | 0 | } |
563 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
564 | | |
565 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
// Converts len little-endian UTF-16 code units at buf to Latin-1 in
// latin1_output and returns the number of bytes written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
566 | | simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( |
567 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
568 | 0 | std::pair<const char16_t *, char *> ret = |
569 | 0 | haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, |
570 | 0 | latin1_output); |
571 | 0 | if (ret.first == nullptr) { |
572 | 0 | return 0; |
573 | 0 | } |
574 | 0 | size_t saved_bytes = ret.second - latin1_output; |
575 | 0 | if (ret.first != buf + len) { |
576 | 0 | const size_t scalar_saved_bytes = |
577 | 0 | scalar::utf16_to_latin1::convert<endianness::LITTLE>( |
578 | 0 | ret.first, len - (ret.first - buf), ret.second); |
579 | 0 | if (scalar_saved_bytes == 0) { |
580 | 0 | return 0; |
581 | 0 | } |
582 | 0 | saved_bytes += scalar_saved_bytes; |
583 | 0 | } |
584 | 0 | return saved_bytes; |
585 | 0 | } |
586 | | |
// Converts len big-endian UTF-16 code units at buf to Latin-1 in
// latin1_output and returns the number of bytes written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
587 | | simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( |
588 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
589 | 0 | std::pair<const char16_t *, char *> ret = |
590 | 0 | haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len, |
591 | 0 | latin1_output); |
592 | 0 | if (ret.first == nullptr) { |
593 | 0 | return 0; |
594 | 0 | } |
595 | 0 | size_t saved_bytes = ret.second - latin1_output; |
596 | 0 | if (ret.first != buf + len) { |
597 | 0 | const size_t scalar_saved_bytes = |
598 | 0 | scalar::utf16_to_latin1::convert<endianness::BIG>( |
599 | 0 | ret.first, len - (ret.first - buf), ret.second); |
600 | 0 | if (scalar_saved_bytes == 0) { |
601 | 0 | return 0; |
602 | 0 | } |
603 | 0 | saved_bytes += scalar_saved_bytes; |
604 | 0 | } |
605 | 0 | return saved_bytes; |
606 | 0 | } |
607 | | |
// Little-endian UTF-16 to Latin-1 conversion with error reporting.
// The AVX2 stage returns (result, next free output). An error from that
// stage is returned unchanged. If the stage stopped before len, the scalar
// converter handles the tail; a scalar error is returned with its count
// shifted by the code units already consumed. On success the returned count
// is the number of bytes written to latin1_output.
608 | | simdutf_warn_unused result |
609 | | implementation::convert_utf16le_to_latin1_with_errors( |
610 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
611 | 0 | std::pair<result, char *> ret = |
612 | 0 | avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>( |
613 | 0 | buf, len, latin1_output); |
614 | 0 | if (ret.first.error) { |
615 | 0 | return ret.first; |
616 | 0 | } // Can return directly since scalar fallback already found correct |
617 | | // ret.first.count |
618 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
619 | 0 | result scalar_res = |
620 | 0 | scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>( |
621 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
622 | 0 | if (scalar_res.error) { |
623 | 0 | scalar_res.count += ret.first.count; |
624 | 0 | return scalar_res; |
625 | 0 | } else { |
626 | 0 | ret.second += scalar_res.count; |
627 | 0 | } |
628 | 0 | } |
629 | 0 | ret.first.count = |
630 | 0 | ret.second - |
631 | 0 | latin1_output; // Set count to the number of 8-bit code units written |
632 | 0 | return ret.first; |
633 | 0 | } |
634 | | |
// Big-endian UTF-16 to Latin-1 conversion with error reporting.
// The AVX2 stage returns (result, next free output). An error from that
// stage is returned unchanged. If the stage stopped before len, the scalar
// converter handles the tail; a scalar error is returned with its count
// shifted by the code units already consumed. On success the returned count
// is the number of bytes written to latin1_output.
635 | | simdutf_warn_unused result |
636 | | implementation::convert_utf16be_to_latin1_with_errors( |
637 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
638 | 0 | std::pair<result, char *> ret = |
639 | 0 | avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len, |
640 | 0 | latin1_output); |
641 | 0 | if (ret.first.error) { |
642 | 0 | return ret.first; |
643 | 0 | } // Can return directly since scalar fallback already found correct |
644 | | // ret.first.count |
645 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
646 | 0 | result scalar_res = |
647 | 0 | scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>( |
648 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
649 | 0 | if (scalar_res.error) { |
650 | 0 | scalar_res.count += ret.first.count; |
651 | 0 | return scalar_res; |
652 | 0 | } else { |
653 | 0 | ret.second += scalar_res.count; |
654 | 0 | } |
655 | 0 | } |
656 | 0 | ret.first.count = |
657 | 0 | ret.second - |
658 | 0 | latin1_output; // Set count to the number of 8-bit code units written |
659 | 0 | return ret.first; |
660 | 0 | } |
661 | | |
// No dedicated fast path exists for pre-validated input: this simply calls
// the validating convert_utf16be_to_latin1 and returns its result.
662 | | simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( |
663 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
664 | | // optimization opportunity: implement a custom function |
665 | 0 | return convert_utf16be_to_latin1(buf, len, latin1_output); |
666 | 0 | } |
667 | | |
// No dedicated fast path exists for pre-validated input: this simply calls
// the validating convert_utf16le_to_latin1 and returns its result.
668 | | simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( |
669 | 0 | const char16_t *buf, size_t len, char *latin1_output) const noexcept { |
670 | | // optimization opportunity: implement a custom function |
671 | 0 | return convert_utf16le_to_latin1(buf, len, latin1_output); |
672 | 0 | } |
673 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1 |
674 | | |
675 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
// Converts len little-endian UTF-16 code units at buf to UTF-8 in utf8_output
// and returns the number of bytes written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
676 | | simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( |
677 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
678 | 0 | std::pair<const char16_t *, char *> ret = |
679 | 0 | haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, |
680 | 0 | utf8_output); |
681 | 0 | if (ret.first == nullptr) { |
682 | 0 | return 0; |
683 | 0 | } |
684 | 0 | size_t saved_bytes = ret.second - utf8_output; |
685 | 0 | if (ret.first != buf + len) { |
686 | 0 | const size_t scalar_saved_bytes = |
687 | 0 | scalar::utf16_to_utf8::convert<endianness::LITTLE>( |
688 | 0 | ret.first, len - (ret.first - buf), ret.second); |
689 | 0 | if (scalar_saved_bytes == 0) { |
690 | 0 | return 0; |
691 | 0 | } |
692 | 0 | saved_bytes += scalar_saved_bytes; |
693 | 0 | } |
694 | 0 | return saved_bytes; |
695 | 0 | } |
696 | | |
// Converts len big-endian UTF-16 code units at buf to UTF-8 in utf8_output
// and returns the number of bytes written.
// Returns 0 when the AVX2 routine hands back a null input pointer or when the
// scalar pass over the unprocessed tail returns 0.
697 | | simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( |
698 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
699 | 0 | std::pair<const char16_t *, char *> ret = |
700 | 0 | haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, |
701 | 0 | utf8_output); |
702 | 0 | if (ret.first == nullptr) { |
703 | 0 | return 0; |
704 | 0 | } |
705 | 0 | size_t saved_bytes = ret.second - utf8_output; |
706 | 0 | if (ret.first != buf + len) { |
707 | 0 | const size_t scalar_saved_bytes = |
708 | 0 | scalar::utf16_to_utf8::convert<endianness::BIG>( |
709 | 0 | ret.first, len - (ret.first - buf), ret.second); |
710 | 0 | if (scalar_saved_bytes == 0) { |
711 | 0 | return 0; |
712 | 0 | } |
713 | 0 | saved_bytes += scalar_saved_bytes; |
714 | 0 | } |
715 | 0 | return saved_bytes; |
716 | 0 | } |
717 | | |
718 | | simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( |
719 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
720 | | // ret.first.count is always the position in the buffer, not the number of |
721 | | // code units written even if finished |
722 | 0 | std::pair<result, char *> ret = |
723 | 0 | haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>( |
724 | 0 | buf, len, utf8_output); |
725 | 0 | if (ret.first.error) { |
726 | 0 | return ret.first; |
727 | 0 | } // Can return directly since scalar fallback already found correct |
728 | | // ret.first.count |
729 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
730 | 0 | result scalar_res = |
731 | 0 | scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>( |
732 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
733 | 0 | if (scalar_res.error) { |
734 | 0 | scalar_res.count += ret.first.count; |
735 | 0 | return scalar_res; |
736 | 0 | } else { |
737 | 0 | ret.second += scalar_res.count; |
738 | 0 | } |
739 | 0 | } |
740 | 0 | ret.first.count = |
741 | 0 | ret.second - |
742 | 0 | utf8_output; // Set count to the number of 8-bit code units written |
743 | 0 | return ret.first; |
744 | 0 | } |
745 | | |
746 | | simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( |
747 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
748 | | // ret.first.count is always the position in the buffer, not the number of |
749 | | // code units written even if finished |
750 | 0 | std::pair<result, char *> ret = |
751 | 0 | haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>( |
752 | 0 | buf, len, utf8_output); |
753 | 0 | if (ret.first.error) { |
754 | 0 | return ret.first; |
755 | 0 | } // Can return directly since scalar fallback already found correct |
756 | | // ret.first.count |
757 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
758 | 0 | result scalar_res = |
759 | 0 | scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>( |
760 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
761 | 0 | if (scalar_res.error) { |
762 | 0 | scalar_res.count += ret.first.count; |
763 | 0 | return scalar_res; |
764 | 0 | } else { |
765 | 0 | ret.second += scalar_res.count; |
766 | 0 | } |
767 | 0 | } |
768 | 0 | ret.first.count = |
769 | 0 | ret.second - |
770 | 0 | utf8_output; // Set count to the number of 8-bit code units written |
771 | 0 | return ret.first; |
772 | 0 | } |
773 | | |
774 | | simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( |
775 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
776 | 0 | return convert_utf16le_to_utf8(buf, len, utf8_output); |
777 | 0 | } |
778 | | |
779 | | simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( |
780 | 0 | const char16_t *buf, size_t len, char *utf8_output) const noexcept { |
781 | 0 | return convert_utf16be_to_utf8(buf, len, utf8_output); |
782 | 0 | } |
783 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
784 | | |
785 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
786 | | simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( |
787 | 0 | const char32_t *buf, size_t len, char *utf8_output) const noexcept { |
788 | 0 | std::pair<const char32_t *, char *> ret = |
789 | 0 | avx2_convert_utf32_to_utf8(buf, len, utf8_output); |
790 | 0 | if (ret.first == nullptr) { |
791 | 0 | return 0; |
792 | 0 | } |
793 | 0 | size_t saved_bytes = ret.second - utf8_output; |
794 | 0 | if (ret.first != buf + len) { |
795 | 0 | const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( |
796 | 0 | ret.first, len - (ret.first - buf), ret.second); |
797 | 0 | if (scalar_saved_bytes == 0) { |
798 | 0 | return 0; |
799 | 0 | } |
800 | 0 | saved_bytes += scalar_saved_bytes; |
801 | 0 | } |
802 | 0 | return saved_bytes; |
803 | 0 | } |
804 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
805 | | |
806 | | #if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
807 | | simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( |
808 | 0 | const char32_t *buf, size_t len, char *latin1_output) const noexcept { |
809 | 0 | std::pair<const char32_t *, char *> ret = |
810 | 0 | avx2_convert_utf32_to_latin1(buf, len, latin1_output); |
811 | 0 | if (ret.first == nullptr) { |
812 | 0 | return 0; |
813 | 0 | } |
814 | 0 | size_t saved_bytes = ret.second - latin1_output; |
815 | 0 | if (ret.first != buf + len) { |
816 | 0 | const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( |
817 | 0 | ret.first, len - (ret.first - buf), ret.second); |
818 | 0 | if (scalar_saved_bytes == 0) { |
819 | 0 | return 0; |
820 | 0 | } |
821 | 0 | saved_bytes += scalar_saved_bytes; |
822 | 0 | } |
823 | 0 | return saved_bytes; |
824 | 0 | } |
825 | | |
826 | | simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( |
827 | 0 | const char32_t *buf, size_t len, char *latin1_output) const noexcept { |
828 | | // ret.first.count is always the position in the buffer, not the number of |
829 | | // code units written even if finished |
830 | 0 | std::pair<result, char *> ret = |
831 | 0 | avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); |
832 | 0 | if (ret.first.count != len) { |
833 | 0 | result scalar_res = scalar::utf32_to_latin1::convert_with_errors( |
834 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
835 | 0 | if (scalar_res.error) { |
836 | 0 | scalar_res.count += ret.first.count; |
837 | 0 | return scalar_res; |
838 | 0 | } else { |
839 | 0 | ret.second += scalar_res.count; |
840 | 0 | } |
841 | 0 | } |
842 | 0 | ret.first.count = |
843 | 0 | ret.second - |
844 | 0 | latin1_output; // Set count to the number of 8-bit code units written |
845 | 0 | return ret.first; |
846 | 0 | } |
847 | | |
848 | | simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( |
849 | 0 | const char32_t *buf, size_t len, char *latin1_output) const noexcept { |
850 | 0 | return convert_utf32_to_latin1(buf, len, latin1_output); |
851 | 0 | } |
852 | | #endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1 |
853 | | |
854 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
855 | | simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( |
856 | 0 | const char32_t *buf, size_t len, char *utf8_output) const noexcept { |
857 | | // ret.first.count is always the position in the buffer, not the number of |
858 | | // code units written even if finished |
859 | 0 | std::pair<result, char *> ret = |
860 | 0 | haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); |
861 | 0 | if (ret.first.count != len) { |
862 | 0 | result scalar_res = scalar::utf32_to_utf8::convert_with_errors( |
863 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
864 | 0 | if (scalar_res.error) { |
865 | 0 | scalar_res.count += ret.first.count; |
866 | 0 | return scalar_res; |
867 | 0 | } else { |
868 | 0 | ret.second += scalar_res.count; |
869 | 0 | } |
870 | 0 | } |
871 | 0 | ret.first.count = |
872 | 0 | ret.second - |
873 | 0 | utf8_output; // Set count to the number of 8-bit code units written |
874 | 0 | return ret.first; |
875 | 0 | } |
876 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
877 | | |
878 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
879 | | simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( |
880 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
881 | 0 | std::pair<const char16_t *, char32_t *> ret = |
882 | 0 | haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, |
883 | 0 | utf32_output); |
884 | 0 | if (ret.first == nullptr) { |
885 | 0 | return 0; |
886 | 0 | } |
887 | 0 | size_t saved_bytes = ret.second - utf32_output; |
888 | 0 | if (ret.first != buf + len) { |
889 | 0 | const size_t scalar_saved_bytes = |
890 | 0 | scalar::utf16_to_utf32::convert<endianness::LITTLE>( |
891 | 0 | ret.first, len - (ret.first - buf), ret.second); |
892 | 0 | if (scalar_saved_bytes == 0) { |
893 | 0 | return 0; |
894 | 0 | } |
895 | 0 | saved_bytes += scalar_saved_bytes; |
896 | 0 | } |
897 | 0 | return saved_bytes; |
898 | 0 | } |
899 | | |
900 | | simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( |
901 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
902 | 0 | std::pair<const char16_t *, char32_t *> ret = |
903 | 0 | haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, |
904 | 0 | utf32_output); |
905 | 0 | if (ret.first == nullptr) { |
906 | 0 | return 0; |
907 | 0 | } |
908 | 0 | size_t saved_bytes = ret.second - utf32_output; |
909 | 0 | if (ret.first != buf + len) { |
910 | 0 | const size_t scalar_saved_bytes = |
911 | 0 | scalar::utf16_to_utf32::convert<endianness::BIG>( |
912 | 0 | ret.first, len - (ret.first - buf), ret.second); |
913 | 0 | if (scalar_saved_bytes == 0) { |
914 | 0 | return 0; |
915 | 0 | } |
916 | 0 | saved_bytes += scalar_saved_bytes; |
917 | 0 | } |
918 | 0 | return saved_bytes; |
919 | 0 | } |
920 | | |
921 | | simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( |
922 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
923 | | // ret.first.count is always the position in the buffer, not the number of |
924 | | // code units written even if finished |
925 | 0 | std::pair<result, char32_t *> ret = |
926 | 0 | haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>( |
927 | 0 | buf, len, utf32_output); |
928 | 0 | if (ret.first.error) { |
929 | 0 | return ret.first; |
930 | 0 | } // Can return directly since scalar fallback already found correct |
931 | | // ret.first.count |
932 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
933 | 0 | result scalar_res = |
934 | 0 | scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>( |
935 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
936 | 0 | if (scalar_res.error) { |
937 | 0 | scalar_res.count += ret.first.count; |
938 | 0 | return scalar_res; |
939 | 0 | } else { |
940 | 0 | ret.second += scalar_res.count; |
941 | 0 | } |
942 | 0 | } |
943 | 0 | ret.first.count = |
944 | 0 | ret.second - |
945 | 0 | utf32_output; // Set count to the number of 8-bit code units written |
946 | 0 | return ret.first; |
947 | 0 | } |
948 | | |
949 | | simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( |
950 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
951 | | // ret.first.count is always the position in the buffer, not the number of |
952 | | // code units written even if finished |
953 | 0 | std::pair<result, char32_t *> ret = |
954 | 0 | haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>( |
955 | 0 | buf, len, utf32_output); |
956 | 0 | if (ret.first.error) { |
957 | 0 | return ret.first; |
958 | 0 | } // Can return directly since scalar fallback already found correct |
959 | | // ret.first.count |
960 | 0 | if (ret.first.count != len) { // All good so far, but not finished |
961 | 0 | result scalar_res = |
962 | 0 | scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>( |
963 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
964 | 0 | if (scalar_res.error) { |
965 | 0 | scalar_res.count += ret.first.count; |
966 | 0 | return scalar_res; |
967 | 0 | } else { |
968 | 0 | ret.second += scalar_res.count; |
969 | 0 | } |
970 | 0 | } |
971 | 0 | ret.first.count = |
972 | 0 | ret.second - |
973 | 0 | utf32_output; // Set count to the number of 8-bit code units written |
974 | 0 | return ret.first; |
975 | 0 | } |
976 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
977 | | |
978 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
979 | | simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( |
980 | 0 | const char32_t *buf, size_t len, char *utf8_output) const noexcept { |
981 | 0 | return convert_utf32_to_utf8(buf, len, utf8_output); |
982 | 0 | } |
983 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
984 | | |
985 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
986 | | simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( |
987 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
988 | 0 | std::pair<const char32_t *, char16_t *> ret = |
989 | 0 | avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output); |
990 | 0 | if (ret.first == nullptr) { |
991 | 0 | return 0; |
992 | 0 | } |
993 | 0 | size_t saved_bytes = ret.second - utf16_output; |
994 | 0 | if (ret.first != buf + len) { |
995 | 0 | const size_t scalar_saved_bytes = |
996 | 0 | scalar::utf32_to_utf16::convert<endianness::LITTLE>( |
997 | 0 | ret.first, len - (ret.first - buf), ret.second); |
998 | 0 | if (scalar_saved_bytes == 0) { |
999 | 0 | return 0; |
1000 | 0 | } |
1001 | 0 | saved_bytes += scalar_saved_bytes; |
1002 | 0 | } |
1003 | 0 | return saved_bytes; |
1004 | 0 | } |
1005 | | |
1006 | | simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( |
1007 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
1008 | 0 | std::pair<const char32_t *, char16_t *> ret = |
1009 | 0 | avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output); |
1010 | 0 | if (ret.first == nullptr) { |
1011 | 0 | return 0; |
1012 | 0 | } |
1013 | 0 | size_t saved_bytes = ret.second - utf16_output; |
1014 | 0 | if (ret.first != buf + len) { |
1015 | 0 | const size_t scalar_saved_bytes = |
1016 | 0 | scalar::utf32_to_utf16::convert<endianness::BIG>( |
1017 | 0 | ret.first, len - (ret.first - buf), ret.second); |
1018 | 0 | if (scalar_saved_bytes == 0) { |
1019 | 0 | return 0; |
1020 | 0 | } |
1021 | 0 | saved_bytes += scalar_saved_bytes; |
1022 | 0 | } |
1023 | 0 | return saved_bytes; |
1024 | 0 | } |
1025 | | |
1026 | | simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( |
1027 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
1028 | | // ret.first.count is always the position in the buffer, not the number of |
1029 | | // code units written even if finished |
1030 | 0 | std::pair<result, char16_t *> ret = |
1031 | 0 | haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>( |
1032 | 0 | buf, len, utf16_output); |
1033 | 0 | if (ret.first.count != len) { |
1034 | 0 | result scalar_res = |
1035 | 0 | scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>( |
1036 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
1037 | 0 | if (scalar_res.error) { |
1038 | 0 | scalar_res.count += ret.first.count; |
1039 | 0 | return scalar_res; |
1040 | 0 | } else { |
1041 | 0 | ret.second += scalar_res.count; |
1042 | 0 | } |
1043 | 0 | } |
1044 | 0 | ret.first.count = |
1045 | 0 | ret.second - |
1046 | 0 | utf16_output; // Set count to the number of 8-bit code units written |
1047 | 0 | return ret.first; |
1048 | 0 | } |
1049 | | |
1050 | | simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( |
1051 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
1052 | | // ret.first.count is always the position in the buffer, not the number of |
1053 | | // code units written even if finished |
1054 | 0 | std::pair<result, char16_t *> ret = |
1055 | 0 | haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>( |
1056 | 0 | buf, len, utf16_output); |
1057 | 0 | if (ret.first.count != len) { |
1058 | 0 | result scalar_res = |
1059 | 0 | scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>( |
1060 | 0 | buf + ret.first.count, len - ret.first.count, ret.second); |
1061 | 0 | if (scalar_res.error) { |
1062 | 0 | scalar_res.count += ret.first.count; |
1063 | 0 | return scalar_res; |
1064 | 0 | } else { |
1065 | 0 | ret.second += scalar_res.count; |
1066 | 0 | } |
1067 | 0 | } |
1068 | 0 | ret.first.count = |
1069 | 0 | ret.second - |
1070 | 0 | utf16_output; // Set count to the number of 8-bit code units written |
1071 | 0 | return ret.first; |
1072 | 0 | } |
1073 | | |
1074 | | simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( |
1075 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
1076 | 0 | return convert_utf32_to_utf16le(buf, len, utf16_output); |
1077 | 0 | } |
1078 | | |
1079 | | simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( |
1080 | 0 | const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { |
1081 | 0 | return convert_utf32_to_utf16be(buf, len, utf16_output); |
1082 | 0 | } |
1083 | | |
1084 | | simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( |
1085 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
1086 | 0 | return convert_utf16le_to_utf32(buf, len, utf32_output); |
1087 | 0 | } |
1088 | | |
1089 | | simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( |
1090 | 0 | const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { |
1091 | 0 | return convert_utf16be_to_utf32(buf, len, utf32_output); |
1092 | 0 | } |
1093 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
1094 | | |
1095 | | #if SIMDUTF_FEATURE_UTF16 |
1096 | | void implementation::change_endianness_utf16(const char16_t *input, |
1097 | | size_t length, |
1098 | 0 | char16_t *output) const noexcept { |
1099 | 0 | utf16::change_endianness_utf16(input, length, output); |
1100 | 0 | } |
1101 | | |
1102 | | simdutf_warn_unused size_t implementation::count_utf16le( |
1103 | 0 | const char16_t *input, size_t length) const noexcept { |
1104 | 0 | return utf16::count_code_points<endianness::LITTLE>(input, length); |
1105 | 0 | } |
1106 | | |
1107 | | simdutf_warn_unused size_t implementation::count_utf16be( |
1108 | 0 | const char16_t *input, size_t length) const noexcept { |
1109 | 0 | return utf16::count_code_points<endianness::BIG>(input, length); |
1110 | 0 | } |
1111 | | #endif // SIMDUTF_FEATURE_UTF16 |
1112 | | |
1113 | | #if SIMDUTF_FEATURE_UTF8 |
1114 | | simdutf_warn_unused size_t |
1115 | 0 | implementation::count_utf8(const char *in, size_t size) const noexcept { |
1116 | 0 | return utf8::count_code_points_bytemask(in, size); |
1117 | 0 | } |
1118 | | #endif // SIMDUTF_FEATURE_UTF8 |
1119 | | |
1120 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
1121 | | simdutf_warn_unused size_t implementation::latin1_length_from_utf8( |
1122 | 0 | const char *buf, size_t len) const noexcept { |
1123 | 0 | return count_utf8(buf, len); |
1124 | 0 | } |
1125 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
1126 | | |
1127 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
1128 | | simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( |
1129 | 0 | const char16_t *input, size_t length) const noexcept { |
1130 | 0 | return utf16::utf8_length_from_utf16_bytemask<endianness::LITTLE>(input, |
1131 | 0 | length); |
1132 | 0 | } |
1133 | | |
1134 | | simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( |
1135 | 0 | const char16_t *input, size_t length) const noexcept { |
1136 | 0 | return utf16::utf8_length_from_utf16_bytemask<endianness::BIG>(input, length); |
1137 | 0 | } |
1138 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
1139 | | |
1140 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
1141 | | simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( |
1142 | 0 | const char16_t *input, size_t length) const noexcept { |
1143 | 0 | return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length); |
1144 | 0 | } |
1145 | | |
1146 | | simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( |
1147 | 0 | const char16_t *input, size_t length) const noexcept { |
1148 | 0 | return utf16::utf32_length_from_utf16<endianness::BIG>(input, length); |
1149 | 0 | } |
1150 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
1151 | | |
1152 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
1153 | | simdutf_warn_unused size_t implementation::utf16_length_from_utf8( |
1154 | 0 | const char *input, size_t length) const noexcept { |
1155 | 0 | return utf8::utf16_length_from_utf8_bytemask(input, length); |
1156 | 0 | } |
1157 | | simdutf_warn_unused result |
1158 | | implementation::utf8_length_from_utf16le_with_replacement( |
1159 | 0 | const char16_t *input, size_t length) const noexcept { |
1160 | 0 | return utf16::utf8_length_from_utf16_with_replacement<endianness::LITTLE>( |
1161 | 0 | input, length); |
1162 | 0 | } |
1163 | | |
1164 | | simdutf_warn_unused result |
1165 | | implementation::utf8_length_from_utf16be_with_replacement( |
1166 | 0 | const char16_t *input, size_t length) const noexcept { |
1167 | 0 | return utf16::utf8_length_from_utf16_with_replacement<endianness::BIG>( |
1168 | 0 | input, length); |
1169 | 0 | } |
1170 | | |
1171 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 |
1172 | | |
1173 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
1174 | | simdutf_warn_unused size_t implementation::utf8_length_from_latin1( |
1175 | 0 | const char *input, size_t len) const noexcept { |
1176 | 0 | const uint8_t *data = reinterpret_cast<const uint8_t *>(input); |
1177 | 0 | size_t answer = len / sizeof(__m256i) * sizeof(__m256i); |
1178 | 0 | size_t i = 0; |
1179 | 0 | if (answer >= 2048) { // long strings optimization |
1180 | 0 | __m256i four_64bits = _mm256_setzero_si256(); |
1181 | 0 | while (i + sizeof(__m256i) <= len) { |
1182 | 0 | __m256i runner = _mm256_setzero_si256(); |
1183 | | // We can do up to 255 loops without overflow. |
1184 | 0 | size_t iterations = (len - i) / sizeof(__m256i); |
1185 | 0 | if (iterations > 255) { |
1186 | 0 | iterations = 255; |
1187 | 0 | } |
1188 | 0 | size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); |
1189 | 0 | for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) { |
1190 | 0 | __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); |
1191 | 0 | __m256i input2 = |
1192 | 0 | _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); |
1193 | 0 | __m256i input3 = _mm256_loadu_si256( |
1194 | 0 | (const __m256i *)(data + i + 2 * sizeof(__m256i))); |
1195 | 0 | __m256i input4 = _mm256_loadu_si256( |
1196 | 0 | (const __m256i *)(data + i + 3 * sizeof(__m256i))); |
1197 | 0 | __m256i input12 = |
1198 | 0 | _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), |
1199 | 0 | _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); |
1200 | 0 | __m256i input23 = |
1201 | 0 | _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), |
1202 | 0 | _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); |
1203 | 0 | __m256i input1234 = _mm256_add_epi8(input12, input23); |
1204 | 0 | runner = _mm256_sub_epi8(runner, input1234); |
1205 | 0 | } |
1206 | 0 | for (; i <= max_i; i += sizeof(__m256i)) { |
1207 | 0 | __m256i input_256_chunk = |
1208 | 0 | _mm256_loadu_si256((const __m256i *)(data + i)); |
1209 | 0 | runner = _mm256_sub_epi8( |
1210 | 0 | runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); |
1211 | 0 | } |
1212 | 0 | four_64bits = _mm256_add_epi64( |
1213 | 0 | four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); |
1214 | 0 | } |
1215 | 0 | answer += _mm256_extract_epi64(four_64bits, 0) + |
1216 | 0 | _mm256_extract_epi64(four_64bits, 1) + |
1217 | 0 | _mm256_extract_epi64(four_64bits, 2) + |
1218 | 0 | _mm256_extract_epi64(four_64bits, 3); |
1219 | 0 | } else if (answer > 0) { |
1220 | 0 | for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) { |
1221 | 0 | __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i)); |
1222 | 0 | uint32_t non_ascii = _mm256_movemask_epi8(latin); |
1223 | 0 | answer += count_ones(non_ascii); |
1224 | 0 | } |
1225 | 0 | } |
1226 | 0 | return answer + scalar::latin1::utf8_length_from_latin1( |
1227 | 0 | reinterpret_cast<const char *>(data + i), len - i); |
1228 | 0 | } |
1229 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1 |
1230 | | |
1231 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
1232 | | simdutf_warn_unused size_t implementation::utf8_length_from_utf32( |
1233 | 0 | const char32_t *input, size_t length) const noexcept { |
1234 | 0 | return utf32::utf8_length_from_utf32(input, length); |
1235 | 0 | } |
1236 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
1237 | | |
1238 | | #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
1239 | | simdutf_warn_unused size_t implementation::utf16_length_from_utf32( |
1240 | 0 | const char32_t *input, size_t length) const noexcept { |
1241 | 0 | const __m256i v_00000000 = _mm256_setzero_si256(); |
1242 | 0 | const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); |
1243 | 0 | size_t pos = 0; |
1244 | 0 | size_t count = 0; |
1245 | 0 | for (; pos + 8 <= length; pos += 8) { |
1246 | 0 | __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); |
1247 | 0 | const __m256i surrogate_bytemask = |
1248 | 0 | _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); |
1249 | 0 | const uint32_t surrogate_bitmask = |
1250 | 0 | static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask)); |
1251 | 0 | size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; |
1252 | 0 | count += 8 + surrogate_count; |
1253 | 0 | } |
1254 | 0 | return count + |
1255 | 0 | scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); |
1256 | 0 | } |
1257 | | #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 |
1258 | | |
1259 | | #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
1260 | | simdutf_warn_unused size_t implementation::utf32_length_from_utf8( |
1261 | 0 | const char *input, size_t length) const noexcept { |
1262 | 0 | return utf8::count_code_points(input, length); |
1263 | 0 | } |
1264 | | #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 |
1265 | | |
1266 | | #if SIMDUTF_FEATURE_BASE64 |
1267 | | simdutf_warn_unused result implementation::base64_to_binary( |
1268 | | const char *input, size_t length, char *output, base64_options options, |
1269 | 1.96k | last_chunk_handling_options last_chunk_options) const noexcept { |
1270 | 1.96k | if (options & base64_default_or_url) { |
1271 | 0 | if (options == base64_options::base64_default_or_url_accept_garbage) { |
1272 | 0 | return base64::compress_decode_base64<false, true, true>( |
1273 | 0 | output, input, length, options, last_chunk_options); |
1274 | 0 | } else { |
1275 | 0 | return base64::compress_decode_base64<false, false, true>( |
1276 | 0 | output, input, length, options, last_chunk_options); |
1277 | 0 | } |
1278 | 1.96k | } else if (options & base64_url) { |
1279 | 1.00k | if (options == base64_options::base64_url_accept_garbage) { |
1280 | 0 | return base64::compress_decode_base64<true, true, false>( |
1281 | 0 | output, input, length, options, last_chunk_options); |
1282 | 1.00k | } else { |
1283 | 1.00k | return base64::compress_decode_base64<true, false, false>( |
1284 | 1.00k | output, input, length, options, last_chunk_options); |
1285 | 1.00k | } |
1286 | 1.00k | } else { |
1287 | 960 | if (options == base64_options::base64_default_accept_garbage) { |
1288 | 0 | return base64::compress_decode_base64<false, true, false>( |
1289 | 0 | output, input, length, options, last_chunk_options); |
1290 | 960 | } else { |
1291 | 960 | return base64::compress_decode_base64<false, false, false>( |
1292 | 960 | output, input, length, options, last_chunk_options); |
1293 | 960 | } |
1294 | 960 | } |
1295 | 1.96k | } |
1296 | | |
1297 | | simdutf_warn_unused full_result implementation::base64_to_binary_details( |
1298 | | const char *input, size_t length, char *output, base64_options options, |
1299 | 356 | last_chunk_handling_options last_chunk_options) const noexcept { |
1300 | 356 | if (options & base64_default_or_url) { |
1301 | 0 | if (options == base64_options::base64_default_or_url_accept_garbage) { |
1302 | 0 | return base64::compress_decode_base64<false, true, true>( |
1303 | 0 | output, input, length, options, last_chunk_options); |
1304 | 0 | } else { |
1305 | 0 | return base64::compress_decode_base64<false, false, true>( |
1306 | 0 | output, input, length, options, last_chunk_options); |
1307 | 0 | } |
1308 | 356 | } else if (options & base64_url) { |
1309 | 162 | if (options == base64_options::base64_url_accept_garbage) { |
1310 | 0 | return base64::compress_decode_base64<true, true, false>( |
1311 | 0 | output, input, length, options, last_chunk_options); |
1312 | 162 | } else { |
1313 | 162 | return base64::compress_decode_base64<true, false, false>( |
1314 | 162 | output, input, length, options, last_chunk_options); |
1315 | 162 | } |
1316 | 194 | } else { |
1317 | 194 | if (options == base64_options::base64_default_accept_garbage) { |
1318 | 0 | return base64::compress_decode_base64<false, true, false>( |
1319 | 0 | output, input, length, options, last_chunk_options); |
1320 | 194 | } else { |
1321 | 194 | return base64::compress_decode_base64<false, false, false>( |
1322 | 194 | output, input, length, options, last_chunk_options); |
1323 | 194 | } |
1324 | 194 | } |
1325 | 356 | } |
1326 | | |
1327 | | simdutf_warn_unused result implementation::base64_to_binary( |
1328 | | const char16_t *input, size_t length, char *output, base64_options options, |
1329 | 1.10k | last_chunk_handling_options last_chunk_options) const noexcept { |
1330 | 1.10k | if (options & base64_default_or_url) { |
1331 | 0 | if (options == base64_options::base64_default_or_url_accept_garbage) { |
1332 | 0 | return base64::compress_decode_base64<false, true, true>( |
1333 | 0 | output, input, length, options, last_chunk_options); |
1334 | 0 | } else { |
1335 | 0 | return base64::compress_decode_base64<false, false, true>( |
1336 | 0 | output, input, length, options, last_chunk_options); |
1337 | 0 | } |
1338 | 1.10k | } else if (options & base64_url) { |
1339 | 538 | if (options == base64_options::base64_url_accept_garbage) { |
1340 | 0 | return base64::compress_decode_base64<true, true, false>( |
1341 | 0 | output, input, length, options, last_chunk_options); |
1342 | 538 | } else { |
1343 | 538 | return base64::compress_decode_base64<true, false, false>( |
1344 | 538 | output, input, length, options, last_chunk_options); |
1345 | 538 | } |
1346 | 568 | } else { |
1347 | 568 | if (options == base64_options::base64_default_accept_garbage) { |
1348 | 0 | return base64::compress_decode_base64<false, true, false>( |
1349 | 0 | output, input, length, options, last_chunk_options); |
1350 | 568 | } else { |
1351 | 568 | return base64::compress_decode_base64<false, false, false>( |
1352 | 568 | output, input, length, options, last_chunk_options); |
1353 | 568 | } |
1354 | 568 | } |
1355 | 1.10k | } |
1356 | | |
1357 | | simdutf_warn_unused full_result implementation::base64_to_binary_details( |
1358 | | const char16_t *input, size_t length, char *output, base64_options options, |
1359 | 492 | last_chunk_handling_options last_chunk_options) const noexcept { |
1360 | 492 | if (options & base64_default_or_url) { |
1361 | 0 | if (options == base64_options::base64_default_or_url_accept_garbage) { |
1362 | 0 | return base64::compress_decode_base64<false, true, true>( |
1363 | 0 | output, input, length, options, last_chunk_options); |
1364 | 0 | } else { |
1365 | 0 | return base64::compress_decode_base64<false, false, true>( |
1366 | 0 | output, input, length, options, last_chunk_options); |
1367 | 0 | } |
1368 | 492 | } else if (options & base64_url) { |
1369 | 219 | if (options == base64_options::base64_url_accept_garbage) { |
1370 | 0 | return base64::compress_decode_base64<true, true, false>( |
1371 | 0 | output, input, length, options, last_chunk_options); |
1372 | 219 | } else { |
1373 | 219 | return base64::compress_decode_base64<true, false, false>( |
1374 | 219 | output, input, length, options, last_chunk_options); |
1375 | 219 | } |
1376 | 273 | } else { |
1377 | 273 | if (options == base64_options::base64_default_accept_garbage) { |
1378 | 0 | return base64::compress_decode_base64<false, true, false>( |
1379 | 0 | output, input, length, options, last_chunk_options); |
1380 | 273 | } else { |
1381 | 273 | return base64::compress_decode_base64<false, false, false>( |
1382 | 273 | output, input, length, options, last_chunk_options); |
1383 | 273 | } |
1384 | 273 | } |
1385 | 492 | } |
1386 | | |
1387 | | size_t implementation::binary_to_base64(const char *input, size_t length, |
1388 | | char *output, |
1389 | 1.05k | base64_options options) const noexcept { |
1390 | 1.05k | if (options & base64_url) { |
1391 | 545 | return encode_base64<true>(output, input, length, options); |
1392 | 545 | } else { |
1393 | 512 | return encode_base64<false>(output, input, length, options); |
1394 | 512 | } |
1395 | 1.05k | } |
1396 | | |
1397 | | size_t implementation::binary_to_base64_with_lines( |
1398 | | const char *input, size_t length, char *output, size_t line_length, |
1399 | 1.05k | base64_options options) const noexcept { |
1400 | 1.05k | if (options & base64_url) { |
1401 | 545 | return avx2_encode_base64_impl<true, true>(output, input, length, options, |
1402 | 545 | line_length); |
1403 | 545 | } else { |
1404 | 512 | return avx2_encode_base64_impl<false, true>(output, input, length, options, |
1405 | 512 | line_length); |
1406 | 512 | } |
1407 | 1.05k | } |
1408 | | |
1409 | | const char *implementation::find(const char *start, const char *end, |
1410 | 0 | char character) const noexcept { |
1411 | 0 | return util::find(start, end, character); |
1412 | 0 | } |
1413 | | |
1414 | | const char16_t *implementation::find(const char16_t *start, const char16_t *end, |
1415 | 0 | char16_t character) const noexcept { |
1416 | 0 | return util::find(start, end, character); |
1417 | 0 | } |
1418 | | #endif // SIMDUTF_FEATURE_BASE64 |
1419 | | |
1420 | | } // namespace SIMDUTF_IMPLEMENTATION |
1421 | | } // namespace simdutf |
1422 | | |
1423 | | #include "simdutf/haswell/end.h" |