/work/include/simdutf/scalar/utf8.h
Line | Count | Source |
1 | | #ifndef SIMDUTF_UTF8_H |
2 | | #define SIMDUTF_UTF8_H |
3 | | |
4 | | namespace simdutf { |
5 | | namespace scalar { |
6 | | namespace { |
7 | | namespace utf8 { |
8 | | |
// credit: based on code from Google Fuchsia (Apache Licensed)
//
// Checks whether the `len` bytes starting at `data` are well-formed UTF-8.
//
// BytePtr must dereference to uint8_t (enforced by the static_assert below).
// Returns true when every sequence is complete, uses the shortest possible
// encoding, is not a surrogate (U+D800..U+DFFF) and does not exceed U+10FFFF;
// returns false otherwise. An empty input is valid.
template <class BytePtr>
simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
                                                      size_t len) noexcept {
  static_assert(
      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
      "dereferencing the data pointer must result in a uint8_t");
  uint64_t idx = 0;
  while (idx < len) {
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    {
      // Fast path: when 16 more bytes are readable, load them as two 64-bit
      // words and skip the whole chunk if no byte has its high bit set.
      if (idx + 16 <= len) {
        uint64_t low_word{};
        uint64_t high_word{};
        std::memcpy(&low_word, data + idx, sizeof(uint64_t));
        std::memcpy(&high_word, data + idx + sizeof(uint64_t),
                    sizeof(uint64_t));
        if (((low_word | high_word) & 0x8080808080808080) == 0) {
          idx += 16;
          continue;
        }
      }
    }

    // Step over ASCII bytes one at a time until a non-ASCII byte shows up.
    unsigned char lead = data[idx];
    while (lead < 0x80) {
      ++idx;
      if (idx == len) {
        return true;
      }
      lead = data[idx];
    }

    // Classify the leading byte: sequence width, payload bits carried by the
    // leading byte, and the smallest code point that legitimately needs this
    // width (anything smaller is an overlong encoding).
    uint64_t width = 0;
    uint32_t cp = 0;
    uint32_t smallest = 0;
    if ((lead & 0xE0) == 0xC0) {
      width = 2;
      cp = lead & 0x1F;
      smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
      width = 3;
      cp = lead & 0x0F;
      smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
      width = 4;
      cp = lead & 0x07;
      smallest = 0x10000;
    } else {
      // stray continuation byte or invalid leading byte
      return false;
    }

    // The sequence must fit entirely inside the input.
    if (idx + width > len) {
      return false;
    }

    // Every trailing byte must look like 0b10xxxxxx; fold its 6 payload bits
    // into the code point.
    for (uint64_t k = 1; k < width; k++) {
      const unsigned char trail = data[idx + k];
      if ((trail & 0xC0) != 0x80) {
        return false;
      }
      cp = (cp << 6) | (trail & 0x3F);
    }

    // Range checks: overlong encoding, beyond U+10FFFF, or a surrogate.
    if (cp < smallest || cp > 0x10ffff) {
      return false;
    }
    if (cp > 0xd7ff && cp < 0xe000) {
      return false;
    }

    idx += width;
  }
  return true;
}
109 | | |
110 | | simdutf_really_inline simdutf_warn_unused bool validate(const char *buf, |
111 | 0 | size_t len) noexcept { |
112 | 0 | return validate(reinterpret_cast<const uint8_t *>(buf), len); |
113 | 0 | } Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long) |
114 | | |
115 | | template <class BytePtr> |
116 | | simdutf_constexpr23 simdutf_warn_unused result |
117 | 0 | validate_with_errors(BytePtr data, size_t len) noexcept { |
118 | 0 | static_assert( |
119 | 0 | std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value, |
120 | 0 | "dereferencing the data pointer must result in a uint8_t"); |
121 | 0 | size_t pos = 0; |
122 | 0 | uint32_t code_point = 0; |
123 | 0 | while (pos < len) { |
124 | 0 | // check of the next 16 bytes are ascii. |
125 | 0 | size_t next_pos = pos + 16; |
126 | 0 | if (next_pos <= |
127 | 0 | len) { // if it is safe to read 16 more bytes, check that they are ascii |
128 | 0 | uint64_t v1; |
129 | 0 | std::memcpy(&v1, data + pos, sizeof(uint64_t)); |
130 | 0 | uint64_t v2; |
131 | 0 | std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); |
132 | 0 | uint64_t v{v1 | v2}; |
133 | 0 | if ((v & 0x8080808080808080) == 0) { |
134 | 0 | pos = next_pos; |
135 | 0 | continue; |
136 | 0 | } |
137 | 0 | } |
138 | 0 | unsigned char byte = data[pos]; |
139 | 0 |
|
140 | 0 | while (byte < 0b10000000) { |
141 | 0 | if (++pos == len) { |
142 | 0 | return result(error_code::SUCCESS, len); |
143 | 0 | } |
144 | 0 | byte = data[pos]; |
145 | 0 | } |
146 | 0 |
|
147 | 0 | if ((byte & 0b11100000) == 0b11000000) { |
148 | 0 | next_pos = pos + 2; |
149 | 0 | if (next_pos > len) { |
150 | 0 | return result(error_code::TOO_SHORT, pos); |
151 | 0 | } |
152 | 0 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
153 | 0 | return result(error_code::TOO_SHORT, pos); |
154 | 0 | } |
155 | 0 | // range check |
156 | 0 | code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); |
157 | 0 | if ((code_point < 0x80) || (0x7ff < code_point)) { |
158 | 0 | return result(error_code::OVERLONG, pos); |
159 | 0 | } |
160 | 0 | } else if ((byte & 0b11110000) == 0b11100000) { |
161 | 0 | next_pos = pos + 3; |
162 | 0 | if (next_pos > len) { |
163 | 0 | return result(error_code::TOO_SHORT, pos); |
164 | 0 | } |
165 | 0 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
166 | 0 | return result(error_code::TOO_SHORT, pos); |
167 | 0 | } |
168 | 0 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { |
169 | 0 | return result(error_code::TOO_SHORT, pos); |
170 | 0 | } |
171 | 0 | // range check |
172 | 0 | code_point = (byte & 0b00001111) << 12 | |
173 | 0 | (data[pos + 1] & 0b00111111) << 6 | |
174 | 0 | (data[pos + 2] & 0b00111111); |
175 | 0 | if ((code_point < 0x800) || (0xffff < code_point)) { |
176 | 0 | return result(error_code::OVERLONG, pos); |
177 | 0 | } |
178 | 0 | if (0xd7ff < code_point && code_point < 0xe000) { |
179 | 0 | return result(error_code::SURROGATE, pos); |
180 | 0 | } |
181 | 0 | } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 |
182 | 0 | next_pos = pos + 4; |
183 | 0 | if (next_pos > len) { |
184 | 0 | return result(error_code::TOO_SHORT, pos); |
185 | 0 | } |
186 | 0 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
187 | 0 | return result(error_code::TOO_SHORT, pos); |
188 | 0 | } |
189 | 0 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { |
190 | 0 | return result(error_code::TOO_SHORT, pos); |
191 | 0 | } |
192 | 0 | if ((data[pos + 3] & 0b11000000) != 0b10000000) { |
193 | 0 | return result(error_code::TOO_SHORT, pos); |
194 | 0 | } |
195 | 0 | // range check |
196 | 0 | code_point = |
197 | 0 | (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | |
198 | 0 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); |
199 | 0 | if (code_point <= 0xffff) { |
200 | 0 | return result(error_code::OVERLONG, pos); |
201 | 0 | } |
202 | 0 | if (0x10ffff < code_point) { |
203 | 0 | return result(error_code::TOO_LARGE, pos); |
204 | 0 | } |
205 | 0 | } else { |
206 | 0 | // we either have too many continuation bytes or an invalid leading byte |
207 | 0 | if ((byte & 0b11000000) == 0b10000000) { |
208 | 0 | return result(error_code::TOO_LONG, pos); |
209 | 0 | } else { |
210 | 0 | return result(error_code::HEADER_BITS, pos); |
211 | 0 | } |
212 | 0 | } |
213 | 0 | pos = next_pos; |
214 | 0 | } |
215 | 0 | return result(error_code::SUCCESS, len); |
216 | 0 | } Unexecuted instantiation: roundtrip.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long) |
217 | | |
218 | | simdutf_really_inline simdutf_warn_unused result |
219 | 0 | validate_with_errors(const char *buf, size_t len) noexcept { |
220 | 0 | return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len); |
221 | 0 | } Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long) |
222 | | |
223 | | // Finds the previous leading byte starting backward from buf and validates with |
224 | | // errors from there Used to pinpoint the location of an error when an invalid |
225 | | // chunk is detected We assume that the stream starts with a leading byte, and |
226 | | // to check that it is the case, we ask that you pass a pointer to the start of |
227 | | // the stream (start). |
228 | | inline simdutf_warn_unused result rewind_and_validate_with_errors( |
229 | 0 | const char *start, const char *buf, size_t len) noexcept { |
230 | 0 | // First check that we start with a leading byte |
231 | 0 | if ((*start & 0b11000000) == 0b10000000) { |
232 | 0 | return result(error_code::TOO_LONG, 0); |
233 | 0 | } |
234 | 0 | size_t extra_len{0}; |
235 | 0 | // A leading byte cannot be further than 4 bytes away |
236 | 0 | for (int i = 0; i < 5; i++) { |
237 | 0 | unsigned char byte = *buf; |
238 | 0 | if ((byte & 0b11000000) != 0b10000000) { |
239 | 0 | break; |
240 | 0 | } else { |
241 | 0 | buf--; |
242 | 0 | extra_len++; |
243 | 0 | } |
244 | 0 | } |
245 | 0 |
|
246 | 0 | result res = validate_with_errors(buf, len + extra_len); |
247 | 0 | res.count -= extra_len; |
248 | 0 | return res; |
249 | 0 | } Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long) |
250 | | |
// Counts the bytes among the first `len` elements of `data` that are not
// UTF-8 continuation bytes (0b10xxxxxx). For valid UTF-8 input this is the
// number of code points.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
  size_t total{0};
  for (size_t idx = 0; idx != len; ++idx) {
    // -65 is 0b10111111 in two's complement; any byte that compares greater
    // as a signed value is not a continuation byte and so starts a new code
    // point.
    const bool starts_code_point = int8_t(data[idx]) > -65;
    total += starts_code_point ? 1 : 0;
  }
  return total;
}
266 | | |
// Computes how many UTF-16 code units the first `len` bytes of the UTF-8
// input `data` would occupy: one unit for every non-continuation byte, plus
// one extra unit for every byte >= 240 (0xF0), since the code point it
// introduces takes two UTF-16 units.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
  size_t units{0};
  for (size_t idx = 0; idx != len; ++idx) {
    // anything above 0b10111111 (-65 as a signed byte) starts a code point
    const bool starts_code_point = int8_t(data[idx]) > -65;
    // bytes >= 0xF0 get a second unit
    const bool needs_second_unit = uint8_t(data[idx]) >= 240;
    units += starts_code_point ? 1 : 0;
    units += needs_second_unit ? 1 : 0;
  }
  return units;
}
283 | | |
// Returns the length of the longest prefix of `input[0..length)` that does
// not end inside a multi-byte UTF-8 sequence, judged only from the last three
// bytes: the result is `length` minus 0, 1, 2 or 3.
//
// Only leading-byte thresholds are inspected; the input is not validated.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_warn_unused simdutf_constexpr23 size_t
trim_partial_utf8(InputPtr input, size_t length) {
  // Last byte is a leading byte: a 2-, 3- or 4-byte character with only
  // 1 byte present.
  if (length >= 1 && uint8_t(input[length - 1]) >= 0xc0) {
    return length - 1;
  }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes
  // present.
  if (length >= 2 && uint8_t(input[length - 2]) >= 0xe0) {
    return length - 2;
  }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (length >= 3 && uint8_t(input[length - 3]) >= 0xf0) {
    return length - 3;
  }
  // Nothing to trim (this also covers the empty input).
  return length;
}
320 | | |
321 | | } // namespace utf8 |
322 | | } // unnamed namespace |
323 | | } // namespace scalar |
324 | | } // namespace simdutf |
325 | | |
326 | | #endif |