/src/simdutf/src/scalar/utf8.h
Line | Count | Source |
1 | | #ifndef SIMDUTF_UTF8_H |
2 | | #define SIMDUTF_UTF8_H |
3 | | |
4 | | namespace simdutf { |
5 | | namespace scalar { |
6 | | namespace { |
7 | | namespace utf8 { |
8 | | #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV |
// Only used by the fallback kernel.
// credit: based on code from Google Fuchsia (Apache Licensed)
//
// Returns true when the `len` bytes starting at `buf` are well-formed UTF-8:
// every multi-byte sequence must be complete, use proper continuation bytes,
// be encoded in its shortest form, avoid the surrogate range and not exceed
// U+10FFFF. An empty input is valid.
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t idx = 0;
  while (idx < len) {
    // Fast path: if 16 more bytes are readable, test them for ASCII in one go.
    const uint64_t block_end = idx + 16;
    if (block_end <= len) {
      uint64_t lo;
      uint64_t hi;
      std::memcpy(&lo, bytes + idx, sizeof(lo));
      std::memcpy(&hi, bytes + idx + sizeof(lo), sizeof(hi));
      // A set high bit in any of the 16 bytes means non-ASCII content.
      if (((lo | hi) & 0x8080808080808080) == 0) {
        idx = block_end;
        continue;
      }
    }

    // Step over ASCII bytes one at a time until a non-ASCII byte shows up.
    uint8_t lead = bytes[idx];
    while (lead < 0x80) {
      ++idx;
      if (idx == len) {
        return true;
      }
      lead = bytes[idx];
    }

    // Classify the leading byte: sequence length, payload bits carried by the
    // leading byte, and the smallest code point that legitimately needs a
    // sequence of that length (anything below it is an overlong encoding).
    uint64_t seq_len = 0;
    uint32_t code_point = 0;
    uint32_t smallest = 0;
    if ((lead & 0xE0) == 0xC0) {
      seq_len = 2;
      code_point = static_cast<uint32_t>(lead & 0x1F);
      smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
      seq_len = 3;
      code_point = static_cast<uint32_t>(lead & 0x0F);
      smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
      seq_len = 4;
      code_point = static_cast<uint32_t>(lead & 0x07);
      smallest = 0x10000;
    } else {
      // Stray continuation byte or an invalid leading byte.
      return false;
    }

    // The whole sequence must fit in the input.
    if (idx + seq_len > len) {
      return false;
    }

    // Each trailing byte must look like 10xxxxxx; fold its six payload bits in.
    for (uint64_t k = 1; k < seq_len; ++k) {
      const uint8_t trail = bytes[idx + k];
      if ((trail & 0xC0) != 0x80) {
        return false;
      }
      code_point = (code_point << 6) | static_cast<uint32_t>(trail & 0x3F);
    }

    // Range checks: overlong encodings and values beyond U+10FFFF.
    if (code_point < smallest || code_point > 0x10FFFF) {
      return false;
    }
    // Surrogates (only reachable through a three-byte sequence).
    if (code_point > 0xD7FF && code_point < 0xE000) {
      return false;
    }

    idx += seq_len;
  }
  return true;
}
100 | | #endif |
101 | | |
102 | | inline simdutf_warn_unused result validate_with_errors(const char *buf, |
103 | 6.77k | size_t len) noexcept { |
104 | 6.77k | const uint8_t *data = reinterpret_cast<const uint8_t *>(buf); |
105 | 6.77k | size_t pos = 0; |
106 | 6.77k | uint32_t code_point = 0; |
107 | 5.93M | while (pos < len) { |
108 | | // check of the next 16 bytes are ascii. |
109 | 5.93M | size_t next_pos = pos + 16; |
110 | 5.93M | if (next_pos <= |
111 | 5.93M | len) { // if it is safe to read 16 more bytes, check that they are ascii |
112 | 5.92M | uint64_t v1; |
113 | 5.92M | std::memcpy(&v1, data + pos, sizeof(uint64_t)); |
114 | 5.92M | uint64_t v2; |
115 | 5.92M | std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); |
116 | 5.92M | uint64_t v{v1 | v2}; |
117 | 5.92M | if ((v & 0x8080808080808080) == 0) { |
118 | 4.86M | pos = next_pos; |
119 | 4.86M | continue; |
120 | 4.86M | } |
121 | 5.92M | } |
122 | 1.06M | unsigned char byte = data[pos]; |
123 | | |
124 | 6.09M | while (byte < 0b10000000) { |
125 | 5.02M | if (++pos == len) { |
126 | 1.04k | return result(error_code::SUCCESS, len); |
127 | 1.04k | } |
128 | 5.02M | byte = data[pos]; |
129 | 5.02M | } |
130 | | |
131 | 1.06M | if ((byte & 0b11100000) == 0b11000000) { |
132 | 681k | next_pos = pos + 2; |
133 | 681k | if (next_pos > len) { |
134 | 171 | return result(error_code::TOO_SHORT, pos); |
135 | 171 | } |
136 | 681k | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
137 | 750 | return result(error_code::TOO_SHORT, pos); |
138 | 750 | } |
139 | | // range check |
140 | 680k | code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); |
141 | 680k | if ((code_point < 0x80) || (0x7ff < code_point)) { |
142 | 114 | return result(error_code::OVERLONG, pos); |
143 | 114 | } |
144 | 680k | } else if ((byte & 0b11110000) == 0b11100000) { |
145 | 319k | next_pos = pos + 3; |
146 | 319k | if (next_pos > len) { |
147 | 174 | return result(error_code::TOO_SHORT, pos); |
148 | 174 | } |
149 | 319k | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
150 | 303 | return result(error_code::TOO_SHORT, pos); |
151 | 303 | } |
152 | 319k | if ((data[pos + 2] & 0b11000000) != 0b10000000) { |
153 | 162 | return result(error_code::TOO_SHORT, pos); |
154 | 162 | } |
155 | | // range check |
156 | 319k | code_point = (byte & 0b00001111) << 12 | |
157 | 319k | (data[pos + 1] & 0b00111111) << 6 | |
158 | 319k | (data[pos + 2] & 0b00111111); |
159 | 319k | if ((code_point < 0x800) || (0xffff < code_point)) { |
160 | 123 | return result(error_code::OVERLONG, pos); |
161 | 123 | } |
162 | 318k | if (0xd7ff < code_point && code_point < 0xe000) { |
163 | 66 | return result(error_code::SURROGATE, pos); |
164 | 66 | } |
165 | 318k | } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 |
166 | 63.9k | next_pos = pos + 4; |
167 | 63.9k | if (next_pos > len) { |
168 | 147 | return result(error_code::TOO_SHORT, pos); |
169 | 147 | } |
170 | 63.7k | if ((data[pos + 1] & 0b11000000) != 0b10000000) { |
171 | 339 | return result(error_code::TOO_SHORT, pos); |
172 | 339 | } |
173 | 63.4k | if ((data[pos + 2] & 0b11000000) != 0b10000000) { |
174 | 129 | return result(error_code::TOO_SHORT, pos); |
175 | 129 | } |
176 | 63.3k | if ((data[pos + 3] & 0b11000000) != 0b10000000) { |
177 | 120 | return result(error_code::TOO_SHORT, pos); |
178 | 120 | } |
179 | | // range check |
180 | 63.2k | code_point = |
181 | 63.2k | (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | |
182 | 63.2k | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); |
183 | 63.2k | if (code_point <= 0xffff) { |
184 | 135 | return result(error_code::OVERLONG, pos); |
185 | 135 | } |
186 | 63.0k | if (0x10ffff < code_point) { |
187 | 105 | return result(error_code::TOO_LARGE, pos); |
188 | 105 | } |
189 | 63.0k | } else { |
190 | | // we either have too many continuation bytes or an invalid leading byte |
191 | 2.29k | if ((byte & 0b11000000) == 0b10000000) { |
192 | 1.23k | return result(error_code::TOO_LONG, pos); |
193 | 1.23k | } else { |
194 | 1.05k | return result(error_code::HEADER_BITS, pos); |
195 | 1.05k | } |
196 | 2.29k | } |
197 | 1.06M | pos = next_pos; |
198 | 1.06M | } |
199 | 597 | return result(error_code::SUCCESS, len); |
200 | 6.77k | } |
201 | | |
202 | | // Finds the previous leading byte starting backward from buf and validates with |
203 | | // errors from there Used to pinpoint the location of an error when an invalid |
204 | | // chunk is detected We assume that the stream starts with a leading byte, and |
205 | | // to check that it is the case, we ask that you pass a pointer to the start of |
206 | | // the stream (start). |
207 | | inline simdutf_warn_unused result rewind_and_validate_with_errors( |
208 | 3.60k | const char *start, const char *buf, size_t len) noexcept { |
209 | | // First check that we start with a leading byte |
210 | 3.60k | if ((*start & 0b11000000) == 0b10000000) { |
211 | 274 | return result(error_code::TOO_LONG, 0); |
212 | 274 | } |
213 | 3.32k | size_t extra_len{0}; |
214 | | // A leading byte cannot be further than 4 bytes away |
215 | 3.49k | for (int i = 0; i < 5; i++) { |
216 | 3.49k | unsigned char byte = *buf; |
217 | 3.49k | if ((byte & 0b11000000) != 0b10000000) { |
218 | 3.32k | break; |
219 | 3.32k | } else { |
220 | 170 | buf--; |
221 | 170 | extra_len++; |
222 | 170 | } |
223 | 3.49k | } |
224 | | |
225 | 3.32k | result res = validate_with_errors(buf, len + extra_len); |
226 | 3.32k | res.count -= extra_len; |
227 | 3.32k | return res; |
228 | 3.60k | } |
229 | | |
// Counts the bytes in [buf, buf + len) that are not UTF-8 continuation bytes
// (i.e. not of the form 10xxxxxx). Each such byte starts a new code point, so
// for valid UTF-8 the result is the number of code points. No validation is
// performed.
inline size_t count_code_points(const char *buf, size_t len) {
  const uint8_t *cursor = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *const stop = cursor + len;
  size_t total = 0;
  for (; cursor != stop; ++cursor) {
    // Everything except 0x80..0xBF begins a code point.
    if ((*cursor & 0xC0) != 0x80) {
      ++total;
    }
  }
  return total;
}
242 | | |
// Computes how many UTF-16 code units the UTF-8 bytes in [buf, buf + len)
// would occupy: one unit per non-continuation byte, plus one extra unit for
// every byte >= 0xF0 (a four-byte leading byte, whose code point needs a
// surrogate pair). No validation is performed.
inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
  size_t units = 0;
  for (size_t i = 0; i < len; ++i) {
    const uint8_t current = static_cast<uint8_t>(buf[i]);
    // Anything other than 10xxxxxx starts a code point.
    const bool starts_code_point = (current & 0xC0) != 0x80;
    // Four-byte leading bytes account for a second UTF-16 unit.
    const bool needs_second_unit = current >= 0xF0;
    if (starts_code_point) {
      ++units;
    }
    if (needs_second_unit) {
      ++units;
    }
  }
  return units;
}
256 | | |
// Returns the length of the longest prefix of `input` that does not end in the
// middle of a multi-byte UTF-8 sequence, judged only by the last three bytes:
//   - last byte        >= 0xc0: a leading byte with none of its continuation
//                               bytes present                 -> drop 1 byte
//   - second-to-last   >= 0xe0: a 3- or 4-byte leader followed by only one
//                               more byte                     -> drop 2 bytes
//   - third-to-last    >= 0xf0: a 4-byte leader followed by only two more
//                               bytes                         -> drop 3 bytes
// Otherwise `length` is returned unchanged. The input is not validated.
//
// Each check is guarded by the available length, so inputs shorter than three
// bytes (including the empty input) never read before the start of `input`.
simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
                                                    size_t length) {
  // 2-, 3- and 4-byte characters with only 1 byte present
  if (length >= 1 && uint8_t(input[length - 1]) >= 0xc0) {
    return length - 1;
  }
  // 3- and 4-byte characters with only 2 bytes present
  if (length >= 2 && uint8_t(input[length - 2]) >= 0xe0) {
    return length - 2;
  }
  // 4-byte characters with only 3 bytes present
  if (length >= 3 && uint8_t(input[length - 3]) >= 0xf0) {
    return length - 3;
  }
  return length;
}
289 | | |
290 | | } // namespace utf8 |
291 | | } // unnamed namespace |
292 | | } // namespace scalar |
293 | | } // namespace simdutf |
294 | | |
295 | | #endif |