/src/dovecot/src/lib/unichar.c
Line | Count | Source |
1 | | /* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */ |
2 | | |
3 | | #include "lib.h" |
4 | | #include "array.h" |
5 | | #include "str.h" |
6 | | #include "bsearch-insert-pos.h" |
7 | | #include "unicode-data.h" |
8 | | #include "unicode-transform.h" |
9 | | #include "unichar.h" |
10 | | |
/* The three bytes that encode the replacement character in UTF-8. */
const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
	{ 0xef, 0xbf, 0xbd }; /* 0xfffd */

/* Sequence lengths for the 62 highest byte values. The initializer holds
   30 entries of 2, then 16 of 3, 8 of 4, 4 of 5, 2 of 6 and finally 2 of 1.
   NOTE(review): the size expression suggests the table is indexed with
   (byte - 192 - 2), i.e. it starts at byte 0xc2. The lookup code is not in
   this file - confirm against uni_utf8_char_bytes(). */
static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Exported read-only pointer to the table above. */
const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
20 | | |
21 | | unsigned int uni_strlen(const unichar_t *str) |
22 | 0 | { |
23 | 0 | unsigned int len = 0; |
24 | |
|
25 | 0 | for (len = 0; str[len] != 0; len++) ; |
26 | |
|
27 | 0 | return len; |
28 | 0 | } |
29 | | |
30 | | static int |
31 | | uni_utf8_parse_char(const void *_buffer, size_t size, bool cstr, |
32 | | unichar_t *chr_r) |
33 | 538M | { |
34 | 538M | static unichar_t lowest_valid_chr_table[] = |
35 | 538M | { 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 }; |
36 | 538M | const unsigned char *input = _buffer; |
37 | 538M | unichar_t chr, lowest_valid_chr; |
38 | 538M | unsigned int i, len; |
39 | 538M | int ret; |
40 | | |
41 | 538M | i_assert(size > 0); |
42 | | |
43 | 538M | if (*input < 0x80) { |
44 | 185M | *chr_r = *input; |
45 | 185M | return 1; |
46 | 185M | } |
47 | | |
48 | | /* first byte has len highest bits set, followed by zero bit. |
49 | | the rest of the bits are used as the highest bits of the value. */ |
50 | 353M | chr = *input; |
51 | 353M | len = uni_utf8_char_bytes(*input); |
52 | 353M | switch (len) { |
53 | 79.9M | case 2: |
54 | 79.9M | chr &= 0x1f; |
55 | 79.9M | break; |
56 | 260M | case 3: |
57 | 260M | chr &= 0x0f; |
58 | 260M | break; |
59 | 2.99M | case 4: |
60 | 2.99M | chr &= 0x07; |
61 | 2.99M | break; |
62 | 178k | case 5: |
63 | 178k | chr &= 0x03; |
64 | 178k | break; |
65 | 94.6k | case 6: |
66 | 94.6k | chr &= 0x01; |
67 | 94.6k | break; |
68 | 9.88M | default: |
69 | | /* only 7bit chars should have len==1 */ |
70 | 9.88M | i_assert(len == 1); |
71 | 9.88M | return -1; |
72 | 353M | } |
73 | | |
74 | 343M | if (len <= size) { |
75 | 343M | lowest_valid_chr = lowest_valid_chr_table[len]; |
76 | 343M | ret = len; |
77 | 343M | } else { |
78 | | /* check first if the input is invalid before returning 0 */ |
79 | 38.9k | lowest_valid_chr = 0; |
80 | 38.9k | ret = 0; |
81 | 38.9k | len = size; |
82 | 38.9k | } |
83 | | |
84 | | /* the following bytes must all be 10xxxxxx */ |
85 | 936M | for (i = 1; i < len; i++) { |
86 | 602M | if ((input[i] & 0xc0) != 0x80) { |
87 | 9.18M | return (cstr && size == SIZE_MAX && input[i] == '\0' ? |
88 | 9.18M | 0 : -1); |
89 | 9.18M | } |
90 | | |
91 | 592M | chr <<= 6; |
92 | 592M | chr |= input[i] & 0x3f; |
93 | 592M | } |
94 | | /* these are specified as invalid encodings by standards |
95 | | see RFC3629 */ |
96 | 334M | if (!uni_is_valid_ucs4(chr)) |
97 | 43.1k | return -1; |
98 | 334M | if (chr < lowest_valid_chr) { |
99 | | /* overlong encoding */ |
100 | 2.34k | return -1; |
101 | 2.34k | } |
102 | | |
103 | 334M | *chr_r = chr; |
104 | 334M | return ret; |
105 | 334M | } |
106 | | |
/* Decode one UTF-8 sequence from input without an explicit length: the
   parser is called with size SIZE_MAX and cstr TRUE, so a NUL byte in place
   of an expected continuation byte gives 0 rather than -1.
   Returns whatever uni_utf8_parse_char() returns. */
int uni_utf8_get_char(const char *input, unichar_t *chr_r)
{
	return uni_utf8_parse_char(input, SIZE_MAX, TRUE, chr_r);
}

/* Decode one UTF-8 sequence from at most max_len bytes of input. cstr is
   passed as TRUE, but the parser's NUL special case only applies when
   max_len is SIZE_MAX. max_len must be > 0 (asserted by the parser).
   Returns whatever uni_utf8_parse_char() returns. */
int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r)
{
	return uni_utf8_parse_char(input, max_len, TRUE, chr_r);
}

/* Decode one UTF-8 sequence from a buffer of size bytes, with cstr FALSE so
   NUL bytes get no special treatment. size must be > 0 (asserted by the
   parser). Returns whatever uni_utf8_parse_char() returns. */
int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r)
{
	return uni_utf8_parse_char(buffer, size, FALSE, chr_r);
}
121 | | |
122 | | int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output) |
123 | 0 | { |
124 | 0 | unichar_t chr; |
125 | |
|
126 | 0 | while (*input != '\0') { |
127 | 0 | int len = uni_utf8_get_char(input, &chr); |
128 | 0 | if (len <= 0) { |
129 | | /* invalid input */ |
130 | 0 | return -1; |
131 | 0 | } |
132 | 0 | input += len; |
133 | |
|
134 | 0 | array_push_back(output, &chr); |
135 | 0 | } |
136 | 0 | return 0; |
137 | 0 | } |
138 | | |
139 | | int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size, |
140 | | ARRAY_TYPE(unichars) *output) |
141 | 0 | { |
142 | 0 | unichar_t chr; |
143 | |
|
144 | 0 | while (size > 0) { |
145 | 0 | int len = uni_utf8_get_char_n(input, size, &chr); |
146 | 0 | if (len <= 0) |
147 | 0 | return -1; /* invalid input */ |
148 | 0 | input += len; size -= len; |
149 | |
|
150 | 0 | array_push_back(output, &chr); |
151 | 0 | } |
152 | 0 | return 0; |
153 | 0 | } |
154 | | |
155 | | void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output) |
156 | 5.27M | { |
157 | 61.3M | for (; len > 0 && *input != '\0'; input++, len--) |
158 | 56.0M | uni_ucs4_to_utf8_c(*input, output); |
159 | 5.27M | } |
160 | | |
161 | | void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output) |
162 | 56.1M | { |
163 | 56.1M | unsigned char first; |
164 | 56.1M | int bitpos; |
165 | | |
166 | 56.1M | if (chr < 0x80) { |
167 | 38.0M | buffer_append_c(output, chr); |
168 | 38.0M | return; |
169 | 38.0M | } |
170 | | |
171 | 56.1M | i_assert(uni_is_valid_ucs4(chr)); |
172 | | |
173 | 18.0M | if (chr < (1 << (6 + 5))) { |
174 | | /* 110xxxxx */ |
175 | 511k | bitpos = 6; |
176 | 511k | first = 0x80 | 0x40; |
177 | 17.5M | } else if (chr < (1 << ((2*6) + 4))) { |
178 | | /* 1110xxxx */ |
179 | 17.5M | bitpos = 2*6; |
180 | 17.5M | first = 0x80 | 0x40 | 0x20; |
181 | 17.5M | } else if (chr < (1 << ((3*6) + 3))) { |
182 | | /* 11110xxx */ |
183 | 6.91k | bitpos = 3*6; |
184 | 6.91k | first = 0x80 | 0x40 | 0x20 | 0x10; |
185 | 6.91k | } else if (chr < (1 << ((4*6) + 2))) { |
186 | | /* 111110xx */ |
187 | 0 | bitpos = 4*6; |
188 | 0 | first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08; |
189 | 0 | } else { |
190 | | /* 1111110x */ |
191 | 0 | bitpos = 5*6; |
192 | 0 | first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04; |
193 | 0 | } |
194 | 18.0M | buffer_append_c(output, first | (chr >> bitpos)); |
195 | | |
196 | 35.6M | do { |
197 | 35.6M | bitpos -= 6; |
198 | 35.6M | buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f)); |
199 | 35.6M | } while (bitpos > 0); |
200 | 18.0M | } |
201 | | |
/* Returns the character count of the NUL-terminated string input, as
   computed by uni_utf8_strlen_n() over strlen(input) bytes. */
unsigned int uni_utf8_strlen(const char *input)
{
	return uni_utf8_strlen_n(input, strlen(input));
}

/* Returns the character count of the first size bytes of input, as
   computed by uni_utf8_partial_strlen_n(). The position of a trailing
   partial character is discarded. */
unsigned int uni_utf8_strlen_n(const void *input, size_t size)
{
	size_t partial_pos;

	return uni_utf8_partial_strlen_n(input, size, &partial_pos);
}
213 | | |
/* Count the characters in the first size bytes of _input. Each character's
   length is taken from uni_utf8_char_bytes() of its first byte only; the
   following bytes are not inspected. Counting stops at the first character
   that would extend past size. *partial_pos_r receives the byte offset
   where counting stopped (== size when no character was cut off).
   Returns the number of characters counted. */
unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
				       size_t *partial_pos_r)
{
	const unsigned char *bytes = _input;
	unsigned int chars = 0;
	size_t pos = 0;

	while (pos < size) {
		unsigned int seq_len = uni_utf8_char_bytes(bytes[pos]);

		if (pos + seq_len > size) {
			/* last character is only partially present */
			break;
		}
		pos += seq_len;
		chars++;
	}
	*partial_pos_r = pos;
	return chars;
}
231 | | |
232 | | unichar_t uni_ucs4_to_titlecase(unichar_t chr) |
233 | 0 | { |
234 | 0 | const struct unicode_code_point_data *cp_data = |
235 | 0 | unicode_code_point_get_data(chr); |
236 | |
|
237 | 0 | if (cp_data->simple_titlecase_mapping != 0x0000) |
238 | 0 | return cp_data->simple_titlecase_mapping; |
239 | 0 | return chr; |
240 | 0 | } |
241 | | |
242 | | static void output_add_replacement_char(buffer_t *output) |
243 | 230k | { |
244 | 230k | if (output->used >= UTF8_REPLACEMENT_CHAR_LEN && |
245 | 228k | memcmp(CONST_PTR_OFFSET(output->data, |
246 | 228k | output->used - UTF8_REPLACEMENT_CHAR_LEN), |
247 | 228k | utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) { |
248 | | /* don't add the replacement char multiple times */ |
249 | 214k | return; |
250 | 214k | } |
251 | 15.7k | buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN); |
252 | 15.7k | } |
253 | | |
/* Decode size bytes of UTF-8 from _input and feed the code points one at a
   time into the transform chain starting at trans. A buffer sink writing to
   output is chained after the last transform of trans.

   Invalid input does not abort the run: the offending byte is skipped,
   UNICODE_REPLACEMENT_CHAR is fed instead, and the final result is -1.

   Returns 0 if all input decoded cleanly, -1 if any invalid byte was seen
   or if unicode_transform_input() failed (in which case the function
   returns immediately, without flushing). A flush failure panics. */
int uni_utf8_run_transform(const void *_input, size_t size,
			   struct unicode_transform *trans, buffer_t *output,
			   const char **error_r)
{
	struct unicode_transform *trans_last =
		unicode_transform_get_last(trans);
	struct unicode_buffer_sink sink;
	const unsigned char *input = _input;
	unichar_t chr;
	ssize_t sret;
	bool got_chr = FALSE, bad_cp = FALSE;
	int ret = 0;

	unicode_buffer_sink_init(&sink, output);
	unicode_transform_chain(trans_last, &sink.transform);

	/* NOTE(review): got_chr is initialized to FALSE and is only ever
	   assigned FALSE below, so the decode branch runs on every iteration
	   and a code point for which unicode_transform_input() returns 0 is
	   never offered again. It looks like a "got_chr = TRUE" after
	   decoding was intended - confirm against the semantics of
	   unicode_transform_input() before changing. */
	while (size > 0 || got_chr) {
		if (!got_chr) {
			int bytes = uni_utf8_get_char_n(input, size, &chr);
			if (bytes <= 0) {
				/* Invalid input. try the next byte. */
				ret = -1;
				input++; size--;
				/* On the first bad byte of a run chr becomes
				   the replacement code point. On further bad
				   bytes chr is left untouched (the decoder
				   does not write it on failure), so the same
				   replacement code point is fed to the
				   transform once per bad byte.
				   NOTE(review): verify that repeated
				   replacement code points are intended. */
				if (!bad_cp) {
					chr = UNICODE_REPLACEMENT_CHAR;
					bad_cp = TRUE;
				}
			} else {
				input += bytes;
				size -= bytes;
				bad_cp = FALSE;
			}
		}

		sret = unicode_transform_input(trans, &chr, 1, error_r);
		if (sret < 0)
			return -1;
		if (sret > 0)
			got_chr = FALSE;
	}

	/* all input consumed: flush whatever the transforms still hold */
	int fret = unicode_transform_flush(trans, error_r);
	if (fret < 0)
		i_panic("unicode_transform_flush(): %s", *error_r);
	i_assert(fret == 1);
	return ret;
}
301 | | |
302 | | static inline int |
303 | | uni_utf8_write_nf_common(const void *_input, size_t size, |
304 | | enum unicode_nf_type nf_type, buffer_t *output) |
305 | 1.43M | { |
306 | 1.43M | static struct unicode_nf_context ctx; |
307 | 1.43M | const char *error; |
308 | | |
309 | 1.43M | unicode_nf_init(&ctx, nf_type); |
310 | | |
311 | 1.43M | return uni_utf8_run_transform(_input, size, &ctx.transform, output, |
312 | 1.43M | &error); |
313 | 1.43M | } |
314 | | |
/* Write the UNICODE_NFD normalization of the size bytes at input to output.
   Returns the result of uni_utf8_write_nf_common(). */
int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output)
{
	return uni_utf8_write_nf_common(input, size, UNICODE_NFD, output);
}

/* Same as uni_utf8_write_nfd(), but with UNICODE_NFKD. */
int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output)
{
	return uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output);
}

/* Same as uni_utf8_write_nfd(), but with UNICODE_NFC. */
int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output)
{
	return uni_utf8_write_nf_common(input, size, UNICODE_NFC, output);
}

/* Same as uni_utf8_write_nfd(), but with UNICODE_NFKC. */
int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output)
{
	return uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output);
}
334 | | |
335 | | int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r) |
336 | 0 | { |
337 | 0 | buffer_t *output = t_buffer_create(size); |
338 | |
|
339 | 0 | if (uni_utf8_write_nf_common(input, size, UNICODE_NFD, output) < 0) |
340 | 0 | return -1; |
341 | 0 | *output_r = str_c(output); |
342 | 0 | return 0; |
343 | 0 | } |
344 | | |
345 | | int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r) |
346 | 0 | { |
347 | 0 | buffer_t *output = t_buffer_create(size); |
348 | |
|
349 | 0 | if (uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output) < 0) |
350 | 0 | return -1; |
351 | 0 | *output_r = str_c(output); |
352 | 0 | return 0; |
353 | 0 | } |
354 | | |
355 | | int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r) |
356 | 0 | { |
357 | 0 | buffer_t *output = t_buffer_create(size); |
358 | |
|
359 | 0 | if (uni_utf8_write_nf_common(input, size, UNICODE_NFC, output) < 0) |
360 | 0 | return -1; |
361 | 0 | *output_r = str_c(output); |
362 | 0 | return 0; |
363 | 0 | } |
364 | | |
365 | | int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r) |
366 | 0 | { |
367 | 0 | buffer_t *output = t_buffer_create(size); |
368 | |
|
369 | 0 | if (uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output) < 0) |
370 | 0 | return -1; |
371 | 0 | *output_r = str_c(output); |
372 | 0 | return 0; |
373 | 0 | } |
374 | | |
375 | | static int |
376 | | uni_utf8_is_nf(const void *_input, size_t size, enum unicode_nf_type type) |
377 | 0 | { |
378 | 0 | static struct unicode_nf_checker unc; |
379 | 0 | const unsigned char *input = _input; |
380 | 0 | unichar_t chr; |
381 | 0 | int ret; |
382 | |
|
383 | 0 | unicode_nf_checker_init(&unc, type); |
384 | |
|
385 | 0 | while (size > 0) { |
386 | 0 | const struct unicode_code_point_data *cp_data = NULL; |
387 | 0 | int bytes = uni_utf8_get_char_n(input, size, &chr); |
388 | 0 | if (bytes <= 0) |
389 | 0 | return -1; |
390 | 0 | input += bytes; |
391 | 0 | size -= bytes; |
392 | |
|
393 | 0 | ret = unicode_nf_checker_input(&unc, chr, &cp_data); |
394 | 0 | if (ret <= 0) |
395 | 0 | return ret; |
396 | 0 | } |
397 | | |
398 | 0 | return unicode_nf_checker_finish(&unc); |
399 | 0 | } |
400 | | |
/* Check the size bytes at input against UNICODE_NFD.
   Returns the result of uni_utf8_is_nf(). */
int uni_utf8_is_nfd(const void *input, size_t size)
{
	return uni_utf8_is_nf(input, size, UNICODE_NFD);
}

/* Same as uni_utf8_is_nfd(), but with UNICODE_NFKD. */
int uni_utf8_is_nfkd(const void *input, size_t size)
{
	return uni_utf8_is_nf(input, size, UNICODE_NFKD);
}

/* Same as uni_utf8_is_nfd(), but with UNICODE_NFC. */
int uni_utf8_is_nfc(const void *input, size_t size)
{
	return uni_utf8_is_nf(input, size, UNICODE_NFC);
}

/* Same as uni_utf8_is_nfd(), but with UNICODE_NFKC. */
int uni_utf8_is_nfkc(const void *input, size_t size)
{
	return uni_utf8_is_nf(input, size, UNICODE_NFKC);
}
420 | | |
421 | | int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output) |
422 | 0 | { |
423 | 0 | static struct unicode_casemap map; |
424 | 0 | const char *error; |
425 | |
|
426 | 0 | unicode_casemap_init_uppercase(&map); |
427 | |
|
428 | 0 | return uni_utf8_run_transform(_input, size, &map.transform, output, |
429 | 0 | &error); |
430 | 0 | } |
431 | | |
432 | | int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output) |
433 | 0 | { |
434 | 0 | static struct unicode_casemap map; |
435 | 0 | const char *error; |
436 | |
|
437 | 0 | unicode_casemap_init_lowercase(&map); |
438 | |
|
439 | 0 | return uni_utf8_run_transform(_input, size, &map.transform, output, |
440 | 0 | &error); |
441 | 0 | } |
442 | | |
443 | | int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output) |
444 | 0 | { |
445 | 0 | static struct unicode_casemap map; |
446 | 0 | const char *error; |
447 | |
|
448 | 0 | unicode_casemap_init_casefold(&map); |
449 | |
|
450 | 0 | return uni_utf8_run_transform(_input, size, &map.transform, output, |
451 | 0 | &error); |
452 | 0 | } |
453 | | |
454 | | int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r) |
455 | 0 | { |
456 | 0 | buffer_t *output = t_buffer_create(size); |
457 | 0 | int ret; |
458 | |
|
459 | 0 | ret = uni_utf8_write_uppercase(input, size, output); |
460 | 0 | *output_r = str_c(output); |
461 | 0 | return ret; |
462 | 0 | } |
463 | | |
464 | | int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r) |
465 | 0 | { |
466 | 0 | buffer_t *output = t_buffer_create(size); |
467 | 0 | int ret; |
468 | |
|
469 | 0 | ret = uni_utf8_write_lowercase(input, size, output); |
470 | 0 | *output_r = str_c(output); |
471 | 0 | return ret; |
472 | 0 | } |
473 | | |
474 | | int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r) |
475 | 0 | { |
476 | 0 | buffer_t *output = t_buffer_create(size); |
477 | 0 | int ret; |
478 | |
|
479 | 0 | ret = uni_utf8_write_casefold(input, size, output); |
480 | 0 | *output_r = str_c(output); |
481 | 0 | return ret; |
482 | 0 | } |
483 | | |
484 | | int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size, |
485 | | buffer_t *output) |
486 | 0 | { |
487 | 0 | struct unicode_rfc5051_context ctx; |
488 | 0 | const unsigned char *input = _input; |
489 | 0 | unichar_t chr; |
490 | 0 | int ret = 0; |
491 | |
|
492 | 0 | unicode_rfc5051_init(&ctx); |
493 | |
|
494 | 0 | while (size > 0) { |
495 | 0 | int bytes = uni_utf8_get_char_n(input, size, &chr); |
496 | 0 | if (bytes <= 0) { |
497 | | /* invalid input. try the next byte. */ |
498 | 0 | ret = -1; |
499 | 0 | input++; size--; |
500 | 0 | output_add_replacement_char(output); |
501 | 0 | continue; |
502 | 0 | } |
503 | 0 | input += bytes; |
504 | 0 | size -= bytes; |
505 | |
|
506 | 0 | const unichar_t *norm; |
507 | 0 | size_t norm_len; |
508 | |
|
509 | 0 | norm_len = unicode_rfc5051_normalize(&ctx, chr, &norm); |
510 | 0 | uni_ucs4_to_utf8(norm, norm_len, output); |
511 | 0 | } |
512 | 0 | return ret; |
513 | 0 | } |
514 | | |
515 | | static inline unsigned int |
516 | | is_valid_utf8_seq(const unsigned char *input, unsigned int size) |
517 | 333M | { |
518 | 333M | unichar_t chr; |
519 | 333M | int len = uni_utf8_get_char_n(input, size, &chr); |
520 | 333M | return len <= 0 ? 0 : len; |
521 | 333M | } |
522 | | |
/* Scan size bytes of input for the first sequence that is not valid UTF-8.
   Returns 0 if everything is valid; *pos_r is not written in that case.
   Returns -1 and stores the byte offset of the offending sequence in
   *pos_r otherwise. */
static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
				     size_t *pos_r)
{
	size_t pos = 0;

	while (pos < size) {
		size_t seq_len;

		if (input[pos] < 0x80) {
			/* 7bit byte, always valid */
			pos++;
			continue;
		}

		seq_len = is_valid_utf8_seq(input + pos, size - pos);
		if (unlikely(seq_len == 0)) {
			/* first invalid utf8 sequence found */
			*pos_r = pos;
			return -1;
		}
		pos += seq_len;
	}
	return 0;
}
543 | | |
544 | | bool uni_utf8_get_valid_data(const unsigned char *input, size_t size, |
545 | | buffer_t *buf) |
546 | 3.24M | { |
547 | 3.24M | size_t i, len; |
548 | | |
549 | 3.24M | if (uni_utf8_find_invalid_pos(input, size, &i) == 0) |
550 | 3.23M | return TRUE; |
551 | | |
552 | | /* broken utf-8 input - skip the broken characters */ |
553 | 7.43k | buffer_append(buf, input, i++); |
554 | | |
555 | 7.43k | output_add_replacement_char(buf); |
556 | 26.3M | while (i < size) { |
557 | 26.3M | if (input[i] < 0x80) { |
558 | 5.15M | buffer_append_c(buf, input[i++]); |
559 | 5.15M | continue; |
560 | 5.15M | } |
561 | | |
562 | 21.2M | len = is_valid_utf8_seq(input + i, size-i); |
563 | 21.2M | if (len == 0) { |
564 | 222k | i++; |
565 | 222k | output_add_replacement_char(buf); |
566 | 222k | continue; |
567 | 222k | } |
568 | 20.9M | buffer_append(buf, input + i, len); |
569 | 20.9M | i += len; |
570 | 20.9M | } |
571 | 7.43k | return FALSE; |
572 | 3.24M | } |
573 | | |
/* Returns TRUE if the NUL-terminated string str contains no invalid UTF-8
   sequence according to uni_utf8_find_invalid_pos(). The position of the
   first invalid sequence is discarded. */
bool uni_utf8_str_is_valid(const char *str)
{
	size_t i;

	return uni_utf8_find_invalid_pos((const unsigned char *)str,
					 strlen(str), &i) == 0;
}

/* Returns TRUE if the size bytes at data contain no invalid UTF-8 sequence
   according to uni_utf8_find_invalid_pos(). The position of the first
   invalid sequence is discarded. */
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size)
{
	size_t i;

	return uni_utf8_find_invalid_pos(data, size, &i) == 0;
}
588 | | |
/* Returns the largest length <= max_new_size at which the old_size bytes
   of UTF-8 in data can be cut without splitting a multi-byte character.

   If max_new_size >= old_size nothing needs cutting and old_size is
   returned. Otherwise the byte at the cut position decides: any byte that
   is not a 10xxxxxx continuation byte starts a new character, so the cut
   is already on a boundary. Only when the cut lands inside a multi-byte
   sequence is that whole sequence dropped. */
size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
			      size_t max_new_size)
{
	if (max_new_size >= old_size)
		return old_size;
	if (max_new_size == 0)
		return 0;

	/* 7bit byte or lead byte right after the cut: nothing is split, so
	   keep everything before it (a lead byte here must not cause the
	   complete character in front of it to be thrown away) */
	if ((data[max_new_size] & 0xc0) != 0x80)
		return max_new_size;

	/* the cut is in the middle of a sequence: back up over the
	   continuation bytes that are already included... */
	while (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0x80)
		max_new_size--;
	/* ...and over the lead byte they belong to */
	if (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0xc0)
		max_new_size--;
	return max_new_size;
}
605 | | |
606 | | /* |
607 | | * Grapheme clusters |
608 | | */ |
609 | | |
/* Initialize the grapheme cluster scanner gcsc for the size bytes at input:
   the struct is zeroed, its grapheme break state is initialized, and the
   scan pointer / end pointer are set to the start / end of the input.
   NOTE(review): input is only referenced, not copied, so it presumably
   must stay valid while the scanner is in use - confirm with callers. */
void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
			 const void *input, size_t size)
{
	i_zero(gcsc);
	unicode_gc_break_init(&gcsc->gcbrk);
	gcsc->p = input;
	gcsc->pend = gcsc->p + size;
}
618 | | |
/* Advance the scanner to the next grapheme cluster boundary.

   On return gcsc->poffset points to where the previous call stopped and
   gcsc->p to the code point in front of which the new boundary was found
   (or to the end of input). That code point has already been decoded:
   gcsc->cp holds it and gcsc->cp_size its length, so the next call starts
   by stepping over it.

   Returns TRUE if a boundary was found before the end of input, or if the
   end of input was reached with at least one byte scanned since poffset.
   Returns FALSE when nothing is left.

   The input must be valid, complete UTF-8; a decode failure trips an
   assertion. */
bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc)
{
	/* poffset is still NULL only on the first call after init */
	bool first = (gcsc->poffset == NULL);

	/* Reset offset to last grapheme boundary (after the last grapheme
	   cluster we indicated). */
	gcsc->poffset = gcsc->p;
	/* Shift pointer past last code point; starts the next grapheme cluster
	   we shall compose in this call. */
	gcsc->p += gcsc->cp_size;
	gcsc->cp_size = 0;
	while (gcsc->p < gcsc->pend) {
		/* Decode next UTF-8 code point */
		gcsc->cp_size = uni_utf8_get_char_n(
			gcsc->p, gcsc->pend - gcsc->p, &gcsc->cp);
		/* We expect valid and complete UTF-8 input */
		i_assert(gcsc->cp_size > 0);

		/* Determine whether there exists a grapheme cluster boundary
		   before this code point. */
		const struct unicode_code_point_data *cp_data = NULL;
		if (unicode_gc_break_cp(&gcsc->gcbrk, gcsc->cp, &cp_data)) {
			/* Yes, but ignore the very first grapheme boundary that
			   occurs at the start of input. */
			if (!first) {
				/* Grapheme cluster detected, but it does *NOT*
				   include the last code point we decoded just
				   now. */
				i_assert(gcsc->p > gcsc->poffset);
				return TRUE;
			}
			first = FALSE;
		}

		/* Shift pointer past last code point; include this in the next
		   grapheme cluster we shall compose in this call. */
		gcsc->p += gcsc->cp_size;
		gcsc->cp_size = 0;
	}
	/* Return whether there is any last remaining grapheme cluster. */
	return (gcsc->p > gcsc->poffset);
}