/src/cairo/src/cairo-unicode.c
Line | Count | Source |
1 | | /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */ |
2 | | /* cairo - a vector graphics library with display and print output |
3 | | * |
4 | | * The code in this file is derived from GLib's gutf8.c and |
5 | | * ultimately from libunicode. It is relicensed under the |
6 | | * dual LGPL/MPL with permission of the original authors. |
7 | | * |
8 | | * Copyright © 1999 Tom Tromey |
9 | | * Copyright © 2005 Red Hat, Inc |
10 | | * |
11 | | * This library is free software; you can redistribute it and/or |
12 | | * modify it either under the terms of the GNU Lesser General Public |
13 | | * License version 2.1 as published by the Free Software Foundation |
14 | | * (the "LGPL") or, at your option, under the terms of the Mozilla |
15 | | * Public License Version 1.1 (the "MPL"). If you do not alter this |
16 | | * notice, a recipient may use your version of this file under either |
17 | | * the MPL or the LGPL. |
18 | | * |
19 | | * You should have received a copy of the LGPL along with this library |
20 | | * in the file COPYING-LGPL-2.1; if not, write to the Free Software |
21 | | * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA |
22 | | * You should have received a copy of the MPL along with this library |
23 | | * in the file COPYING-MPL-1.1 |
24 | | * |
25 | | * The contents of this file are subject to the Mozilla Public License |
26 | | * Version 1.1 (the "License"); you may not use this file except in |
27 | | * compliance with the License. You may obtain a copy of the License at |
28 | | * http://www.mozilla.org/MPL/ |
29 | | * |
30 | | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY |
31 | | * OF ANY KIND, either express or implied. See the LGPL or the MPL for |
32 | | * the specific language governing rights and limitations. |
33 | | * |
34 | | * The Original Code is the cairo graphics library. |
35 | | * |
36 | | * The Initial Developer of the Original Code is Tom Tromey. |
37 | | * and Red Hat, Inc. |
38 | | * |
39 | | * Contributor(s): |
40 | | * Owen Taylor <otaylor@redhat.com> |
41 | | */ |
42 | | |
43 | | #include "cairoint.h" |
44 | | #include "cairo-error-private.h" |
45 | | |
/* UTF8_COMPUTE:
 * @Char: first byte of a UTF-8 sequence (evaluated more than once, so it
 *        must be free of side effects)
 * @Mask: lvalue that receives the mask selecting the payload bits of @Char
 * @Len:  lvalue that receives the total sequence length in bytes (1..6),
 *        or -1 if @Char cannot start a sequence
 *
 * Classifies a lead byte.  When @Len is set to -1, @Mask is set to 0 so
 * that it never stays unassigned.  The body is wrapped in do/while(0) so
 * the macro acts as a single statement (safe under an unbraced if/else),
 * and every argument is parenthesized so expression arguments parse as
 * intended.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
    do { \
	if ((Char) < 128) { \
	    (Len) = 1; \
	    (Mask) = 0x7f; \
	} else if (((Char) & 0xe0) == 0xc0) { \
	    (Len) = 2; \
	    (Mask) = 0x1f; \
	} else if (((Char) & 0xf0) == 0xe0) { \
	    (Len) = 3; \
	    (Mask) = 0x0f; \
	} else if (((Char) & 0xf8) == 0xf0) { \
	    (Len) = 4; \
	    (Mask) = 0x07; \
	} else if (((Char) & 0xfc) == 0xf8) { \
	    (Len) = 5; \
	    (Mask) = 0x03; \
	} else if (((Char) & 0xfe) == 0xfc) { \
	    (Len) = 6; \
	    (Mask) = 0x01; \
	} else { \
	    (Len) = -1; \
	    (Mask) = 0; \
	} \
    } while (0)
79 | | |
/* UTF8_LENGTH:
 * @Char: a code point value (evaluated more than once)
 *
 * Number of bytes (1..6) the original, pre-RFC 3629 UTF-8 scheme needs to
 * encode @Char.  Each test below names the largest value that still fits
 * in that many bytes.
 */
#define UTF8_LENGTH(Char) \
    ((Char) <= 0x7f      ? 1 : \
     (Char) <= 0x7ff     ? 2 : \
     (Char) <= 0xffff    ? 3 : \
     (Char) <= 0x1fffff  ? 4 : \
     (Char) <= 0x3ffffff ? 5 : 6)
86 | | |
/* UTF8_GET:
 * @Result: lvalue that receives the decoded value
 * @Chars:  pointer to the first byte of the sequence
 * @Count:  integer lvalue used as the loop index
 * @Mask:   payload mask for the lead byte (as produced by UTF8_COMPUTE)
 * @Len:    total sequence length in bytes
 *
 * Decodes @Len bytes starting at @Chars into @Result.  If one of the
 * trailing bytes is not a continuation byte (10xxxxxx), @Result is set
 * to -1 and decoding stops.  Overlong forms and the Unicode range are
 * not checked here.
 *
 * The body is wrapped in do/while(0): the expansion contains both an
 * assignment and a loop, and without the wrapper only the assignment
 * would be governed by an enclosing unbraced `if`.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
    do { \
	(Result) = (Chars)[0] & (Mask); \
	for ((Count) = 1; (Count) < (Len); ++(Count)) { \
	    if (((Chars)[(Count)] & 0xc0) != 0x80) { \
		(Result) = -1; \
		break; \
	    } \
	    (Result) <<= 6; \
	    (Result) |= ((Chars)[(Count)] & 0x3f); \
	} \
    } while (0)
99 | | |
/* UNICODE_VALID:
 * @Char: a code point value (may be evaluated twice)
 *
 * Non-zero unless @Char lies at or above 0x110000 or falls inside the
 * surrogate block 0xD800..0xDFFF.
 */
#define UNICODE_VALID(Char) \
    (!((Char) >= 0x110000 || \
       ((Char) & 0xFFFFF800) == 0xD800))
103 | | |
/* Number of bytes to advance past a UTF-8 sequence, indexed by the value
 * of its first byte.  Bytes that cannot start a sequence (0x80..0xBF and
 * 0xFE/0xFF) map to 1 so that a scan always makes progress.
 */
static const char utf8_skip_data[256] = {
    /* 0x00..0x7F: single-byte characters */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80..0xBF: continuation bytes */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xC0..0xDF: two-byte lead bytes */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0..0xEF: three-byte lead bytes */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: 1 */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the byte following the UTF-8 sequence that starts at p. */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *) (p)])
116 | | |
/* Decodes the UTF-8 sequence starting at @p into a single code point.
 *
 * The input is expected to have been validated already.  The only
 * checks made here are the ones built into UTF8_COMPUTE and UTF8_GET:
 * a byte that cannot start a sequence, or a trailing byte that is not
 * a continuation byte, yields (uint32_t)-1.  Anything else about
 * malformed input (overlong forms, out-of-range values) goes undetected.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t code_point;
    int seq_len, lead_mask = 0, pos;

    UTF8_COMPUTE (lead, lead_mask, seq_len);
    if (seq_len != -1) {
	UTF8_GET (code_point, p, pos, lead_mask, seq_len);
	return code_point;
    }

    return (uint32_t) -1;
}
135 | | |
/* Validating variant of _utf8_get_char.
 *
 * @p:       start of the sequence to decode
 * @max_len: number of bytes that may be read from @p; a negative value
 *           means "no limit" (the data is then expected to be
 *           nul-terminated)
 *
 * Returns the decoded value, or one of two sentinels:
 *   (uint32_t)-1  the bytes do not form a well-formed sequence (stray
 *                 continuation byte, 0xFE/0xFF lead byte, bad trailing
 *                 byte, or an overlong encoding)
 *   (uint32_t)-2  the sequence looks fine so far but is cut short, either
 *                 by @max_len or by a nul byte
 *
 * Five- and six-byte forms are decoded; rejecting values outside the
 * Unicode range is left to the caller.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
			 long		      max_len)
{
    /* Smallest value that genuinely needs a sequence of the given length;
     * anything below it is an overlong encoding. */
    static const uint32_t shortest[7] = {
	0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const uint32_t invalid = (uint32_t) -1;
    const uint32_t incomplete = (uint32_t) -2;
    const uint32_t lead = p[0];
    uint32_t code;
    int expected, k;

    if (lead < 0x80)
	return lead;
    if (lead < 0xc0 || lead >= 0xfe)
	return invalid;

    if (lead < 0xe0)
	expected = 2;
    else if (lead < 0xf0)
	expected = 3;
    else if (lead < 0xf8)
	expected = 4;
    else if (lead < 0xfc)
	expected = 5;
    else
	expected = 6;

    /* An n-byte lead byte carries 7 - n payload bits. */
    code = lead & (0x7f >> expected);

    if (max_len >= 0 && expected > max_len) {
	/* Truncated input: still verify the bytes that are available. */
	for (k = 1; k < max_len; k++) {
	    if ((p[k] & 0xc0) != 0x80)
		return invalid;
	}
	return incomplete;
    }

    for (k = 1; k < expected; k++) {
	const uint32_t trail = p[k];

	if ((trail & 0xc0) != 0x80) {
	    /* A nul here means the string ended mid-sequence. */
	    return trail ? invalid : incomplete;
	}

	code = (code << 6) | (trail & 0x3f);
    }

    if (code < shortest[expected])
	return invalid;

    return code;
}
196 | | |
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string that has already been validated
 * @unicode: location to store the decoded character, or %NULL
 *
 * Decodes the first character of @p and reports how many bytes it
 * occupies.
 *
 * Callers must validate the string beforehand; this function does not.
 * If the first byte cannot start a UTF-8 sequence, (uint32_t)-1 is
 * stored in @unicode and a length of 1 is reported.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
				uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) *p;
    uint32_t decoded;
    int n_bytes, lead_mask = 0, idx;

    UTF8_COMPUTE (lead, lead_mask, n_bytes);
    if (n_bytes == -1) {
	/* Not a lead byte: report the sentinel and step over one byte. */
	decoded = (uint32_t) -1;
	n_bytes = 1;
    } else {
	UTF8_GET (decoded, p, idx, lead_mask, n_bytes);
    }

    if (unicode)
	*unicode = decoded;

    return n_bytes;
}
230 | | |
231 | | /** |
232 | | * _cairo_utf8_to_ucs4: |
233 | | * @str: an UTF-8 string |
234 | | * @len: length of @str in bytes, or -1 if it is nul-terminated. |
235 | | * If @len is supplied and the string has an embedded nul |
236 | | * byte, only the portion before the nul byte is converted. |
237 | | * @result: location to store a pointer to a newly allocated UTF-32 |
238 | | * string (always native endian), or %NULL. Free with free(). A 0 |
239 | | * word will be written after the last character. |
240 | | * @items_written: location to store number of 32-bit words |
241 | | * written. (Not including the trailing 0) |
242 | | * |
243 | | * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode |
244 | | * with 1 32-bit word per character. The string is validated to |
245 | | * consist entirely of valid Unicode characters. |
246 | | * |
247 | | * Return value: %CAIRO_STATUS_SUCCESS if the entire string was |
248 | | * successfully converted. %CAIRO_STATUS_INVALID_STRING if an |
249 | | * invalid sequence was found. |
250 | | **/ |
251 | | cairo_status_t |
252 | | _cairo_utf8_to_ucs4 (const char *str, |
253 | | int len, |
254 | | uint32_t **result, |
255 | | int *items_written) |
256 | 265k | { |
257 | 265k | uint32_t *str32 = NULL; |
258 | 265k | int n_chars, i; |
259 | 265k | const unsigned char *in; |
260 | 265k | const unsigned char * const ustr = (const unsigned char *) str; |
261 | | |
262 | 265k | in = ustr; |
263 | 265k | n_chars = 0; |
264 | 504k | while ((len < 0 || ustr + len - in > 0) && *in) |
265 | 238k | { |
266 | 238k | uint32_t wc = _utf8_get_char_extended (in, ustr + len - in); |
267 | 238k | if (wc & 0x80000000 || !UNICODE_VALID (wc)) |
268 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
269 | | |
270 | 238k | n_chars++; |
271 | 238k | if (n_chars == INT_MAX) |
272 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
273 | | |
274 | 238k | in = UTF8_NEXT_CHAR (in); |
275 | 238k | } |
276 | | |
277 | 265k | if (result) { |
278 | 7.69k | str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t)); |
279 | 7.69k | if (!str32) |
280 | 0 | return _cairo_error (CAIRO_STATUS_NO_MEMORY); |
281 | | |
282 | 7.69k | in = ustr; |
283 | 15.3k | for (i=0; i < n_chars; i++) { |
284 | 7.69k | str32[i] = _utf8_get_char (in); |
285 | 7.69k | in = UTF8_NEXT_CHAR (in); |
286 | 7.69k | } |
287 | 7.69k | str32[i] = 0; |
288 | | |
289 | 7.69k | *result = str32; |
290 | 7.69k | } |
291 | | |
292 | 265k | if (items_written) |
293 | 21.0k | *items_written = n_chars; |
294 | | |
295 | 265k | return CAIRO_STATUS_SUCCESS; |
296 | 265k | } |
297 | | |
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 *   space available. Or %NULL.
 *
 * Encodes a single UCS-4 character as UTF-8. No terminating nul byte
 * is written. When @utf8 is %NULL only the encoded length is computed.
 *
 * Values above 0x10FFFF are rejected, matching _cairo_ucs4_to_utf16().
 * Surrogate code points (0xD800..0xDFFF) are not rejected.
 *
 * Return value: Number of bytes in the utf8 string or 0 if an invalid
 *   unicode character
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
		     char     *utf8)
{
    int bytes;
    char *p;

    if (unicode < 0x80) {
	if (utf8)
	    *utf8 = unicode;
	return 1;
    } else if (unicode < 0x800) {
	bytes = 2;
    } else if (unicode < 0x10000) {
	bytes = 3;
    } else if (unicode < 0x110000) {
	bytes = 4;
    } else {
	/* Beyond the last Unicode code point. */
	return 0;
    }

    if (!utf8)
	return bytes;

    /* Fill the buffer back to front, six payload bits per byte, each
     * tagged as a continuation byte (10xxxxxx). */
    p = utf8 + bytes;
    while (p > utf8) {
	*--p = 0x80 | (unicode & 0x3f);
	unicode >>= 6;
    }
    /* Turn the first byte into a lead byte: 0xc0, 0xe0 or 0xf0 for 2, 3
     * or 4 bytes.  Bits shifted past bit 7 are dropped by the store. */
    *p |= 0xf0 << (4 - bytes);

    return bytes;
}
342 | | |
/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 *   elements. Or %NULL.
 *
 * Encodes a single UCS-4 character as UTF-16: one element for values
 * below 0x10000, a high/low surrogate pair for values up to 0x10FFFF.
 * No terminating 0 is written. When @utf16 is %NULL only the element
 * count is computed.
 *
 * Return value: Number of elements in the utf16 string or 0 if an
 *   invalid unicode character
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
		      uint16_t *utf16)
{
    uint32_t offset;

    if (unicode >= 0x110000)
	return 0;

    if (unicode < 0x10000) {
	if (utf16)
	    utf16[0] = unicode;
	return 1;
    }

    if (utf16) {
	/* 20-bit offset into the supplementary planes, split 10/10. */
	offset = unicode - 0x10000;
	utf16[0] = 0xd800 + (offset >> 10);
	utf16[1] = 0xdc00 + (offset & 0x3ff);
    }

    return 2;
}
372 | | |
373 | | #if CAIRO_HAS_UTF8_TO_UTF16 |
374 | | /** |
375 | | * _cairo_utf8_to_utf16: |
376 | | * @str: an UTF-8 string |
377 | | * @len: length of @str in bytes, or -1 if it is nul-terminated. |
378 | | * If @len is supplied and the string has an embedded nul |
379 | | * byte, only the portion before the nul byte is converted. |
380 | | * @result: location to store a pointer to a newly allocated UTF-16 |
381 | | * string (always native endian). Free with free(). A 0 |
382 | | * word will be written after the last character. |
383 | | * @items_written: location to store number of 16-bit words |
384 | | * written. (Not including the trailing 0) |
385 | | * |
386 | | * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode |
387 | | * where characters are represented either as a single 16-bit word, or |
388 | | * as a pair of 16-bit "surrogates". The string is validated to |
389 | | * consist entirely of valid Unicode characters. |
390 | | * |
391 | | * Return value: %CAIRO_STATUS_SUCCESS if the entire string was |
392 | | * successfully converted. %CAIRO_STATUS_INVALID_STRING if an |
393 | | * an invalid sequence was found. |
394 | | **/ |
395 | | cairo_status_t |
396 | | _cairo_utf8_to_utf16 (const char *str, |
397 | | int len, |
398 | | uint16_t **result, |
399 | | int *items_written) |
400 | 26.0k | { |
401 | 26.0k | uint16_t *str16 = NULL; |
402 | 26.0k | int n16, i; |
403 | 26.0k | const unsigned char *in; |
404 | 26.0k | const unsigned char * const ustr = (const unsigned char *) str; |
405 | | |
406 | 26.0k | in = ustr; |
407 | 26.0k | n16 = 0; |
408 | 52.1k | while ((len < 0 || ustr + len - in > 0) && *in) { |
409 | 26.0k | uint32_t wc = _utf8_get_char_extended (in, ustr + len - in); |
410 | 26.0k | if (wc & 0x80000000 || !UNICODE_VALID (wc)) |
411 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
412 | | |
413 | 26.0k | if (wc < 0x10000) |
414 | 26.0k | n16 += 1; |
415 | 16 | else |
416 | 16 | n16 += 2; |
417 | | |
418 | 26.0k | if (n16 == INT_MAX - 1 || n16 == INT_MAX) |
419 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
420 | | |
421 | 26.0k | in = UTF8_NEXT_CHAR (in); |
422 | 26.0k | } |
423 | | |
424 | 26.0k | str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t)); |
425 | 26.0k | if (!str16) |
426 | 0 | return _cairo_error (CAIRO_STATUS_NO_MEMORY); |
427 | | |
428 | 26.0k | in = ustr; |
429 | 52.1k | for (i = 0; i < n16;) { |
430 | 26.0k | uint32_t wc = _utf8_get_char (in); |
431 | | |
432 | 26.0k | i += _cairo_ucs4_to_utf16 (wc, str16 + i); |
433 | | |
434 | 26.0k | in = UTF8_NEXT_CHAR (in); |
435 | 26.0k | } |
436 | | |
437 | 26.0k | str16[i] = 0; |
438 | | |
439 | 26.0k | *result = str16; |
440 | 26.0k | if (items_written) |
441 | 26.0k | *items_written = n16; |
442 | | |
443 | 26.0k | return CAIRO_STATUS_SUCCESS; |
444 | 26.0k | } |
445 | | #endif |