/work/workdir/UnpackedTarball/cairo/src/cairo-unicode.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */ |
2 | | /* cairo - a vector graphics library with display and print output |
3 | | * |
4 | | * The code in this file is derived from GLib's gutf8.c and |
5 | | * ultimately from libunicode. It is relicensed under the |
6 | | * dual LGPL/MPL with permission of the original authors. |
7 | | * |
8 | | * Copyright © 1999 Tom Tromey |
9 | | * Copyright © 2005 Red Hat, Inc |
10 | | * |
11 | | * This library is free software; you can redistribute it and/or |
12 | | * modify it either under the terms of the GNU Lesser General Public |
13 | | * License version 2.1 as published by the Free Software Foundation |
14 | | * (the "LGPL") or, at your option, under the terms of the Mozilla |
15 | | * Public License Version 1.1 (the "MPL"). If you do not alter this |
16 | | * notice, a recipient may use your version of this file under either |
17 | | * the MPL or the LGPL. |
18 | | * |
19 | | * You should have received a copy of the LGPL along with this library |
20 | | * in the file COPYING-LGPL-2.1; if not, write to the Free Software |
21 | | * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA |
22 | | * You should have received a copy of the MPL along with this library |
23 | | * in the file COPYING-MPL-1.1 |
24 | | * |
25 | | * The contents of this file are subject to the Mozilla Public License |
26 | | * Version 1.1 (the "License"); you may not use this file except in |
27 | | * compliance with the License. You may obtain a copy of the License at |
28 | | * http://www.mozilla.org/MPL/ |
29 | | * |
30 | | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY |
31 | | * OF ANY KIND, either express or implied. See the LGPL or the MPL for |
32 | | * the specific language governing rights and limitations. |
33 | | * |
34 | | * The Original Code is the cairo graphics library. |
35 | | * |
36 | | * The Initial Developer of the Original Code is Tom Tromey. |
37 | | * and Red Hat, Inc. |
38 | | * |
39 | | * Contributor(s): |
40 | | * Owen Taylor <otaylor@redhat.com> |
41 | | */ |
42 | | |
43 | | #include "cairoint.h" |
44 | | #include "cairo-error-private.h" |
45 | | |
/* Classifies the lead byte @Char of a UTF-8 sequence.
 *
 * On return @Len holds the total length of the sequence in bytes (1-6)
 * and @Mask holds the bit mask that extracts the payload bits from the
 * lead byte.  If @Char is not a lead byte (a continuation byte
 * 0x80-0xbf, or 0xfe/0xff), @Len is set to -1 and @Mask is left
 * untouched.
 *
 * @Char is evaluated several times, so it must not have side effects.
 * The expansion is a single statement; terminate it with a semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
    do {                                                                      \
        if ((Char) < 128) {                                                   \
            (Len) = 1;                                                        \
            (Mask) = 0x7f;                                                    \
        } else if (((Char) & 0xe0) == 0xc0) {                                 \
            (Len) = 2;                                                        \
            (Mask) = 0x1f;                                                    \
        } else if (((Char) & 0xf0) == 0xe0) {                                 \
            (Len) = 3;                                                        \
            (Mask) = 0x0f;                                                    \
        } else if (((Char) & 0xf8) == 0xf0) {                                 \
            (Len) = 4;                                                        \
            (Mask) = 0x07;                                                    \
        } else if (((Char) & 0xfc) == 0xf8) {                                 \
            (Len) = 5;                                                        \
            (Mask) = 0x03;                                                    \
        } else if (((Char) & 0xfe) == 0xfc) {                                 \
            (Len) = 6;                                                        \
            (Mask) = 0x01;                                                    \
        } else {                                                              \
            (Len) = -1;                                                       \
        }                                                                     \
    } while (0)
79 | | |
/* Number of bytes needed to encode the code point @Char in UTF-8,
 * using the original 1-6 byte scheme.  @Char is evaluated more than
 * once.
 */
#define UTF8_LENGTH(Char)                 \
    ((Char) < 0x80      ? 1 :             \
     (Char) < 0x800     ? 2 :             \
     (Char) < 0x10000   ? 3 :             \
     (Char) < 0x200000  ? 4 :             \
     (Char) < 0x4000000 ? 5 :             \
                          6)
86 | | |
/* Decodes a UTF-8 sequence of @Len bytes starting at @Chars into
 * @Result.
 *
 * @Mask selects the payload bits of the lead byte; each following byte
 * contributes its low six bits.  If a following byte does not have the
 * form 10xxxxxx, @Result is set to -1 and decoding stops.  @Count is
 * used as the loop index and is left at the index where decoding
 * stopped.
 *
 * The expansion is a single statement; terminate it with a semicolon.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
    do {                                                                      \
        (Result) = (Chars)[0] & (Mask);                                       \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {                       \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {                          \
                (Result) = -1;                                                \
                break;                                                        \
            }                                                                 \
            (Result) <<= 6;                                                   \
            (Result) |= ((Chars)[(Count)] & 0x3f);                            \
        }                                                                     \
    } while (0)
99 | | |
/* True when @Char is an acceptable code point, i.e. it is none of:
 *   - at or above 0x110000,
 *   - in the surrogate range 0xD800-0xDFFF,
 *   - in the range 0xFDD0-0xFDEF,
 *   - a value whose low 16 bits are 0xFFFE or 0xFFFF.
 * @Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                                   \
    (!((Char) >= 0x110000 ||                                  \
       ((Char) & 0xFFFFF800) == 0xD800 ||                     \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||              \
       ((Char) & 0xFFFE) == 0xFFFE))
105 | | |
/* Byte length of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80-0xbf, 0xfe, 0xff) map to 1
 * so that stepping through a string always makes progress.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the byte following the UTF-8 sequence that starts at @p,
 * judged from the lead byte alone.  @p is evaluated twice.
 */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *)(p)])
118 | | |
/* Decodes the UTF-8 sequence starting at @p into a Unicode character.
 *
 * The caller is expected to have validated the input already.  If the
 * first byte is not a lead byte, or a following byte is not a
 * continuation byte, (uint32_t)-1 is returned; no other checks are
 * made, so results for other malformed input are undefined.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t wc;
    int lead_mask = 0;
    int n_bytes;
    int pos;

    UTF8_COMPUTE (lead, lead_mask, n_bytes);
    if (n_bytes == -1)
        return (uint32_t) -1;

    UTF8_GET (wc, p, pos, lead_mask, n_bytes);

    return wc;
}
137 | | |
/* Decodes one UTF-8 sequence starting at @p, reading at most @max_len
 * bytes when @max_len is non-negative (a negative @max_len means the
 * input is nul-terminated).
 *
 * Returns the decoded character, or
 *   (uint32_t)-1 for a malformed or overlong sequence,
 *   (uint32_t)-2 for a sequence cut short by @max_len or by a nul byte.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    const uint32_t malformed = (uint32_t) -1;
    const uint32_t incomplete = (uint32_t) -2;
    const uint32_t lead = p[0];
    uint32_t wc;
    int len, k;

    if (lead < 0x80)
        return lead;

    /* Continuation bytes and 0xfe/0xff cannot start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return malformed;

    if (lead >= 0xfc)
        len = 6;
    else if (lead >= 0xf8)
        len = 5;
    else if (lead >= 0xf0)
        len = 4;
    else if (lead >= 0xe0)
        len = 3;
    else
        len = 2;

    /* A lead byte of an N-byte sequence carries 7 - N payload bits. */
    wc = lead & (0x7f >> len);

    if (max_len >= 0 && len > max_len) {
        /* Truncated input: still reject it if what is present is bad. */
        for (k = 1; k < max_len; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return malformed;
        }
        return incomplete;
    }

    for (k = 1; k < len; k++) {
        const uint32_t ch = p[k];

        /* A nul byte here means the string ended mid-sequence. */
        if ((ch & 0xc0) != 0x80)
            return ch ? malformed : incomplete;

        wc = (wc << 6) | (ch & 0x3f);
    }

    /* Reject encodings longer than the shortest form. */
    if (UTF8_LENGTH (wc) != len)
        return malformed;

    return wc;
}
198 | | |
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a UTF-8 string that has already been
 * validated, and reports how many bytes it occupies.
 *
 * No validation is performed here beyond checking the lead byte and
 * the continuation-byte markers. If the first byte is not a lead byte,
 * (uint32_t)-1 is stored and 1 is returned so the caller can still
 * advance.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) p[0];
    uint32_t decoded;
    int lead_mask = 0;
    int n_bytes;
    int pos;

    UTF8_COMPUTE (lead, lead_mask, n_bytes);
    if (n_bytes == -1) {
        decoded = (uint32_t) -1;
        n_bytes = 1;
    } else {
        UTF8_GET (decoded, p, pos, lead_mask, n_bytes);
    }

    if (unicode)
        *unicode = decoded;

    return n_bytes;
}
232 | | |
233 | | /** |
234 | | * _cairo_utf8_to_ucs4: |
235 | | * @str: an UTF-8 string |
236 | | * @len: length of @str in bytes, or -1 if it is nul-terminated. |
237 | | * If @len is supplied and the string has an embedded nul |
238 | | * byte, only the portion before the nul byte is converted. |
239 | | * @result: location to store a pointer to a newly allocated UTF-32 |
240 | | * string (always native endian), or %NULL. Free with free(). A 0 |
241 | | * word will be written after the last character. |
242 | | * @items_written: location to store number of 32-bit words |
243 | | * written. (Not including the trailing 0) |
244 | | * |
245 | | * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode |
246 | | * with 1 32-bit word per character. The string is validated to |
247 | | * consist entirely of valid Unicode characters. |
248 | | * |
249 | | * Return value: %CAIRO_STATUS_SUCCESS if the entire string was |
250 | | * successfully converted. %CAIRO_STATUS_INVALID_STRING if an |
251 | | * invalid sequence was found. |
252 | | **/ |
253 | | cairo_status_t |
254 | | _cairo_utf8_to_ucs4 (const char *str, |
255 | | int len, |
256 | | uint32_t **result, |
257 | | int *items_written) |
258 | 0 | { |
259 | 0 | uint32_t *str32 = NULL; |
260 | 0 | int n_chars, i; |
261 | 0 | const unsigned char *in; |
262 | 0 | const unsigned char * const ustr = (const unsigned char *) str; |
263 | |
|
264 | 0 | in = ustr; |
265 | 0 | n_chars = 0; |
266 | 0 | while ((len < 0 || ustr + len - in > 0) && *in) |
267 | 0 | { |
268 | 0 | uint32_t wc = _utf8_get_char_extended (in, ustr + len - in); |
269 | 0 | if (wc & 0x80000000 || !UNICODE_VALID (wc)) |
270 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
271 | | |
272 | 0 | n_chars++; |
273 | 0 | if (n_chars == INT_MAX) |
274 | 0 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
275 | | |
276 | 0 | in = UTF8_NEXT_CHAR (in); |
277 | 0 | } |
278 | | |
279 | 0 | if (result) { |
280 | 0 | str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t)); |
281 | 0 | if (!str32) |
282 | 0 | return _cairo_error (CAIRO_STATUS_NO_MEMORY); |
283 | | |
284 | 0 | in = ustr; |
285 | 0 | for (i=0; i < n_chars; i++) { |
286 | 0 | str32[i] = _utf8_get_char (in); |
287 | 0 | in = UTF8_NEXT_CHAR (in); |
288 | 0 | } |
289 | 0 | str32[i] = 0; |
290 | |
|
291 | 0 | *result = str32; |
292 | 0 | } |
293 | | |
294 | 0 | if (items_written) |
295 | 0 | *items_written = n_chars; |
296 | |
|
297 | 0 | return CAIRO_STATUS_SUCCESS; |
298 | 0 | } |
299 | | |
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 *   space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. When @utf8 is %NULL nothing is written
 * and only the encoded length is computed. No terminating nul byte is
 * written.
 *
 * Return value: Number of bytes in the utf8 string or 0 if @unicode
 *   is 0x200000 or larger.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
                     char     *utf8)
{
    int n_bytes;
    int idx;

    if (unicode >= 0x200000)
        return 0;

    if (unicode >= 0x10000)
        n_bytes = 4;
    else if (unicode >= 0x800)
        n_bytes = 3;
    else if (unicode >= 0x80)
        n_bytes = 2;
    else
        n_bytes = 1;

    if (!utf8)
        return n_bytes;

    if (n_bytes == 1) {
        utf8[0] = unicode;
        return 1;
    }

    /* Fill from the last byte backwards, six payload bits at a time;
     * every byte starts out marked as a continuation byte. */
    for (idx = n_bytes - 1; idx >= 0; idx--) {
        utf8[idx] = 0x80 | (unicode & 0x3f);
        unicode >>= 6;
    }

    /* Upgrade the first byte's marker to the lead-byte prefix for
     * this length: 0xc0, 0xe0 or 0xf0. */
    utf8[0] |= 0xf0 << (4 - n_bytes);

    return n_bytes;
}
344 | | |
/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 *   elements. Or %NULL.
 *
 * Encodes @unicode as UTF-16: one element for values below 0x10000,
 * a high/low surrogate pair for values below 0x110000. When @utf16 is
 * %NULL nothing is written and only the element count is computed.
 *
 * Return value: Number of elements in the utf16 string or 0 if
 *   @unicode is 0x110000 or larger.
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
                      uint16_t *utf16)
{
    uint32_t offset;

    if (unicode >= 0x110000)
        return 0;

    if (unicode < 0x10000) {
        if (utf16)
            utf16[0] = unicode;
        return 1;
    }

    if (utf16) {
        /* 20-bit offset: top ten bits go to the high surrogate,
         * bottom ten to the low surrogate. */
        offset = unicode - 0x10000;
        utf16[0] = 0xd800 + (offset >> 10);
        utf16[1] = 0xdc00 + (offset & 0x3ff);
    }

    return 2;
}
374 | | |
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character. Must not be %NULL.
 * @items_written: location to store number of 16-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found or the output would hold too many
 *   words to count in an int. %CAIRO_STATUS_NO_MEMORY if the output
 *   buffer could not be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    const unsigned char * const ustr = (const unsigned char *) str;
    const unsigned char *in = ustr;
    uint16_t *str16 = NULL;
    int n16 = 0;
    int i;

    /* First pass: validate every character and count output words. */
    for (;;) {
        /* Bytes left in the caller-supplied window, or -1 when the
         * string is nul-terminated.  Computed with integers so that no
         * pointer before the start of @str is ever formed. */
        const long remaining =
            len < 0 ? -1 : (long) len - (long) (in - ustr);
        uint32_t wc;

        /* Check the bound before touching *in. */
        if (len >= 0 && remaining <= 0)
            break;
        if (*in == 0)
            break;

        wc = _utf8_get_char_extended (in, remaining);
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Characters outside the BMP need a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* n16 grows by at most 2 per step, so stopping at either value
         * keeps both the next increment and n16 + 1 below in range. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
        return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Second pass: the input is known to be valid, transcode it. */
    in = ustr;
    for (i = 0; i < n16;) {
        uint32_t wc = _utf8_get_char (in);

        i += _cairo_ucs4_to_utf16 (wc, str16 + i);

        in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif