Coverage Report

Created: 2025-11-16 09:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/workdir/UnpackedTarball/cairo/src/cairo-unicode.c
Line
Count
Source
1
/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2
/* cairo - a vector graphics library with display and print output
3
 *
4
 * The code in this file is derived from GLib's gutf8.c and
5
 *   ultimately from libunicode. It is relicensed under the
6
 *   dual LGPL/MPL with permission of the original authors.
7
 *
8
 * Copyright © 1999 Tom Tromey
9
 * Copyright © 2005 Red Hat, Inc
10
 *
11
 * This library is free software; you can redistribute it and/or
12
 * modify it either under the terms of the GNU Lesser General Public
13
 * License version 2.1 as published by the Free Software Foundation
14
 * (the "LGPL") or, at your option, under the terms of the Mozilla
15
 * Public License Version 1.1 (the "MPL"). If you do not alter this
16
 * notice, a recipient may use your version of this file under either
17
 * the MPL or the LGPL.
18
 *
19
 * You should have received a copy of the LGPL along with this library
20
 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22
 * You should have received a copy of the MPL along with this library
23
 * in the file COPYING-MPL-1.1
24
 *
25
 * The contents of this file are subject to the Mozilla Public License
26
 * Version 1.1 (the "License"); you may not use this file except in
27
 * compliance with the License. You may obtain a copy of the License at
28
 * http://www.mozilla.org/MPL/
29
 *
30
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31
 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32
 * the specific language governing rights and limitations.
33
 *
34
 * The Original Code is the cairo graphics library.
35
 *
36
 * The Initial Developer of the Original Code is Tom Tromey.
37
 *  and Red Hat, Inc.
38
 *
39
 * Contributor(s):
40
 *  Owen Taylor <otaylor@redhat.com>
41
 */
42
43
#include "cairoint.h"
44
#include "cairo-error-private.h"
45
46
/*
 * UTF8_COMPUTE:
 * Classifies @Char, the first byte of a UTF-8 sequence.
 *
 * On return @Len holds the total number of bytes in the sequence (1 to 6,
 * the historical 5- and 6-byte forms included) and @Mask selects the
 * payload bits carried by the first byte.  When @Char cannot start a
 * sequence (a continuation byte, 0xfe or 0xff) @Len is set to -1 and
 * @Mask is left untouched.
 *
 * @Char is evaluated several times and must be free of side effects.
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and is safe inside an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)           \
    do {                                        \
        if ((Char) < 128) {                     \
            (Len) = 1;                          \
            (Mask) = 0x7f;                      \
        } else if (((Char) & 0xe0) == 0xc0) {   \
            (Len) = 2;                          \
            (Mask) = 0x1f;                      \
        } else if (((Char) & 0xf0) == 0xe0) {   \
            (Len) = 3;                          \
            (Mask) = 0x0f;                      \
        } else if (((Char) & 0xf8) == 0xf0) {   \
            (Len) = 4;                          \
            (Mask) = 0x07;                      \
        } else if (((Char) & 0xfc) == 0xf8) {   \
            (Len) = 5;                          \
            (Mask) = 0x03;                      \
        } else if (((Char) & 0xfe) == 0xfc) {   \
            (Len) = 6;                          \
            (Mask) = 0x01;                      \
        } else {                                \
            (Len) = -1;                         \
        }                                       \
    } while (0)
79
80
/*
 * UTF8_LENGTH:
 * Number of bytes needed to encode the code point @Char in UTF-8,
 * counting the historical 5- and 6-byte forms for values of 0x200000
 * and above.  @Char is evaluated more than once.
 */
#define UTF8_LENGTH(Char)               \
    ((Char) >= 0x4000000 ? 6 :          \
     (Char) >= 0x200000  ? 5 :          \
     (Char) >= 0x10000   ? 4 :          \
     (Char) >= 0x800     ? 3 :          \
     (Char) >= 0x80      ? 2 : 1)
86
87
/*
 * UTF8_GET:
 * Assembles into @Result the value of the @Len-byte sequence starting at
 * @Chars, where @Mask selects the payload bits of the first byte.
 *
 * @Count is used as the loop index.  If a byte after the first is not a
 * continuation byte (10xxxxxx), @Result is set to -1 and decoding stops
 * with @Count left at the offending index.  No check for overlong forms
 * or out-of-range values is made here.
 *
 * The body is wrapped in do { } while (0) so that the assignment and the
 * loop form one statement; without it only the assignment would be
 * governed by an enclosing unbraced if.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
    do {                                                        \
        (Result) = (Chars)[0] & (Mask);                         \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {         \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {            \
                (Result) = -1;                                  \
                break;                                          \
            }                                                   \
            (Result) <<= 6;                                     \
            (Result) |= ((Chars)[(Count)] & 0x3f);              \
        }                                                       \
    } while (0)
99
100
/*
 * UNICODE_VALID:
 * True when @Char lies below 0x110000 and is not in the surrogate
 * range 0xD800..0xDFFF.  @Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                             \
    ((Char) <= 0x10FFFF &&                              \
     !((Char) >= 0xD800 && (Char) <= 0xDFFF))
103
104
/*
 * Number of bytes to advance past a sequence, indexed by its first byte.
 * Bytes 0x80..0xBF (continuation bytes) and 0xFE/0xFF map to 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xA0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xB0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xC0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xD0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the position following the sequence that starts at @p,
 * as determined solely by the first byte. */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *) (p)])
116
117
/* Decodes the UTF-8 sequence starting at @p into a Unicode character.
 *
 * The first byte determines the sequence length.  If that byte cannot
 * start a sequence, or a following byte is not a continuation byte, the
 * result is (uint32_t)-1.  Nothing else is checked, so for input that
 * has not been validated beforehand the result is undefined.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t code_point;
    int payload_mask = 0;
    int n_bytes;
    int pos;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes == -1)
        return (uint32_t)-1;

    UTF8_GET (code_point, p, pos, payload_mask, n_bytes);

    return code_point;
}
135
136
/* Like _utf8_get_char, but validating, and bounded by @max_len bytes
 * when @max_len is non-negative (a negative @max_len means no limit).
 *
 * Returns the decoded character, or
 *   (uint32_t)-1  for an invalid first byte, a non-continuation byte
 *                 inside the sequence, or an overlong encoding;
 *   (uint32_t)-2  for an incomplete trailing character: the sequence
 *                 needs more than @max_len bytes, or a nul byte appears
 *                 where a continuation byte is expected.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    const uint32_t lead = p[0];
    uint32_t code_point;
    int n_bytes;
    int pos;

    /* Plain ASCII needs no further work. */
    if (lead < 0x80)
        return lead;

    /* Continuation bytes and 0xfe/0xff cannot start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return (uint32_t)-1;

    if (lead < 0xe0) {
        n_bytes = 2;
        code_point = lead & 0x1f;
    } else if (lead < 0xf0) {
        n_bytes = 3;
        code_point = lead & 0x0f;
    } else if (lead < 0xf8) {
        n_bytes = 4;
        code_point = lead & 0x07;
    } else if (lead < 0xfc) {
        n_bytes = 5;
        code_point = lead & 0x03;
    } else {
        n_bytes = 6;
        code_point = lead & 0x01;
    }

    /* The sequence does not fit in the available bytes: check the part
     * that is present, then report it as incomplete. */
    if (max_len >= 0 && n_bytes > max_len) {
        for (pos = 1; pos < max_len; pos++) {
            if ((p[pos] & 0xc0) != 0x80)
                return (uint32_t)-1;
        }
        return (uint32_t)-2;
    }

    for (pos = 1; pos < n_bytes; pos++) {
        const uint32_t byte = p[pos];

        if ((byte & 0xc0) != 0x80) {
            /* A nul byte means the string ended mid-sequence. */
            return byte != 0 ? (uint32_t)-1 : (uint32_t)-2;
        }

        code_point = (code_point << 6) | (byte & 0x3f);
    }

    /* Reject encodings longer than the value requires. */
    if (UTF8_LENGTH (code_point) != n_bytes)
        return (uint32_t)-1;

    return code_point;
}
196
197
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a UTF-8 string that has already been
 * validated, and reports how many bytes it occupies.
 *
 * The string is expected to be valid; validate it before calling this.
 * If the first byte cannot start a sequence, (uint32_t)-1 is stored in
 * @unicode and 1 is returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) *p;
    uint32_t decoded;
    int payload_mask = 0;
    int n_bytes;
    int pos;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);

    if (n_bytes == -1) {
        /* Not a valid first byte: consume just this one byte. */
        decoded = (uint32_t)-1;
        n_bytes = 1;
    } else {
        UTF8_GET (decoded, p, pos, payload_mask, n_bytes);
    }

    if (unicode != NULL)
        *unicode = decoded;

    return n_bytes;
}
230
231
/**
232
 * _cairo_utf8_to_ucs4:
233
 * @str: an UTF-8 string
234
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
235
 *   If @len is supplied and the string has an embedded nul
236
 *   byte, only the portion before the nul byte is converted.
237
 * @result: location to store a pointer to a newly allocated UTF-32
238
 *   string (always native endian), or %NULL. Free with free(). A 0
239
 *   word will be written after the last character.
240
 * @items_written: location to store number of 32-bit words
241
 *   written. (Not including the trailing 0)
242
 *
243
 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
244
 * with 1 32-bit word per character. The string is validated to
245
 * consist entirely of valid Unicode characters.
246
 *
247
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
248
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
249
 *   invalid sequence was found.
250
 **/
251
cairo_status_t
252
_cairo_utf8_to_ucs4 (const char *str,
253
         int   len,
254
         uint32_t  **result,
255
         int  *items_written)
256
0
{
257
0
    uint32_t *str32 = NULL;
258
0
    int n_chars, i;
259
0
    const unsigned char *in;
260
0
    const unsigned char * const ustr = (const unsigned char *) str;
261
262
0
    in = ustr;
263
0
    n_chars = 0;
264
0
    while ((len < 0 || ustr + len - in > 0) && *in)
265
0
    {
266
0
  uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
267
0
  if (wc & 0x80000000 || !UNICODE_VALID (wc))
268
0
      return _cairo_error (CAIRO_STATUS_INVALID_STRING);
269
270
0
  n_chars++;
271
0
  if (n_chars == INT_MAX)
272
0
      return _cairo_error (CAIRO_STATUS_INVALID_STRING);
273
274
0
  in = UTF8_NEXT_CHAR (in);
275
0
    }
276
277
0
    if (result) {
278
0
  str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
279
0
  if (!str32)
280
0
      return _cairo_error (CAIRO_STATUS_NO_MEMORY);
281
282
0
  in = ustr;
283
0
  for (i=0; i < n_chars; i++) {
284
0
      str32[i] = _utf8_get_char (in);
285
0
      in = UTF8_NEXT_CHAR (in);
286
0
  }
287
0
  str32[i] = 0;
288
289
0
  *result = str32;
290
0
    }
291
292
0
    if (items_written)
293
0
  *items_written = n_chars;
294
295
0
    return CAIRO_STATUS_SUCCESS;
296
0
}
297
298
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 * space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. When @utf8 is %NULL nothing is written and
 * only the encoded length is computed. No terminating nul is written.
 * Values below 0x200000 are encoded (in at most 4 bytes); no check for
 * surrogates or for values of 0x110000 and above is made here.
 *
 * Return value: Number of bytes in the utf8 string or 0 if @unicode is
 * 0x200000 or greater.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
                     char     *utf8)
{
    unsigned int lead_marker;
    int bytes;
    int i;

    /* ASCII is stored as is. */
    if (unicode < 0x80) {
        if (utf8 != NULL)
            utf8[0] = unicode;
        return 1;
    }

    if (unicode >= 0x200000)
        return 0;

    if (unicode < 0x800) {
        bytes = 2;
        lead_marker = 0xc0;
    } else if (unicode < 0x10000) {
        bytes = 3;
        lead_marker = 0xe0;
    } else {
        bytes = 4;
        lead_marker = 0xf0;
    }

    if (utf8 == NULL)
        return bytes;

    /* Fill the continuation bytes from the last one backwards, six
     * payload bits at a time. */
    for (i = bytes - 1; i > 0; i--) {
        utf8[i] = 0x80 | (unicode & 0x3f);
        unicode >>= 6;
    }

    /* What is left of the value fits beside the length marker. */
    utf8[0] = lead_marker | unicode;

    return bytes;
}
342
343
/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 * elements. Or %NULL.
 *
 * Encodes @unicode as UTF-16. Values below 0x10000 are stored as a
 * single element, unchanged; values from 0x10000 up to 0x10FFFF are
 * stored as a high/low surrogate pair. When @utf16 is %NULL nothing is
 * written and only the element count is computed. No terminating 0 is
 * written.
 *
 * Return value: Number of elements in the utf16 string or 0 if
 * @unicode is 0x110000 or greater.
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
                      uint16_t *utf16)
{
    uint32_t offset;

    if (unicode >= 0x110000)
        return 0;

    if (unicode < 0x10000) {
        if (utf16 != NULL)
            utf16[0] = unicode;
        return 1;
    }

    if (utf16 != NULL) {
        /* 20-bit offset into the supplementary planes: the upper ten
         * bits go into the high surrogate, the lower ten into the low. */
        offset = unicode - 0x10000;
        utf16[0] = 0xd800 + (offset >> 10);
        utf16[1] = 0xdc00 + (offset & 0x3ff);
    }

    return 2;
}
372
373
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character. Unlike
 *   @items_written this is stored to unconditionally, so it must
 *   not be %NULL.
 * @items_written: location to store number of 16-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * The conversion makes two passes: the first validates the input and
 * counts the 16-bit words needed, the second encodes into the
 * allocated buffer.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found (or the word count would reach
 *   INT_MAX - 1). %CAIRO_STATUS_NO_MEMORY if the buffer could not be
 *   allocated. @result and @items_written are only written on success.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    const unsigned char * const ustr = (const unsigned char *) str;
    const unsigned char *in = ustr;
    uint16_t *str16 = NULL;
    int n16 = 0;
    int i;

    /* Pass 1: validate every character and count the words needed. */
    while ((len < 0 || in - ustr < len) && *in) {
        /* Bytes left in a counted buffer, or -1 (no limit) for a
         * nul-terminated string.  This is derived from offsets rather
         * than from ustr + len, so that no pointer in front of @str is
         * ever formed when len is negative. */
        const long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
        uint32_t wc = _utf8_get_char_extended (in, remaining);

        /* The top bit flags the -1/-2 error results of the decoder. */
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Characters outside the BMP take a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* Stop while both the next "+= 2" and the "n16 + 1" used for
         * the allocation are still representable. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
        return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Pass 2: the input is known to be valid, so decode directly. */
    in = ustr;
    for (i = 0; i < n16;) {
        uint32_t wc = _utf8_get_char (in);

        i += _cairo_ucs4_to_utf16 (wc, str16 + i);

        in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif