/src/cairo/src/cairo-unicode.c

Source
/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
/* cairo - a vector graphics library with display and print output
 *
 * The code in this file is derived from GLib's gutf8.c and
 *   ultimately from libunicode. It is relicensed under the
 *   dual LGPL/MPL with permission of the original authors.
 *
 * Copyright © 1999 Tom Tromey
 * Copyright © 2005 Red Hat, Inc
 *
 * This library is free software; you can redistribute it and/or
 * modify it either under the terms of the GNU Lesser General Public
 * License version 2.1 as published by the Free Software Foundation
 * (the "LGPL") or, at your option, under the terms of the Mozilla
 * Public License Version 1.1 (the "MPL"). If you do not alter this
 * notice, a recipient may use your version of this file under either
 * the MPL or the LGPL.
 *
 * You should have received a copy of the LGPL along with this library
 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
 * You should have received a copy of the MPL along with this library
 * in the file COPYING-MPL-1.1
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
 * the specific language governing rights and limitations.
 *
 * The Original Code is the cairo graphics library.
 *
 * The Initial Developer of the Original Code is Tom Tromey.
 *  and Red Hat, Inc.
 *
 * Contributor(s):
 *  Owen Taylor <otaylor@redhat.com>
 */

#include "cairoint.h"
#include "cairo-error-private.h"

/* Classify @Char, the first byte of a UTF-8 sequence.
 *
 * On success @Len receives the total number of bytes in the sequence
 * (1 to 6; the 5- and 6-byte forms of the original UTF-8 definition
 * are recognised) and @Mask receives the mask selecting the payload
 * bits of the first byte. When @Char cannot start a sequence
 * (0x80-0xbf, 0xfe, 0xff) @Len is set to -1 and @Mask is not written.
 *
 * @Char is expanded several times and must not have side effects.
 * The expansion is a single statement (do { } while (0)), so the macro
 * is safe as the unbraced body of an if/else; invoke it with a
 * trailing semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                   \
    do {                                                \
        if ((Char) < 128) {                             \
            (Len) = 1;                                  \
            (Mask) = 0x7f;                              \
        } else if (((Char) & 0xe0) == 0xc0) {           \
            (Len) = 2;                                  \
            (Mask) = 0x1f;                              \
        } else if (((Char) & 0xf0) == 0xe0) {           \
            (Len) = 3;                                  \
            (Mask) = 0x0f;                              \
        } else if (((Char) & 0xf8) == 0xf0) {           \
            (Len) = 4;                                  \
            (Mask) = 0x07;                              \
        } else if (((Char) & 0xfc) == 0xf8) {           \
            (Len) = 5;                                  \
            (Mask) = 0x03;                              \
        } else if (((Char) & 0xfe) == 0xfc) {           \
            (Len) = 6;                                  \
            (Mask) = 0x01;                              \
        } else {                                        \
            (Len) = -1;                                 \
        }                                               \
    } while (0)

/* Number of bytes (1 to 6) that the code point @Char occupies in
 * UTF-8: one byte, plus one more for every encoding-size threshold
 * that @Char reaches. @Char is expanded several times and must not
 * have side effects.
 */
#define UTF8_LENGTH(Char)               \
    (1 + ((Char) >= 0x80)               \
       + ((Char) >= 0x800)              \
       + ((Char) >= 0x10000)            \
       + ((Char) >= 0x200000)           \
       + ((Char) >= 0x4000000))

/* Decode a UTF-8 sequence of @Len bytes starting at @Chars.
 *
 * @Mask selects the payload bits of the first byte; every following
 * byte must be a continuation byte (10xxxxxx) and contributes six
 * bits. @Result receives the decoded value, or -1 as soon as a byte
 * that is not a continuation byte is met. @Count is a scratch
 * variable used as the byte index.
 *
 * The expansion is a single statement (do { } while (0)); the break
 * below only leaves the inner for loop. Invoke the macro with a
 * trailing semicolon.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
    do {                                                        \
        (Result) = (Chars)[0] & (Mask);                         \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {         \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {            \
                (Result) = -1;                                  \
                break;                                          \
            }                                                   \
            (Result) <<= 6;                                     \
            (Result) |= ((Chars)[(Count)] & 0x3f);              \
        }                                                       \
    } while (0)

/* Non-zero unless @Char lies beyond U+10FFFF or inside the UTF-16
 * surrogate range U+D800..U+DFFF. @Char is expanded twice and must
 * not have side effects.
 */
#define UNICODE_VALID(Char)                         \
    (!((Char) >= 0x110000 ||                        \
       (((Char) & 0xFFFFF800) == 0xD800)))

/* Byte length of the UTF-8 sequence introduced by each possible first
 * byte, indexed by the value of that byte. Bytes that cannot start a
 * sequence (0x80-0xbf, 0xfe, 0xff) are given a length of 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Address of the byte that follows the sequence starting at @p, going
 * by the first byte of that sequence only. */
#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])

/* Decodes the UTF-8 sequence that starts at @p into a Unicode
 * character.
 *
 * The sequence length is taken from the first byte alone, and that
 * many bytes are read from @p, so this is only meant for input that
 * has already been validated. Returns (uint32_t)-1 when the first byte
 * cannot start a sequence or when a byte that should be a continuation
 * byte is not one; overlong forms and out-of-range values are not
 * detected here.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t ucs4;
    int payload_mask = 0;
    int n_bytes;
    int k;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes != -1) {
        UTF8_GET (ucs4, p, k, payload_mask, n_bytes);
    } else {
        ucs4 = (uint32_t)-1;
    }

    return ucs4;
}

/* Validating variant of _utf8_get_char.
 *
 * @max_len is the number of bytes available at @p; a negative value
 * means there is no explicit limit.
 *
 * Returns the decoded character, or
 *   (uint32_t)-1 for a malformed or overlong sequence,
 *   (uint32_t)-2 for a sequence cut short, either by @max_len or by a
 *                nul byte where a continuation byte was expected.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    const unsigned char lead = p[0];
    uint32_t wc;
    int len, i;

    /* Plain ASCII needs no further work. */
    if (lead < 0x80)
        return lead;

    /* 0x80-0xbf are continuation bytes, 0xfe and 0xff are never used:
     * none of them can start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return (uint32_t)-1;

    /* The number of leading 1 bits in the first byte is the sequence
     * length (2 to 6); the bits below the terminating 0 are payload. */
    len = 2;
    while (lead & (0x80 >> len))
        len++;
    wc = lead & (0x7f >> len);

    if (max_len >= 0 && len > max_len) {
        /* The sequence runs past the available bytes. It is reported
         * as incomplete, unless the bytes that are present already
         * show it to be malformed. */
        for (i = 1; i < max_len; i++) {
            if ((p[i] & 0xc0) != 0x80)
                return (uint32_t)-1;
        }
        return (uint32_t)-2;
    }

    for (i = 1; i < len; i++) {
        const unsigned char ch = p[i];

        /* A nul byte here means the string ended mid-sequence. */
        if ((ch & 0xc0) != 0x80)
            return ch == 0 ? (uint32_t)-2 : (uint32_t)-1;

        wc = (wc << 6) | (ch & 0x3f);
    }

    /* Reject overlong encodings: the value must need exactly len bytes. */
    if (UTF8_LENGTH (wc) != len)
        return (uint32_t)-1;

    return wc;
}

/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store the decoded Unicode character, or %NULL
 *
 * Decodes the character at the start of @p and reports how many bytes
 * it occupies.
 *
 * The string is expected to have been validated beforehand: the
 * sequence length is derived from the first byte only. If that byte
 * cannot start a sequence, (uint32_t)-1 is stored and 1 is returned;
 * if a continuation byte is malformed, (uint32_t)-1 is stored and the
 * length implied by the first byte is returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) p[0];
    uint32_t decoded = (uint32_t)-1;
    int payload_mask = 0;
    int n_bytes;
    int k;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes == -1) {
        /* Not a valid first byte: step over this single byte. */
        n_bytes = 1;
    } else {
        UTF8_GET (decoded, p, k, payload_mask, n_bytes);
    }

    if (unicode)
        *unicode = decoded;

    return n_bytes;
}

/**
 * _cairo_utf8_to_ucs4:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-32
 *   string (always native endian), or %NULL. Free with free(). A 0
 *   word will be written after the last character.
 * @items_written: location to store number of 32-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
 * with 1 32-bit word per character. The string is validated to
 * consist entirely of valid Unicode characters. When @result is %NULL
 * the string is only validated and counted.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found or the character count reaches INT_MAX.
 *   %CAIRO_STATUS_NO_MEMORY if the output buffer cannot be allocated.
 *   On failure neither @result nor @items_written is written.
 **/
cairo_status_t
_cairo_utf8_to_ucs4 (const char *str,
                     int         len,
                     uint32_t  **result,
                     int        *items_written)
{
    const unsigned char * const ustr = (const unsigned char *) str;
    const unsigned char *in = ustr;
    uint32_t *str32 = NULL;
    int n_chars = 0;
    int i;

    /* Pass 1: validate the input and count its characters. */
    for (;;) {
        /* Bytes left in the caller-supplied window; a negative value
         * tells _utf8_get_char_extended() that there is no limit.
         * It is derived from the integer offset rather than from
         * "ustr + len", which for a negative @len would form a
         * pointer in front of the string (undefined behaviour). */
        long remaining = -1;
        uint32_t wc;

        if (len >= 0) {
            remaining = (long) len - (long) (in - ustr);
            if (remaining <= 0)
                break;
        }
        if (*in == 0)
            break;

        /* Both error codes (-1 and -2) have the top bit set. */
        wc = _utf8_get_char_extended (in, remaining);
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        n_chars++;
        if (n_chars == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    /* Pass 2: the input is known to be valid, so decode it with the
     * unchecked helper. */
    if (result) {
        str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
        if (!str32)
            return _cairo_error (CAIRO_STATUS_NO_MEMORY);

        in = ustr;
        for (i = 0; i < n_chars; i++) {
            str32[i] = _utf8_get_char (in);
            in = UTF8_NEXT_CHAR (in);
        }
        str32[i] = 0;

        *result = str32;
    }

    if (items_written)
        *items_written = n_chars;

    return CAIRO_STATUS_SUCCESS;
}

/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 * space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. No terminating nul byte is written. When
 * @utf8 is %NULL only the encoded length is computed.
 *
 * Values above U+10FFFF lie outside Unicode and are rejected, in line
 * with _cairo_ucs4_to_utf16(). Surrogate code points (U+D800..U+DFFF)
 * are not rejected and are encoded as ordinary 3-byte sequences.
 *
 * Return value: Number of bytes in the utf8 string or 0 if an invalid
 * unicode character
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
                     char     *utf8)
{
    int bytes;
    char *p;

    if (unicode < 0x80) {
        if (utf8)
            *utf8 = (char) unicode;
        return 1;
    } else if (unicode < 0x800) {
        bytes = 2;
    } else if (unicode < 0x10000) {
        bytes = 3;
    } else if (unicode < 0x110000) {
        bytes = 4;
    } else {
        return 0;
    }

    if (!utf8)
        return bytes;

    /* Continuation bytes are filled in from the last one backwards,
     * each carrying the next six low-order bits. */
    for (p = utf8 + bytes - 1; p > utf8; p--) {
        *p = (char) (0x80 | (unicode & 0x3f));
        unicode >>= 6;
    }

    /* First byte: the length marker (0xc0, 0xe0 or 0xf0) combined with
     * the bits that remain. */
    *p = (char) (((0xf0 << (4 - bytes)) & 0xff) | unicode);

    return bytes;
}

/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 * elements. Or %NULL.
 *
 * Encodes @unicode as UTF-16: a single element for values below
 * 0x10000, a high/low surrogate pair for values up to 0x10FFFF. No
 * terminating 0 is written. When @utf16 is %NULL only the number of
 * elements is computed.
 *
 * Return value: Number of elements in the utf16 string or 0 if an
 * invalid unicode character
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
                      uint16_t *utf16)
{
    uint32_t offset;

    if (unicode >= 0x110000)
        return 0;

    if (unicode < 0x10000) {
        if (utf16)
            utf16[0] = unicode;
        return 1;
    }

    if (utf16) {
        /* 20-bit offset into the supplementary planes: the upper ten
         * bits go into the high surrogate, the lower ten into the low
         * surrogate. */
        offset = unicode - 0x10000;
        utf16[0] = 0xd800 + (offset >> 10);
        utf16[1] = 0xdc00 + (offset & 0x3ff);
    }

    return 2;
}

#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character. Unlike
 *   @items_written, this must not be %NULL.
 * @items_written: location to store number of 16-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found or the output length would reach
 *   INT_MAX - 1. %CAIRO_STATUS_NO_MEMORY if the output buffer cannot
 *   be allocated. On failure neither @result nor @items_written is
 *   written.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    const unsigned char * const ustr = (const unsigned char *) str;
    const unsigned char *in = ustr;
    uint16_t *str16 = NULL;
    int n16 = 0;
    int i;

    /* Pass 1: validate the input and count the 16-bit words needed. */
    for (;;) {
        /* Bytes left in the caller-supplied window; a negative value
         * tells _utf8_get_char_extended() that there is no limit.
         * It is derived from the integer offset rather than from
         * "ustr + len", which for a negative @len would form a
         * pointer in front of the string (undefined behaviour). */
        long remaining = -1;
        uint32_t wc;

        if (len >= 0) {
            remaining = (long) len - (long) (in - ustr);
            if (remaining <= 0)
                break;
        }
        if (*in == 0)
            break;

        /* Both error codes (-1 and -2) have the top bit set. */
        wc = _utf8_get_char_extended (in, remaining);
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Characters outside the BMP take a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* Stop while n16 + 1 (and a further pair) still fits an int. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
        return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Pass 2: the input is known to be valid, so decode it with the
     * unchecked helper; each character yields one or two words. */
    in = ustr;
    for (i = 0; i < n16;) {
        uint32_t wc = _utf8_get_char (in);

        i += _cairo_ucs4_to_utf16 (wc, str16 + i);

        in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif

Coverage Report

Created: 2025-11-16 07:45

Line	Count	Source
1		/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2		/* cairo - a vector graphics library with display and print output
3		*
4		* The code in this file is derived from GLib's gutf8.c and
5		* ultimately from libunicode. It is relicensed under the
6		* dual LGPL/MPL with permission of the original authors.
7		*
8		* Copyright © 1999 Tom Tromey
9		* Copyright © 2005 Red Hat, Inc
10		*
11		* This library is free software; you can redistribute it and/or
12		* modify it either under the terms of the GNU Lesser General Public
13		* License version 2.1 as published by the Free Software Foundation
14		* (the "LGPL") or, at your option, under the terms of the Mozilla
15		* Public License Version 1.1 (the "MPL"). If you do not alter this
16		* notice, a recipient may use your version of this file under either
17		* the MPL or the LGPL.
18		*
19		* You should have received a copy of the LGPL along with this library
20		* in the file COPYING-LGPL-2.1; if not, write to the Free Software
21		* Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22		* You should have received a copy of the MPL along with this library
23		* in the file COPYING-MPL-1.1
24		*
25		* The contents of this file are subject to the Mozilla Public License
26		* Version 1.1 (the "License"); you may not use this file except in
27		* compliance with the License. You may obtain a copy of the License at
28		* http://www.mozilla.org/MPL/
29		*
30		* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31		* OF ANY KIND, either express or implied. See the LGPL or the MPL for
32		* the specific language governing rights and limitations.
33		*
34		* The Original Code is the cairo graphics library.
35		*
36		* The Initial Developer of the Original Code is Tom Tromey.
37		* and Red Hat, Inc.
38		*
39		* Contributor(s):
40		* Owen Taylor <otaylor@redhat.com>
41		*/
42
43		#include "cairoint.h"
44		#include "cairo-error-private.h"
45
46		#define UTF8_COMPUTE(Char, Mask, Len) \
47	47.1k	if (Char < 128) \
48	47.1k	{ \
49	45.9k	Len = 1; \
50	45.9k	Mask = 0x7f; \
51	45.9k	} \
52	47.1k	else if ((Char & 0xe0) == 0xc0) \
53	1.15k	{ \
54	352	Len = 2; \
55	352	Mask = 0x1f; \
56	352	} \
57	1.15k	else if ((Char & 0xf0) == 0xe0) \
58	802	{ \
59	786	Len = 3; \
60	786	Mask = 0x0f; \
61	786	} \
62	802	else if ((Char & 0xf8) == 0xf0) \
63	16	{ \
64	16	Len = 4; \
65	16	Mask = 0x07; \
66	16	} \
67	16	else if ((Char & 0xfc) == 0xf8) \
68	0	{ \
69	0	Len = 5; \
70	0	Mask = 0x03; \
71	0	} \
72	0	else if ((Char & 0xfe) == 0xfc) \
73	0	{ \
74	0	Len = 6; \
75	0	Mask = 0x01; \
76	0	} \
77	0	else \
78	0	Len = -1;
79
80		#define UTF8_LENGTH(Char) \
81	2.00k	((Char) < 0x80 ? 1 : \
82	2.00k	((Char) < 0x800 ? 2 : \
83	2.00k	((Char) < 0x10000 ? 3 : \
84	1.28k	((Char) < 0x200000 ? 4 : \
85	16	((Char) < 0x4000000 ? 5 : 6)))))
86
87		#define UTF8_GET(Result, Chars, Count, Mask, Len) \
88	47.1k	(Result) = (Chars)[0] & (Mask); \
89	49.1k	for ((Count) = 1; (Count) < (Len); ++(Count)) \
90	47.1k	{ \
91	1.97k	if (((Chars)[(Count)] & 0xc0) != 0x80) \
92	1.97k	{ \
93	0	(Result) = -1; \
94	0	break; \
95	0	} \
96	1.97k	(Result) <<= 6; \
97	1.97k	(Result) |= ((Chars)[(Count)] & 0x3f); \
98	1.97k	}
99
100		#define UNICODE_VALID(Char) \
101	264k	((Char) < 0x110000 && \
102	264k	(((Char) & 0xFFFFF800) != 0xD800))
103
104		static const char utf8_skip_data[256] = {
105		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
106		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
107		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
110		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
112		3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
113		};
114
115	298k	#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(unsigned char *)(p)])
116
117		/* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
118		* If @p does not point to a valid UTF-8 encoded character, results are
119		* undefined.
120		**/
121		static uint32_t
122		_utf8_get_char (const unsigned char *p)
123	33.7k	{
124	33.7k	int i, mask = 0, len;
125	33.7k	uint32_t result;
126	33.7k	unsigned char c = (unsigned char) *p;
127
128	33.7k	UTF8_COMPUTE (c, mask, len);
129	33.7k	if (len == -1)
130	0	return (uint32_t)-1;
131	33.7k	UTF8_GET (result, p, i, mask, len);
132
133	33.7k	return result;
134	33.7k	}
135
136		/* Like _utf8_get_char, but take a maximum length
137		* and return (uint32_t)-2 on incomplete trailing character
138		*/
139		static uint32_t
140		_utf8_get_char_extended (const unsigned char *p,
141		long max_len)
142	264k	{
143	264k	int i, len;
144	264k	uint32_t wc = (unsigned char) *p;
145
146	264k	if (wc < 0x80) {
147	262k	return wc;
148	262k	} else if (wc < 0xc0) {
149	0	return (uint32_t)-1;
150	2.00k	} else if (wc < 0xe0) {
151	726	len = 2;
152	726	wc &= 0x1f;
153	1.28k	} else if (wc < 0xf0) {
154	1.26k	len = 3;
155	1.26k	wc &= 0x0f;
156	1.26k	} else if (wc < 0xf8) {
157	16	len = 4;
158	16	wc &= 0x07;
159	16	} else if (wc < 0xfc) {
160	0	len = 5;
161	0	wc &= 0x03;
162	0	} else if (wc < 0xfe) {
163	0	len = 6;
164	0	wc &= 0x01;
165	0	} else {
166	0	return (uint32_t)-1;
167	0	}
168
169	2.00k	if (max_len >= 0 && len > max_len) {
170	0	for (i = 1; i < max_len; i++) {
171	0	if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
172	0	return (uint32_t)-1;
173	0	}
174	0	return (uint32_t)-2;
175	0	}
176
177	5.30k	for (i = 1; i < len; ++i) {
178	3.30k	uint32_t ch = ((unsigned char *)p)[i];
179
180	3.30k	if ((ch & 0xc0) != 0x80) {
181	0	if (ch)
182	0	return (uint32_t)-1;
183	0	else
184	0	return (uint32_t)-2;
185	0	}
186
187	3.30k	wc <<= 6;
188	3.30k	wc |= (ch & 0x3f);
189	3.30k	}
190
191	2.00k	if (UTF8_LENGTH(wc) != len)
192	0	return (uint32_t)-1;
193
194	2.00k	return wc;
195	2.00k	}
196
197		/**
198		* _cairo_utf8_get_char_validated:
199		* @p: a UTF-8 string
200		* @unicode: location to store one Unicode character
201		*
202		* Decodes the first character of a valid UTF-8 string, and returns
203		* the number of bytes consumed.
204		*
205		* Note that the string should be valid. Do not use this without
206		* validating the string first.
207		*
208		* Returns: the number of bytes forming the character returned.
209		**/
210		int
211		_cairo_utf8_get_char_validated (const char *p,
212		uint32_t *unicode)
213	13.3k	{
214	13.3k	int i, mask = 0, len;
215	13.3k	uint32_t result;
216	13.3k	unsigned char c = (unsigned char) *p;
217
218	13.3k	UTF8_COMPUTE (c, mask, len);
219	13.3k	if (len == -1) {
220	0	if (unicode)
221	0	*unicode = (uint32_t)-1;
222	0	return 1;
223	0	}
224	13.3k	UTF8_GET (result, p, i, mask, len);
225
226	13.3k	if (unicode)
227	13.3k	*unicode = result;
228	13.3k	return len;
229	13.3k	}
230
231		/**
232		* _cairo_utf8_to_ucs4:
233		* @str: an UTF-8 string
234		* @len: length of @str in bytes, or -1 if it is nul-terminated.
235		* If @len is supplied and the string has an embedded nul
236		* byte, only the portion before the nul byte is converted.
237		* @result: location to store a pointer to a newly allocated UTF-32
238		* string (always native endian), or %NULL. Free with free(). A 0
239		* word will be written after the last character.
240		* @items_written: location to store number of 32-bit words
241		* written. (Not including the trailing 0)
242		*
243		* Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
244		* with 1 32-bit word per character. The string is validated to
245		* consist entirely of valid Unicode characters.
246		*
247		* Return value: %CAIRO_STATUS_SUCCESS if the entire string was
248		* successfully converted. %CAIRO_STATUS_INVALID_STRING if an
249		* invalid sequence was found.
250		**/
251		cairo_status_t
252		_cairo_utf8_to_ucs4 (const char *str,
253		int len,
254		uint32_t **result,
255		int *items_written)
256	265k	{
257	265k	uint32_t *str32 = NULL;
258	265k	int n_chars, i;
259	265k	const unsigned char *in;
260	265k	const unsigned char * const ustr = (const unsigned char *) str;
261
262	265k	in = ustr;
263	265k	n_chars = 0;
264	504k	while ((len < 0 || ustr + len - in > 0) && *in)
265	238k	{
266	238k	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
267	238k	if (wc & 0x80000000 || !UNICODE_VALID (wc))
268	0	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
269
270	238k	n_chars++;
271	238k	if (n_chars == INT_MAX)
272	0	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
273
274	238k	in = UTF8_NEXT_CHAR (in);
275	238k	}
276
277	265k	if (result) {
278	7.69k	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
279	7.69k	if (!str32)
280	0	return _cairo_error (CAIRO_STATUS_NO_MEMORY);
281
282	7.69k	in = ustr;
283	15.3k	for (i=0; i < n_chars; i++) {
284	7.69k	str32[i] = _utf8_get_char (in);
285	7.69k	in = UTF8_NEXT_CHAR (in);
286	7.69k	}
287	7.69k	str32[i] = 0;
288
289	7.69k	*result = str32;
290	7.69k	}
291
292	265k	if (items_written)
293	21.0k	*items_written = n_chars;
294
295	265k	return CAIRO_STATUS_SUCCESS;
296	265k	}
297
298		/**
299		* _cairo_ucs4_to_utf8:
300		* @unicode: a UCS-4 character
301		* @utf8: buffer to write utf8 string into. Must have at least 4 bytes
302		* space available. Or %NULL.
303		*
304		* This space left intentionally blank.
305		*
306		* Return value: Number of bytes in the utf8 string or 0 if an invalid
307		* unicode character
308		**/
309		int
310		_cairo_ucs4_to_utf8 (uint32_t unicode,
311		char *utf8)
312	55.3k	{
313	55.3k	int bytes;
314	55.3k	char *p;
315
316	55.3k	if (unicode < 0x80) {
317	54.4k	if (utf8)
318	37.3k	*utf8 = unicode;
319	54.4k	return 1;
320	54.4k	} else if (unicode < 0x800) {
321	276	bytes = 2;
322	577	} else if (unicode < 0x10000) {
323	561	bytes = 3;
324	561	} else if (unicode < 0x200000) {
325	16	bytes = 4;
326	16	} else {
327	0	return 0;
328	0	}
329
330	853	if (!utf8)
331	0	return bytes;
332
333	853	p = utf8 + bytes;
334	3.15k	while (p > utf8) {
335	2.29k	*--p = 0x80 | (unicode & 0x3f);
336	2.29k	unicode >>= 6;
337	2.29k	}
338	853	*p |= 0xf0 << (4 - bytes);
339
340	853	return bytes;
341	853	}
342
343		/**
344		* _cairo_ucs4_to_utf16:
345		* @unicode: a UCS-4 character
346		* @utf16: buffer to write utf16 string into. Must have at least 2
347		* elements. Or %NULL.
348		*
349		* This space left intentionally blank.
350		*
351		* Return value: Number of elements in the utf16 string or 0 if an
352		* invalid unicode character
353		**/
354		int
355		_cairo_ucs4_to_utf16 (uint32_t unicode,
356		uint16_t *utf16)
357	26.0k	{
358	26.0k	if (unicode < 0x10000) {
359	26.0k	if (utf16)
360	26.0k	utf16[0] = unicode;
361	26.0k	return 1;
362	26.0k	} else if (unicode < 0x110000) {
363	16	if (utf16) {
364	16	utf16[0] = (unicode - 0x10000) / 0x400 + 0xd800;
365	16	utf16[1] = (unicode - 0x10000) % 0x400 + 0xdc00;
366	16	}
367	16	return 2;
368	16	} else {
369	0	return 0;
370	0	}
371	26.0k	}
372
373		#if CAIRO_HAS_UTF8_TO_UTF16
374		/**
375		* _cairo_utf8_to_utf16:
376		* @str: an UTF-8 string
377		* @len: length of @str in bytes, or -1 if it is nul-terminated.
378		* If @len is supplied and the string has an embedded nul
379		* byte, only the portion before the nul byte is converted.
380		* @result: location to store a pointer to a newly allocated UTF-16
381		* string (always native endian). Free with free(). A 0
382		* word will be written after the last character.
383		* @items_written: location to store number of 16-bit words
384		* written. (Not including the trailing 0)
385		*
386		* Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
387		* where characters are represented either as a single 16-bit word, or
388		* as a pair of 16-bit "surrogates". The string is validated to
389		* consist entirely of valid Unicode characters.
390		*
391		* Return value: %CAIRO_STATUS_SUCCESS if the entire string was
392		* successfully converted. %CAIRO_STATUS_INVALID_STRING if an
393		* an invalid sequence was found.
394		**/
395		cairo_status_t
396		_cairo_utf8_to_utf16 (const char *str,
397		int len,
398		uint16_t **result,
399		int *items_written)
400	26.0k	{
401	26.0k	uint16_t *str16 = NULL;
402	26.0k	int n16, i;
403	26.0k	const unsigned char *in;
404	26.0k	const unsigned char * const ustr = (const unsigned char *) str;
405
406	26.0k	in = ustr;
407	26.0k	n16 = 0;
408	52.1k	while ((len < 0 || ustr + len - in > 0) && *in) {
409	26.0k	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
410	26.0k	if (wc & 0x80000000 || !UNICODE_VALID (wc))
411	0	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
412
413	26.0k	if (wc < 0x10000)
414	26.0k	n16 += 1;
415	16	else
416	16	n16 += 2;
417
418	26.0k	if (n16 == INT_MAX - 1 || n16 == INT_MAX)
419	0	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
420
421	26.0k	in = UTF8_NEXT_CHAR (in);
422	26.0k	}
423
424	26.0k	str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
425	26.0k	if (!str16)
426	0	return _cairo_error (CAIRO_STATUS_NO_MEMORY);
427
428	26.0k	in = ustr;
429	52.1k	for (i = 0; i < n16;) {
430	26.0k	uint32_t wc = _utf8_get_char (in);
431
432	26.0k	i += _cairo_ucs4_to_utf16 (wc, str16 + i);
433
434	26.0k	in = UTF8_NEXT_CHAR (in);
435	26.0k	}
436
437	26.0k	str16[i] = 0;
438
439	26.0k	*result = str16;
440	26.0k	if (items_written)
441	26.0k	*items_written = n16;
442
443	26.0k	return CAIRO_STATUS_SUCCESS;
444	26.0k	}
445		#endif