/src/samba/third_party/heimdal/lib/wind/utf8.c

Source
/*
 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
 * (Royal Institute of Technology, Stockholm, Sweden).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the Institute nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <config.h>
#include "windlocl.h"

/*
 * Decode the single UTF-8 sequence that starts at **pp.
 *
 * On success the decoded value is stored in *out, *pp is left pointing at
 * the LAST byte of the sequence (the caller steps past it), and 0 is
 * returned.  On failure WIND_ERR_INVALID_UTF8 is returned and neither *pp
 * nor *out is written.
 *
 * Rejected input: a lead byte that is a bare continuation byte or that
 * announces more than four bytes, a missing/invalid continuation byte,
 * an overlong encoding, and any value above 0x10ffff.
 */
static int
utf8toutf32(const unsigned char **pp, uint32_t *out)
{
    const unsigned char *cur = *pp;
    const uint32_t lead = *cur;
    uint32_t cp;           /* code point being assembled */
    uint32_t smallest;     /* smallest value this sequence length may encode */
    unsigned int trailing; /* number of continuation bytes still expected */

    if ((lead & 0x80) == 0) {
        cp = lead;
        smallest = 0;
        trailing = 0;
    } else if ((lead & 0xE0) == 0xC0) {
        cp = lead & 0x1F;
        smallest = 0x80;
        trailing = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        cp = lead & 0x0F;
        smallest = 0x800;
        trailing = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        cp = lead & 0x07;
        smallest = 0x10000;
        trailing = 3;
    } else {
        return WIND_ERR_INVALID_UTF8;
    }

    /*
     * Bytes are examined strictly one at a time: a NUL terminator fails
     * the continuation test, so nothing past it is ever read.
     */
    for (; trailing > 0; trailing--) {
        const uint32_t next = *++cur;

        if ((next & 0xC0) != 0x80)
            return WIND_ERR_INVALID_UTF8;
        cp = (cp << 6) | (next & 0x3F);
    }

    /* Overlong form: the value would have fit in a shorter sequence. */
    if (cp < smallest)
        return WIND_ERR_INVALID_UTF8;

    /* Allow unpaired surrogates (in the range 0xd800-0xdfff). */

    if (cp > 0x10ffff)
        return WIND_ERR_INVALID_UTF8;

    *out = cp;
    *pp = cur;

    return 0;
}

/**
 * Convert a NUL-terminated UTF-8 string to a UCS4 string.
 *
 * @param in the UTF-8 string to convert.
 * @param out receives the UCS4 code points; it needs room for at least
 * as many elements as wind_utf8ucs4_length() reports.  No terminator is
 * stored.  If out is NULL nothing is stored and only the required
 * number of elements is computed (just like wind_utf8ucs4_length()).
 * @param out_len on entry the number of elements available in out (not
 * read when out is NULL); on success it is set to the number of code
 * points in the converted string.  It is left untouched on failure.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
{
    const unsigned char *cursor = (const unsigned char *)in;
    size_t count = 0;

    while (*cursor != '\0') {
        uint32_t cp;
        const int rc = utf8toutf32(&cursor, &cp);

        if (rc != 0)
            return rc;

        if (out != NULL) {
            if (count >= *out_len)
                return WIND_ERR_OVERRUN;
            out[count] = cp;
        }
        count++;

        /* The decoder leaves cursor on the sequence's last byte. */
        cursor++;
    }

    *out_len = count;
    return 0;
}

/**
 * Calculate the number of UCS4 code points that converting a UTF-8
 * string produces.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len set to the length of the resulting UCS4 string.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4_length(const char *in, size_t *out_len)
{
    /* A NULL destination makes the converter count without storing. */
    uint32_t *const count_only = NULL;

    return wind_utf8ucs4(in, count_only, out_len);
}

/*
 * Lead-byte marker bits for a UTF-8 sequence, indexed by (length - 1).
 * The type is unsigned char because 0xC0, 0xE0 and 0xF0 do not fit in a
 * signed char; with plain char their stored value would depend on the
 * implementation.
 */
static const unsigned char first_char[4] =
    { 0x00, 0xC0, 0xE0, 0xF0 };

/**
 * Convert an UCS4 string to a UTF-8 string.
 *
 * @param in an UCS4 string to convert.
 * @param in_len the number of elements in the input array.
 * @param out the resulting UTF-8 string, must be at least
 * wind_ucs4utf8_length() + 1 bytes long (the extra byte is for the
 * terminating NUL).  If out is NULL, the function only calculates the
 * needed space (just like wind_ucs4utf8_length()).
 * @param out_len before processing out_len should be the size in bytes
 * of the out buffer, including room for the NUL (it is not read when
 * out is NULL); after successful processing it is the length of the
 * out string, not counting the NUL.  It is left untouched on failure.
 *
 * @return returns 0 on success, WIND_ERR_INVALID_UTF32 if an input
 * value is above 0x10FFFF, WIND_ERR_OVERRUN if out is too small.
 * @ingroup wind
 */

int
wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
{
    uint32_t ch;
    size_t i, len, o;

    for (o = 0, i = 0; i < in_len; i++) {
        ch = in[i];

        if (ch < 0x80) {
            len = 1;
        } else if (ch < 0x800) {
            len = 2;
        } else if (ch < 0x10000) {
            len = 3;
        } else if (ch <= 0x10FFFF) {
            len = 4;
        } else
            return WIND_ERR_INVALID_UTF32;

        o += len;

        if (out) {
            /* '>=' rather than '>': one byte must stay free for the NUL. */
            if (o >= *out_len)
                return WIND_ERR_OVERRUN;

            /* Fill the sequence back to front, six payload bits per byte. */
            switch(len) {
            case 4:
                out[3] = (ch | 0x80) & 0xbf;
                ch = ch >> 6;
                HEIM_FALLTHROUGH;
            case 3:
                out[2] = (ch | 0x80) & 0xbf;
                ch = ch >> 6;
                HEIM_FALLTHROUGH;
            case 2:
                out[1] = (ch | 0x80) & 0xbf;
                ch = ch >> 6;
                HEIM_FALLTHROUGH;
            case 1:
                /* The cast keeps the marker bits positive even if the
                 * table's element type is a signed char. */
                out[0] = ch | (unsigned char)first_char[len - 1];
                break;
            default:
                break;
            }
            out += len;
        }
    }
    if (out) {
        /*
         * o bytes were written, so the NUL goes at index o and fits
         * whenever o < *out_len.  The previous test (o + 1 >= *out_len)
         * rejected a buffer of exactly length + 1 bytes, contradicting
         * the documented size requirement and wind_ucs2utf8().
         */
        if (o >= *out_len)
            return WIND_ERR_OVERRUN;
        *out = '\0';
    }
    *out_len = o;
    return 0;
}

/**
 * Calculate the number of bytes that converting a UCS4 string to UTF-8
 * produces (the terminating NUL is not counted).
 *
 * @param in the UCS4 string to measure.
 * @param in_len the number of elements in the UCS4 string.
 * @param out_len set to the length of the resulting UTF-8 string.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL destination makes the converter count without storing. */
    char *const count_only = NULL;

    return wind_ucs4utf8(in, in_len, count_only, out_len);
}

/**
 * Read UCS2 code units from a byte buffer.
 *
 * @param ptr The input buffer to read from.
 * @param len the length of the input buffer in bytes.
 * @param flags Flags to control the behavior of the function
 * (WIND_RW_BOM, WIND_RW_LE, WIND_RW_BE); may be updated, see below.
 * @param out the output UCS2 array, one element is stored for every
 * two input bytes that follow an optional byte order mark.
 * @param out_len on entry the number of elements available in out, on
 * success the number of elements stored.
 *
 * @return returns 0 on success, an wind error code otherwise.
 * @ingroup wind
 */

int
wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
              uint16_t *out, size_t *out_len)
{
    const unsigned char *src = ptr;
    const size_t capacity = *out_len;
    size_t stored = 0;
    int little_endian = ((*flags) & WIND_RW_LE);

    /** if len is zero, flags are unchanged */
    if (len == 0) {
        *out_len = 0;
        return 0;
    }

    /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
    if (len & 1)
        return WIND_ERR_LENGTH_NOT_MOD2;

    /**
     * If the flag WIND_RW_BOM is set, look for a byte order mark.  If
     * none is found, fall back to an LE/BE flag that is already set,
     * otherwise fail with WIND_ERR_NO_BOM.  When done, WIND_RW_BOM and
     * both LE/BE flags are cleared and the resulting LE/BE flag is set.
     */
    if ((*flags) & WIND_RW_BOM) {
        const uint16_t bom = (src[0] << 8) + src[1];

        if (bom == 0xfffe || bom == 0xfeff) {
            /* Bytes FF FE, read big-endian as 0xfffe, mean little-endian. */
            little_endian = (bom == 0xfffe);
            src += 2;
            len -= 2;
        } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) == 0) {
            return WIND_ERR_NO_BOM;
        }
        /* else: keep the byte order taken from the caller's flags */

        *flags &= ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE);
        *flags |= little_endian ? WIND_RW_LE : WIND_RW_BE;
    }

    for (; len != 0; len -= 2, src += 2) {
        if (stored >= capacity)
            return WIND_ERR_OVERRUN;

        if (!little_endian)
            out[stored] = (src[0] << 8) + src[1];
        else
            out[stored] = (src[1] << 8) + src[0];
        stored++;
    }

    *out_len = stored;
    return 0;
}

/**
 * Write an UCS2 string to a byte buffer.
 *
 * @param in The input UCS2 string.
 * @param in_len the number of elements in the input string.
 * @param flags Flags to control the behavior of the function.  If
 * WIND_RW_BOM is set a byte order mark is written first; if WIND_RW_LE
 * is set the output is little-endian, otherwise big-endian.  The flags
 * are not modified.
 * @param ptr The output buffer to write to; it must be at least
 * in_len * 2 bytes long, plus 2 bytes when a byte order mark is
 * requested.
 * @param out_len on entry the size of the output buffer in bytes, on
 * success the number of bytes written (including the byte order mark).
 *
 * @return returns 0 on success, an wind error code otherwise.
 * @ingroup wind
 */

int
wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
               void *ptr, size_t *out_len)
{
    unsigned char *p = ptr;
    size_t len = *out_len;

    /** If the out buffer length is not a multiple of 2,
     * WIND_ERR_LENGTH_NOT_MOD2 is returned */
    if (len & 1)
        return WIND_ERR_LENGTH_NOT_MOD2;

    /** On zero input length, flags are preserved */
    if (in_len == 0) {
        *out_len = 0;
        return 0;
    }
    /** If flags have WIND_RW_BOM set, the byte order mark is written
     * first to the output data */
    if ((*flags) & WIND_RW_BOM) {
        /*
         * The byte order mark is U+FEFF.  Written little-endian it is
         * the byte pair FF FE, which wind_ucs2read() recognises as
         * little-endian; 0xfffe would announce the opposite order.
         */
        const uint16_t bom = 0xfeff;

        if (len < 2)
            return WIND_ERR_OVERRUN;

        if ((*flags) & WIND_RW_LE) {
            p[0] = (bom     ) & 0xff;
            p[1] = (bom >> 8) & 0xff;
        } else {
            p[1] = (bom     ) & 0xff;
            p[0] = (bom >> 8) & 0xff;
        }
        /* Step past the mark so the first code unit does not overwrite it. */
        p += 2;
        len -= 2;
    }

    while (in_len) {
        /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
        if (len < 2)
            return WIND_ERR_OVERRUN;
        if ((*flags) & WIND_RW_LE) {
            p[0] = (in[0]     ) & 0xff;
            p[1] = (in[0] >> 8) & 0xff;
        } else {
            p[1] = (in[0]     ) & 0xff;
            p[0] = (in[0] >> 8) & 0xff;
        }
        len -= 2;
        in_len--;
        p += 2;
        in++;
    }
    /* len is the unused remainder, so this leaves the bytes written. */
    *out_len -= len;
    return 0;
}


/**
 * Convert a NUL-terminated UTF-8 string to an UCS2 string.  Values of
 * 0x10000 and above are stored as a surrogate pair (two elements).
 *
 * @param in the UTF-8 string to convert.
 * @param out receives the UCS2 elements; it needs room for at least as
 * many elements as wind_utf8ucs2_length() reports.  No terminator is
 * stored.  If out is NULL nothing is stored and only the required
 * number of elements is computed (just like wind_utf8ucs2_length()).
 * @param out_len on entry the number of elements available in out (not
 * read when out is NULL); on success it is set to the number of
 * elements in the converted string.  It is left untouched on failure.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
{
    const unsigned char *cursor = (const unsigned char *)in;
    size_t count = 0;

    while (*cursor != '\0') {
        uint32_t cp;
        size_t units;
        const int rc = utf8toutf32(&cursor, &cp);

        if (rc != 0)
            return rc;

        /* Anything beyond 16 bits needs a high + low surrogate. */
        units = (cp >= 0x10000) ? 2 : 1;

        if (out != NULL) {
            if (count + units > *out_len)
                return WIND_ERR_OVERRUN;

            if (units == 2) {
                const uint32_t offset = cp - 0x10000;

                out[count]     = (uint16_t)(0xd800 | ((offset >> 10) & 0x3ff));
                out[count + 1] = (uint16_t)(0xdc00 | (offset & 0x3ff));
            } else {
                out[count] = (uint16_t)cp;
            }
        }
        count += units;

        /* The decoder leaves cursor on the sequence's last byte. */
        cursor++;
    }

    *out_len = count;
    return 0;
}

/**
 * Calculate the number of UCS2 elements that converting a UTF-8 string
 * produces.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len set to the length of the resulting UCS2 string.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs2_length(const char *in, size_t *out_len)
{
    /* A NULL destination makes the converter count without storing. */
    uint16_t *const count_only = NULL;

    return wind_utf8ucs2(in, count_only, out_len);
}

/**
 * Convert an UCS2 string to a UTF-8 string.  A high surrogate directly
 * followed by a low surrogate is combined into one four byte sequence;
 * an unpaired surrogate is written as a three byte sequence.
 *
 * @param in the UCS2 string to convert.
 * @param in_len the number of elements in the UCS2 string.
 * @param out the resulting UTF-8 string; a terminating NUL is appended,
 * so the buffer must hold one byte more than wind_ucs2utf8_length()
 * reports.  If out is NULL, the function only calculates the needed
 * space (just like wind_ucs2utf8_length()).
 * @param out_len before processing out_len should be the size in bytes
 * of the out buffer (it is not read when out is NULL); after successful
 * processing it is the length of the out string, not counting the NUL.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
{
    size_t produced = 0;
    size_t idx;

    for (idx = 0; idx < in_len; idx++) {
        uint32_t cp = in[idx];
        size_t nbytes;

        if (cp < 0x80) {
            nbytes = 1;
        } else if (cp < 0x800) {
            nbytes = 2;
        } else if (cp >= 0xd800 && cp < 0xdc00 &&
                   idx + 1 < in_len &&
                   in[idx + 1] >= 0xdc00 && in[idx + 1] < 0xe000) {
            /* A surrogate pair: ten bits come from each half. */
            const uint32_t high_ten_bits = cp & 0x3ff;
            const uint32_t low_ten_bits = in[idx + 1] & 0x3ff;

            cp = 0x10000 + ((high_ten_bits << 10) | low_ten_bits);
            nbytes = 4;
            idx++;
        } else {
            /*
             * Everything else in the 16 bit range, including an
             * unpaired high or low surrogate.
             */
            nbytes = 3;
        }

        produced += nbytes;

        if (out != NULL) {
            size_t k;

            /* '>=' rather than '>': one byte must stay free for the NUL. */
            if (produced >= *out_len)
                return WIND_ERR_OVERRUN;

            /* Continuation bytes, last one first, six bits each. */
            for (k = nbytes - 1; k > 0; k--) {
                out[k] = (cp | 0x80) & 0xbf;
                cp >>= 6;
            }
            out[0] = cp | first_char[nbytes - 1];

            out += nbytes;
        }
    }

    if (out != NULL) {
        if (produced >= *out_len)
            return WIND_ERR_OVERRUN;
        *out = '\0';
    }
    *out_len = produced;
    return 0;
}

/**
 * Calculate the number of bytes that converting an UCS2 string to UTF-8
 * produces (the terminating NUL is not counted).
 *
 * @param in the UCS2 string to measure.
 * @param in_len the number of elements in the UCS2 string.
 * @param out_len set to the length of the resulting UTF-8 string.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL destination makes the converter count without storing. */
    char *const count_only = NULL;

    return wind_ucs2utf8(in, in_len, count_only, out_len);
}

Coverage Report

Created: 2026-06-07 07:07

Line	Count	Source
1		/*
2		* Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
3		* (Royal Institute of Technology, Stockholm, Sweden).
4		* All rights reserved.
5		*
6		* Redistribution and use in source and binary forms, with or without
7		* modification, are permitted provided that the following conditions
8		* are met:
9		*
10		* 1. Redistributions of source code must retain the above copyright
11		* notice, this list of conditions and the following disclaimer.
12		*
13		* 2. Redistributions in binary form must reproduce the above copyright
14		* notice, this list of conditions and the following disclaimer in the
15		* documentation and/or other materials provided with the distribution.
16		*
17		* 3. Neither the name of the Institute nor the names of its contributors
18		* may be used to endorse or promote products derived from this software
19		* without specific prior written permission.
20		*
21		* THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22		* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23		* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24		* ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25		* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26		* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27		* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28		* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29		* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30		* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31		* SUCH DAMAGE.
32		*/
33
34		#include <config.h>
35		#include "windlocl.h"
36
37		static int
38		utf8toutf32(const unsigned char *pp, uint32_t out)
39	0	{
40	0	const unsigned char p = pp;
41	0	uint32_t c = *p;
42	0	uint32_t out_val;
43
44	0	if (c & 0x80) {
45	0	if ((c & 0xE0) == 0xC0) {
46	0	const uint32_t c2 = *++p;
47	0	if ((c2 & 0xC0) == 0x80) {
48	0	out_val = ((c & 0x1F) << 6)
49	0	\| (c2 & 0x3F);
50	0	if (out_val < 0x80) {
51	0	return WIND_ERR_INVALID_UTF8;
52	0	}
53	0	} else {
54	0	return WIND_ERR_INVALID_UTF8;
55	0	}
56	0	} else if ((c & 0xF0) == 0xE0) {
57	0	const uint32_t c2 = *++p;
58	0	if ((c2 & 0xC0) == 0x80) {
59	0	const uint32_t c3 = *++p;
60	0	if ((c3 & 0xC0) == 0x80) {
61	0	out_val = ((c & 0x0F) << 12)
62	0	\| ((c2 & 0x3F) << 6)
63	0	\| (c3 & 0x3F);
64	0	if (out_val < 0x800) {
65	0	return WIND_ERR_INVALID_UTF8;
66	0	}
67	0	} else {
68	0	return WIND_ERR_INVALID_UTF8;
69	0	}
70	0	} else {
71	0	return WIND_ERR_INVALID_UTF8;
72	0	}
73	0	} else if ((c & 0xF8) == 0xF0) {
74	0	const uint32_t c2 = *++p;
75	0	if ((c2 & 0xC0) == 0x80) {
76	0	const uint32_t c3 = *++p;
77	0	if ((c3 & 0xC0) == 0x80) {
78	0	const uint32_t c4 = *++p;
79	0	if ((c4 & 0xC0) == 0x80) {
80	0	out_val = ((c & 0x07) << 18)
81	0	\| ((c2 & 0x3F) << 12)
82	0	\| ((c3 & 0x3F) << 6)
83	0	\| (c4 & 0x3F);
84	0	if (out_val < 0x10000) {
85	0	return WIND_ERR_INVALID_UTF8;
86	0	}
87	0	} else {
88	0	return WIND_ERR_INVALID_UTF8;
89	0	}
90	0	} else {
91	0	return WIND_ERR_INVALID_UTF8;
92	0	}
93	0	} else {
94	0	return WIND_ERR_INVALID_UTF8;
95	0	}
96	0	} else {
97	0	return WIND_ERR_INVALID_UTF8;
98	0	}
99	0	} else {
100	0	out_val = c;
101	0	}
102
103		/* Allow unpaired surrogates (in the range 0xd800–0xdfff). */
104
105	0	if (out_val > 0x10ffff) {
106	0	return WIND_ERR_INVALID_UTF8;
107	0	}
108
109	0	*out = out_val;
110	0	*pp = p;
111
112	0	return 0;
113	0	}
114
115		/**
116		* Convert an UTF-8 string to an UCS4 string.
117		*
118		* @param in an UTF-8 string to convert.
119		* @param out the resulting UCS4 string, must be at least
120		* wind_utf8ucs4_length() long. If out is NULL, the function will
121		* calculate the needed space for the out variable (just like
122		* wind_utf8ucs4_length()).
123		* @param out_len before processing out_len should be the length of
124		* the out variable, after processing it will be the length of the out
125		* string.
126		*
127		* @return returns 0 on success, an wind error code otherwise
128		* @ingroup wind
129		*/
130
131		int
132		wind_utf8ucs4(const char in, uint32_t out, size_t *out_len)
133	0	{
134	0	const unsigned char *p;
135	0	size_t o = 0;
136	0	int ret;
137
138	0	for (p = (const unsigned char )in; p != '\0'; ++p) {
139	0	uint32_t u;
140
141	0	ret = utf8toutf32(&p, &u);
142	0	if (ret)
143	0	return ret;
144
145	0	if (out) {
146	0	if (o >= *out_len)
147	0	return WIND_ERR_OVERRUN;
148	0	out[o] = u;
149	0	}
150	0	o++;
151	0	}
152	0	*out_len = o;
153	0	return 0;
154	0	}
155
156		/**
157		* Calculate the length of from converting a UTF-8 string to a UCS4
158		* string.
159		*
160		* @param in an UTF-8 string to convert.
161		* @param out_len the length of the resulting UCS4 string.
162		*
163		* @return returns 0 on success, an wind error code otherwise
164		* @ingroup wind
165		*/
166
167		int
168		wind_utf8ucs4_length(const char in, size_t out_len)
169	0	{
170	0	return wind_utf8ucs4(in, NULL, out_len);
171	0	}
172
173		static const char first_char[4] =
174		{ 0x00, 0xC0, 0xE0, 0xF0 };
175
176		/**
177		* Convert an UCS4 string to a UTF-8 string.
178		*
179		* @param in an UCS4 string to convert.
180		* @param in_len the length input array.
181
182		* @param out the resulting UTF-8 string, must be at least
183		* wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If
184		* out is NULL, the function will calculate the needed space for the
185		* out variable (just like wind_ucs4utf8_length()).
186
187		* @param out_len before processing out_len should be the length of
188		* the out variable, after processing it will be the length of the out
189		* string.
190		*
191		* @return returns 0 on success, an wind error code otherwise
192		* @ingroup wind
193		*/
194
195		int
196		wind_ucs4utf8(const uint32_t in, size_t in_len, char out, size_t *out_len)
197	0	{
198	0	uint32_t ch;
199	0	size_t i, len, o;
200
201	0	for (o = 0, i = 0; i < in_len; i++) {
202	0	ch = in[i];
203
204	0	if (ch < 0x80) {
205	0	len = 1;
206	0	} else if (ch < 0x800) {
207	0	len = 2;
208	0	} else if (ch < 0x10000) {
209	0	len = 3;
210	0	} else if (ch <= 0x10FFFF) {
211	0	len = 4;
212	0	} else
213	0	return WIND_ERR_INVALID_UTF32;
214
215	0	o += len;
216
217	0	if (out) {
218	0	if (o >= *out_len)
219	0	return WIND_ERR_OVERRUN;
220
221	0	switch(len) {
222	0	case 4:
223	0	out[3] = (ch \| 0x80) & 0xbf;
224	0	ch = ch >> 6;
225	0	HEIM_FALLTHROUGH;
226	0	case 3:
227	0	out[2] = (ch \| 0x80) & 0xbf;
228	0	ch = ch >> 6;
229	0	HEIM_FALLTHROUGH;
230	0	case 2:
231	0	out[1] = (ch \| 0x80) & 0xbf;
232	0	ch = ch >> 6;
233	0	HEIM_FALLTHROUGH;
234	0	case 1:
235	0	out[0] = ch \| first_char[len - 1];
236	0	HEIM_FALLTHROUGH;
237	0	default:
238	0	break;
239	0	}
240	0	out += len;
241	0	}
242	0	}
243	0	if (out) {
244	0	if (o + 1 >= *out_len)
245	0	return WIND_ERR_OVERRUN;
246	0	*out = '\0';
247	0	}
248	0	*out_len = o;
249	0	return 0;
250	0	}
251
252		/**
253		* Calculate the length of from converting a UCS4 string to an UTF-8 string.
254		*
255		* @param in an UCS4 string to convert.
256		* @param in_len the length of UCS4 string to convert.
257		* @param out_len the length of the resulting UTF-8 string.
258		*
259		* @return returns 0 on success, an wind error code otherwise
260		* @ingroup wind
261		*/
262
263		int
264		wind_ucs4utf8_length(const uint32_t in, size_t in_len, size_t out_len)
265	0	{
266	0	return wind_ucs4utf8(in, in_len, NULL, out_len);
267	0	}
268
269		/**
270		* Read in an UCS2 from a buffer.
271		*
272		* @param ptr The input buffer to read from.
273		* @param len the length of the input buffer.
274		* @param flags Flags to control the behavior of the function.
275		* @param out the output UCS2, the array must be at least out/2 long.
276		* @param out_len the output length
277		*
278		* @return returns 0 on success, an wind error code otherwise.
279		* @ingroup wind
280		*/
281
282		int
283		wind_ucs2read(const void ptr, size_t len, unsigned int flags,
284		uint16_t out, size_t out_len)
285	0	{
286	0	const unsigned char *p = ptr;
287	0	int little = ((*flags) & WIND_RW_LE);
288	0	size_t olen = *out_len;
289
290		/** if len is zero, flags are unchanged */
291	0	if (len == 0) {
292	0	*out_len = 0;
293	0	return 0;
294	0	}
295
296		/** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
297	0	if (len & 1)
298	0	return WIND_ERR_LENGTH_NOT_MOD2;
299
300		/**
301		* If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
302		* found, check is LE/BE flag is already and use that otherwise
303		* fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
304		* the LE/BE flag and set the resulting LE/BE flag.
305		*/
306	0	if ((*flags) & WIND_RW_BOM) {
307	0	uint16_t bom = (p[0] << 8) + p[1];
308	0	if (bom == 0xfffe \|\| bom == 0xfeff) {
309	0	little = (bom == 0xfffe);
310	0	p += 2;
311	0	len -= 2;
312	0	} else if (((*flags) & (WIND_RW_LE\|WIND_RW_BE)) != 0) {
313		/* little already set */
314	0	} else
315	0	return WIND_ERR_NO_BOM;
316	0	flags = ((flags) & ~(WIND_RW_BOM\|WIND_RW_LE\|WIND_RW_BE));
317	0	*flags \|= little ? WIND_RW_LE : WIND_RW_BE;
318	0	}
319
320	0	while (len) {
321	0	if (olen < 1)
322	0	return WIND_ERR_OVERRUN;
323	0	if (little)
324	0	*out = (p[1] << 8) + p[0];
325	0	else
326	0	*out = (p[0] << 8) + p[1];
327	0	out++; p += 2; len -= 2; olen--;
328	0	}
329	0	*out_len -= olen;
330	0	return 0;
331	0	}
332
333		/**
334		* Write an UCS2 string to a buffer.
335		*
336		* @param in The input UCS2 string.
337		* @param in_len the length of the input buffer.
338		* @param flags Flags to control the behavior of the function.
339		* @param ptr The input buffer to write to, the array must be at least
340		* (in + 1) * 2 bytes long.
341		* @param out_len the output length
342		*
343		* @return returns 0 on success, an wind error code otherwise.
344		* @ingroup wind
345		*/
346
347		int
348		wind_ucs2write(const uint16_t in, size_t in_len, unsigned int flags,
349		void ptr, size_t out_len)
350	0	{
351	0	unsigned char *p = ptr;
352	0	size_t len = *out_len;
353
354		/** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
355	0	if (len & 1)
356	0	return WIND_ERR_LENGTH_NOT_MOD2;
357
358		/** On zero input length, flags are preserved */
359	0	if (in_len == 0) {
360	0	*out_len = 0;
361	0	return 0;
362	0	}
363		/** If flags have WIND_RW_BOM set, the byte order mark is written
364		* first to the output data */
365	0	if ((*flags) & WIND_RW_BOM) {
366	0	uint16_t bom = 0xfffe;
367
368	0	if (len < 2)
369	0	return WIND_ERR_OVERRUN;
370
371	0	if ((*flags) & WIND_RW_LE) {
372	0	p[0] = (bom ) & 0xff;
373	0	p[1] = (bom >> 8) & 0xff;
374	0	} else {
375	0	p[1] = (bom ) & 0xff;
376	0	p[0] = (bom >> 8) & 0xff;
377	0	}
378	0	len -= 2;
379	0	}
380
381	0	while (in_len) {
382		/** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
383	0	if (len < 2)
384	0	return WIND_ERR_OVERRUN;
385	0	if ((*flags) & WIND_RW_LE) {
386	0	p[0] = (in[0] ) & 0xff;
387	0	p[1] = (in[0] >> 8) & 0xff;
388	0	} else {
389	0	p[1] = (in[0] ) & 0xff;
390	0	p[0] = (in[0] >> 8) & 0xff;
391	0	}
392	0	len -= 2;
393	0	in_len--;
394	0	p += 2;
395	0	in++;
396	0	}
397	0	*out_len -= len;
398	0	return 0;
399	0	}
400
401
402		/**
403		* Convert an UTF-8 string to an UCS2 string.
404		*
405		* @param in an UTF-8 string to convert.
406		* @param out the resulting UCS2 string, must be at least
407		* wind_utf8ucs2_length() long. If out is NULL, the function will
408		* calculate the needed space for the out variable (just like
409		* wind_utf8ucs2_length()).
410		* @param out_len before processing out_len should be the length of
411		* the out variable, after processing it will be the length of the out
412		* string.
413		*
414		* @return returns 0 on success, an wind error code otherwise
415		* @ingroup wind
416		*/
417
418		int
419		wind_utf8ucs2(const char in, uint16_t out, size_t *out_len)
420	0	{
421	0	const unsigned char *p;
422	0	size_t o = 0;
423	0	int ret;
424
425	0	for (p = (const unsigned char )in; p != '\0'; ++p) {
426	0	uint32_t u;
427
428	0	ret = utf8toutf32(&p, &u);
429	0	if (ret)
430	0	return ret;
431
432	0	if (u >= 0x10000) {
433	0	if (out) {
434	0	uint16_t high_ten_bits;
435	0	uint16_t low_ten_bits;
436
437	0	if (o + 2 > *out_len)
438	0	return WIND_ERR_OVERRUN;
439
440	0	u -= 0x10000;
441	0	high_ten_bits = (u >> 10) & 0x3ff;
442	0	low_ten_bits = u & 0x3ff;
443
444	0	out[o] = 0xd800 \| high_ten_bits;
445	0	out[o+1] = 0xdc00 \| low_ten_bits;
446	0	}
447	0	o += 2;
448	0	} else {
449	0	if (out) {
450	0	if (o >= *out_len)
451	0	return WIND_ERR_OVERRUN;
452	0	out[o] = u;
453	0	}
454	0	o++;
455	0	}
456	0	}
457	0	*out_len = o;
458	0	return 0;
459	0	}
460
461		/**
462		* Calculate the length of from converting a UTF-8 string to a UCS2
463		* string.
464		*
465		* @param in an UTF-8 string to convert.
466		* @param out_len the length of the resulting UCS2 string.
467		*
468		* @return returns 0 on success, an wind error code otherwise
469		* @ingroup wind
470		*/
471
472		int
473		wind_utf8ucs2_length(const char in, size_t out_len)
474	0	{
475	0	return wind_utf8ucs2(in, NULL, out_len);
476	0	}
477
478		/**
479		* Convert an UCS2 string to a UTF-8 string.
480		*
481		* @param in an UCS2 string to convert.
482		* @param in_len the length of the in UCS2 string.
483		* @param out the resulting UTF-8 string, must be at least
484		* wind_ucs2utf8_length() long. If out is NULL, the function will
485		* calculate the needed space for the out variable (just like
486		* wind_ucs2utf8_length()).
487		* @param out_len before processing out_len should be the length of
488		* the out variable, after processing it will be the length of the out
489		* string.
490		*
491		* @return returns 0 on success, an wind error code otherwise
492		* @ingroup wind
493		*/
494
495		int
496		wind_ucs2utf8(const uint16_t in, size_t in_len, char out, size_t *out_len)
497	0	{
498	0	uint32_t ch;
499	0	size_t i, len, o;
500
501	0	for (o = 0, i = 0; i < in_len; i++) {
502	0	ch = in[i];
503
504	0	if (ch < 0x80) {
505	0	len = 1;
506	0	} else if (ch < 0x800) {
507	0	len = 2;
508	0	} else if (ch < 0xd800 \|\| ch >= 0xe000) {
509	0	len = 3;
510	0	} else if (ch < 0xdc00) {
511		/* A high surrogate. */
512	0	if (i < in_len - 1) {
513	0	uint16_t ch2 = in[i + 1];
514
515	0	if (ch2 >= 0xdc00 && ch2 < 0xe000) {
516	0	uint16_t high_ten_bits;
517	0	uint16_t low_ten_bits;
518
519		/* A surrogate pair. */
520	0	high_ten_bits = ch & 0x3ff;
521	0	low_ten_bits = ch2 & 0x3ff;
522
523	0	ch = 0x10000 + ((uint32_t)high_ten_bits << 10 \| low_ten_bits);
524	0	len = 4;
525	0	++i;
526	0	} else {
527		/* An unpaired high surrogate. */
528	0	len = 3;
529	0	}
530	0	} else {
531		/* An unpaired high surrogate. */
532	0	len = 3;
533	0	}
534	0	} else {
535		/* An unpaired low surrogate. */
536	0	len = 3;
537	0	}
538
539	0	o += len;
540
541	0	if (out) {
542	0	if (o >= *out_len)
543	0	return WIND_ERR_OVERRUN;
544
545	0	switch(len) {
546	0	case 4:
547	0	out[3] = (ch \| 0x80) & 0xbf;
548	0	ch = ch >> 6;
549	0	HEIM_FALLTHROUGH;
550	0	case 3:
551	0	out[2] = (ch \| 0x80) & 0xbf;
552	0	ch = ch >> 6;
553	0	HEIM_FALLTHROUGH;
554	0	case 2:
555	0	out[1] = (ch \| 0x80) & 0xbf;
556	0	ch = ch >> 6;
557	0	HEIM_FALLTHROUGH;
558	0	case 1:
559	0	out[0] = ch \| first_char[len - 1];
560	0	HEIM_FALLTHROUGH;
561	0	default:
562	0	break;
563	0	}
564	0	out += len;
565	0	}
566	0	}
567	0	if (out) {
568	0	if (o >= *out_len)
569	0	return WIND_ERR_OVERRUN;
570	0	*out = '\0';
571	0	}
572	0	*out_len = o;
573	0	return 0;
574	0	}
575
576		/**
577		* Calculate the length of from converting a UCS2 string to an UTF-8 string.
578		*
579		* @param in an UCS2 string to convert.
580		* @param in_len an UCS2 string length to convert.
581		* @param out_len the length of the resulting UTF-8 string.
582		*
583		* @return returns 0 on success, an wind error code otherwise
584		* @ingroup wind
585		*/
586
587		int
588		wind_ucs2utf8_length(const uint16_t in, size_t in_len, size_t out_len)
589	0	{
590		return wind_ucs2utf8(in, in_len, NULL, out_len);
591	0	}