/src/cryptsetup/lib/crypto_backend/utf8.c

Source
// SPDX-License-Identifier: LGPL-2.1-or-later
/*
 * UTF8/16 helpers, copied and adapted from systemd project.
 *
 * Copyright (C) 2010 Lennart Poettering
 *
 * cryptsetup related changes
 * Copyright (C) 2021-2025 Vojtech Trefny

 * Parts of the original systemd implementation are based on the GLIB utf8
 * validation functions.
 * gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 */

#include <errno.h>
#include <endian.h>

#include "crypto_backend.h"

/* Tell whether a 16-bit code unit is a surrogate of either kind (lead or trail). */
static inline bool utf16_is_surrogate(char16_t c)
{
  /* Anything outside the closed range 0xd800..0xdfff is an ordinary code unit. */
  const bool below_range = c < 0xd800U;
  const bool above_range = c > 0xdfffU;

  return !(below_range || above_range);
}

/* Tell whether a 16-bit code unit is a trailing (low) surrogate, 0xdc00..0xdfff. */
static inline bool utf16_is_trailing_surrogate(char16_t c)
{
  /* Unsigned subtraction wraps for values below 0xdc00, so a single
   * comparison covers both ends of the range. */
  const unsigned int offset = (unsigned int) c - 0xdc00U;

  return offset < 0x400U;
}

/* Combine a lead/trail surrogate pair into the code point it represents.
 * The caller is expected to have checked that both units are surrogates
 * of the right kind; no validation happens here. */
static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail)
{
  const char32_t upper_ten = (char32_t) lead - 0xd800U;
  const char32_t lower_ten = (char32_t) trail - 0xdc00U;

  return (upper_ten << 10) + lower_ten + 0x10000U;
}

/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 form of @g into @out_utf8 without appending a terminating
 * zero. With a NULL @out_utf8 nothing is written and only the length is
 * computed.
 *
 * Returns: The number of bytes (1-4) the UTF-8 representation does or would
 *          occupy, or 0 when @g is 0x200000 or larger and cannot be encoded
 *          in four bytes.
 */
static size_t utf8_encode_unichar(char *out_utf8, char32_t g)
{
  /* Indexed by sequence length: marker bits and payload mask of the first byte. */
  static const unsigned char first_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
  static const unsigned char first_mask[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
  size_t nbytes, pos;
  unsigned int shift;

  if (g < 0x80U)
    nbytes = 1;
  else if (g < 0x800U)
    nbytes = 2;
  else if (g < 0x10000U)
    nbytes = 3;
  else if (g < 0x200000U)
    nbytes = 4;
  else
    return 0;

  if (!out_utf8)
    return nbytes;

  /* The first byte carries the topmost payload bits, every following
   * byte carries the next six bits behind a 10xxxxxx marker. */
  shift = 6 * (unsigned int) (nbytes - 1);
  out_utf8[0] = first_marker[nbytes] | ((g >> shift) & first_mask[nbytes]);

  for (pos = 1; pos < nbytes; pos++) {
    shift -= 6;
    out_utf8[pos] = 0x80 | ((g >> shift) & 0x3f);
  }

  return nbytes;
}

/**
 * crypt_utf16_to_utf8()
 * @out: address of the pointer to the output buffer; the buffer should be
 *       2 * @length + 1 bytes long
 * @s: UTF16LE string to convert
 * @length: size of @s in bytes (not in 16-bit words)
 *
 * Transcodes a UTF16LE encoded string into a zero-terminated UTF8 string
 * written to the buffer *@out points to. Unpaired surrogates are skipped, a
 * lead surrogate with no room left for its partner ends the conversion, and
 * an odd trailing input byte is never read.
 *
 * Returns: 0 on success, -EOVERFLOW if 2 * @length does not fit into size_t
 */
int crypt_utf16_to_utf8(char **out, const char16_t *s, size_t length /* bytes! */)
{
  const uint8_t *in, *end;
  char *dst;

  assert(s);
  assert(out);
  assert(*out);

  /* The input is measured in bytes and the smallest character needs two of
   * them, while a character may expand to up to 4 bytes of UTF-8. Refuse
   * lengths whose doubled value would wrap around. */
  if (length * 2 < length)
    return -EOVERFLOW; /* overflow */

  in = (const uint8_t*) s;
  end = in + length;
  dst = *out;

  /* Code units are assembled byte by byte (low byte first) as described in
   * RFC 2781 section 2.2. */
  while (in + 1 < end) {
    char16_t lead, trail;

    lead = in[1] << 8 | in[0];
    in += 2;

    if (utf16_is_surrogate(lead)) {
      /* spurious trailing surrogate, ignore */
      if (utf16_is_trailing_surrogate(lead))
        continue;

      /* lead surrogate, but no complete code unit follows */
      if (in + 1 >= end)
        break;

      trail = in[1] << 8 | in[0];

      /* surrogate missing its trailing surrogate, ignore; the peeked unit
       * is left unconsumed so the next round handles it on its own */
      if (!utf16_is_trailing_surrogate(trail))
        continue;

      in += 2;
      dst += utf8_encode_unichar(dst, utf16_surrogate_pair_to_unichar(lead, trail));
    } else {
      dst += utf8_encode_unichar(dst, lead);
    }
  }

  *dst = 0;
  return 0;
}

/* Number of bytes a UTF-8 sequence starting with lead byte @c is expected to
 * occupy (1-6), or 0 when @c cannot start a sequence (a continuation byte
 * 10xxxxxx, or 0xfe/0xff). */
static size_t utf8_encoded_expected_len(uint8_t c)
{
  size_t leading_ones = 0;
  unsigned int bit;

  /* The count of consecutive set bits at the top of the lead byte gives
   * the sequence length. */
  for (bit = 0x80; bit != 0 && (c & bit) != 0; bit >>= 1)
    leading_ones++;

  if (leading_ones == 0)
    return 1; /* plain 7-bit character */

  if (leading_ones == 1 || leading_ones > 6)
    return 0; /* not a valid lead byte */

  return leading_ones;
}

/* Decode the single UTF-8 sequence starting at @str into *@ret_unichar.
 *
 * The sequence length is taken from the lead byte; every following byte must
 * have the 10xxxxxx continuation form. Returns 0 on success, or -EINVAL when
 * the lead byte is not valid or a continuation byte is malformed, in which
 * case *@ret_unichar is left untouched. Only the bit layout is checked; the
 * decoded value itself is not validated. */
static int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar)
{
  /* Payload mask of the lead byte, indexed by sequence length. */
  static const unsigned char lead_payload[] = { 0x00, 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  char32_t value;
  size_t nbytes, pos;

  assert(str);

  nbytes = utf8_encoded_expected_len(str[0]);

  if (nbytes == 1) {
    *ret_unichar = (char32_t)str[0];
    return 0;
  }

  if (nbytes < 2 || nbytes > 6)
    return -EINVAL;

  value = (char32_t)str[0] & lead_payload[nbytes];

  for (pos = 1; pos < nbytes; pos++) {
    const char32_t byte = (char32_t)str[pos];

    if ((byte & 0xc0) != 0x80)
      return -EINVAL;

    /* append the six payload bits of this continuation byte */
    value = (value << 6) | (byte & 0x3f);
  }

  *ret_unichar = value;

  return 0;
}

/* Encode code point @c as little-endian UTF-16 into @out.
 *
 * Returns the number of 16-bit words written: 1 for a character of the basic
 * plane, 2 for a surrogate pair, or 0 (nothing written) when @c lies in the
 * surrogate range 0xd800..0xdfff or above 0x10ffff. */
static size_t utf16_encode_unichar(char16_t *out, char32_t c)
{
  /* Note that this encodes as little-endian. */

  if (c > 0x10ffffU)
    return 0; /* beyond the range UTF-16 can express */

  if (c >= 0xd800U && c <= 0xdfffU)
    return 0; /* A surrogate (invalid) */

  if (c <= 0xffffU) {
    out[0] = htole16(c);
    return 1;
  }

  /* Supplementary plane: split the 20-bit offset into a surrogate pair. */
  /* coverity[overflow_const:FALSE] */
  c -= 0x10000U;
  out[0] = htole16((c >> 10) + 0xd800U);
  out[1] = htole16((c & 0x3ffU) + 0xdc00U);
  return 2;
}

/**
 * crypt_utf8_to_utf16()
 * @out: address of the pointer to the output buffer; the buffer should hold
 *       at least @length + 1 16-bit words
 * @s: string to convert
 * @length: length of @s in bytes
 *
 * Converts a UTF8 encoded string to a zero-terminated UTF16LE encoded string
 * written to the buffer *@out points to. Bytes that do not form a decodable
 * multi-byte sequence (invalid lead byte, malformed continuation byte, or a
 * sequence running past @length) are copied one by one, each zero-extended
 * into its own 16-bit word. Decoded values that cannot be represented in
 * UTF-16 produce no output.
 *
 * Returns: 0 on success, negative errno otherwise (the current implementation
 *          has no failure path)
 */
int crypt_utf8_to_utf16(char16_t **out, const char *s, size_t length)
{
  char16_t *p;
  size_t i;
  int r;

  assert(s);
  assert(out);
  assert(*out);

  p = *out;

  for (i = 0; i < length;) {
    char32_t unichar;
    size_t e;

    e = utf8_encoded_expected_len(s[i]);
    if (e <= 1) /* Invalid and single byte characters are copied as they are */
      goto copy;

    /* Sequence longer than the remaining input, then copy as-is. Written as
     * a subtraction (i < length holds here) so the check cannot wrap. */
    if (e > length - i)
      goto copy;

    r = utf8_encoded_to_unichar(s + i, &unichar);
    if (r < 0) /* sequence invalid, then copy as-is */
      goto copy;

    p += utf16_encode_unichar(p, unichar);
    i += e;
    continue;

  copy:
    /* Plain char may be signed: without going through uint8_t a byte
     * >= 0x80 would be sign-extended and stored as 0xffXX instead of
     * 0x00XX, making the result differ between platforms. */
    *(p++) = htole16((uint8_t) s[i++]);
  }

  *p = 0;
  return 0;
}

/**
 * crypt_char16_strlen()
 * @s: string to get length of
 *
 * Returns: number of 16-bit words in the string
 */
size_t crypt_char16_strlen(const char16_t *s) {
  size_t n = 0;

  assert(s);

  while (*s != 0)
    n++, s++;

  return n;
}

Coverage Report

Created: 2025-11-25 07:00

Line	Count	Source
1		// SPDX-License-Identifier: LGPL-2.1-or-later
2		/*
3		* UTF8/16 helpers, copied and adapted from systemd project.
4		*
5		* Copyright (C) 2010 Lennart Poettering
6		*
7		* cryptsetup related changes
8		* Copyright (C) 2021-2025 Vojtech Trefny
9
10		* Parts of the original systemd implementation are based on the GLIB utf8
11		* validation functions.
12		* gutf8.c - Operations on UTF-8 strings.
13		*
14		* Copyright (C) 1999 Tom Tromey
15		* Copyright (C) 2000 Red Hat, Inc.
16		*/
17
18		#include <errno.h>
19		#include <endian.h>
20
21		#include "crypto_backend.h"
22
23		static inline bool utf16_is_surrogate(char16_t c)
24	0	{
25	0	return c >= 0xd800U && c <= 0xdfffU;
26	0	}
27
28		static inline bool utf16_is_trailing_surrogate(char16_t c)
29	0	{
30	0	return c >= 0xdc00U && c <= 0xdfffU;
31	0	}
32
33		static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail)
34	0	{
35	0	return ((((char32_t) lead - 0xd800U) << 10) + ((char32_t) trail - 0xdc00U) + 0x10000U);
36	0	}
37
38		/**
39		* utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
40		* @out_utf8: output buffer of at least 4 bytes or NULL
41		* @g: UCS-4 character to encode
42		*
43		* This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
44		* The length of the character is returned. It is not zero-terminated! If the
45		* output buffer is NULL, only the length is returned.
46		*
47		* Returns: The length in bytes that the UTF-8 representation does or would
48		* occupy.
49		*/
50		static size_t utf8_encode_unichar(char *out_utf8, char32_t g)
51	0	{
52	0	if (g < (1 << 7)) {
53	0	if (out_utf8)
54	0	out_utf8[0] = g & 0x7f;
55	0	return 1;
56	0	} else if (g < (1 << 11)) {
57	0	if (out_utf8) {
58	0	out_utf8[0] = 0xc0 \| ((g >> 6) & 0x1f);
59	0	out_utf8[1] = 0x80 \| (g & 0x3f);
60	0	}
61	0	return 2;
62	0	} else if (g < (1 << 16)) {
63	0	if (out_utf8) {
64	0	out_utf8[0] = 0xe0 \| ((g >> 12) & 0x0f);
65	0	out_utf8[1] = 0x80 \| ((g >> 6) & 0x3f);
66	0	out_utf8[2] = 0x80 \| (g & 0x3f);
67	0	}
68	0	return 3;
69	0	} else if (g < (1 << 21)) {
70	0	if (out_utf8) {
71	0	out_utf8[0] = 0xf0 \| ((g >> 18) & 0x07);
72	0	out_utf8[1] = 0x80 \| ((g >> 12) & 0x3f);
73	0	out_utf8[2] = 0x80 \| ((g >> 6) & 0x3f);
74	0	out_utf8[3] = 0x80 \| (g & 0x3f);
75	0	}
76	0	return 4;
77	0	}
78
79	0	return 0;
80	0	}
81
82		/**
83		* crypt_utf16_to_utf8()
84		* @out: output buffer, should be 2 * @length + 1 long
85		* @s: string to convert
86		* @length: length of @s in bytes
87		*
88		* Converts a UTF16LE encoded string to a UTF8 encoded string.
89		*
90		* Returns: 0 on success, negative errno otherwise
91		*/
92		int crypt_utf16_to_utf8(char **out, const char16_t *s, size_t length /* bytes! */)
93	0	{
94	0	const uint8_t *f;
95	0	char *t;
96
97	0	assert(s);
98	0	assert(out);
99	0	assert(*out);
100
101		/* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may
102		* take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */
103	0	if (length * 2 < length)
104	0	return -EOVERFLOW; /* overflow */
105
106	0	f = (const uint8_t*) s;
107	0	t = *out;
108
109	0	while (f + 1 < (const uint8_t*) s + length) {
110	0	char16_t w1, w2;
111
112		/* see RFC 2781 section 2.2 */
113
114	0	w1 = f[1] << 8 \| f[0];
115	0	f += 2;
116
117	0	if (!utf16_is_surrogate(w1)) {
118	0	t += utf8_encode_unichar(t, w1);
119	0	continue;
120	0	}
121
122	0	if (utf16_is_trailing_surrogate(w1))
123	0	continue; /* spurious trailing surrogate, ignore */
124
125	0	if (f + 1 >= (const uint8_t*) s + length)
126	0	break;
127
128	0	w2 = f[1] << 8 \| f[0];
129	0	f += 2;
130
131	0	if (!utf16_is_trailing_surrogate(w2)) {
132	0	f -= 2;
133	0	continue; /* surrogate missing its trailing surrogate, ignore */
134	0	}
135
136	0	t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
137	0	}
138
139	0	*t = 0;
140	0	return 0;
141	0	}
142
143		/* count of characters used to encode one unicode char */
144		static size_t utf8_encoded_expected_len(uint8_t c)
145	0	{
146	0	if (c < 0x80)
147	0	return 1;
148	0	if ((c & 0xe0) == 0xc0)
149	0	return 2;
150	0	if ((c & 0xf0) == 0xe0)
151	0	return 3;
152	0	if ((c & 0xf8) == 0xf0)
153	0	return 4;
154	0	if ((c & 0xfc) == 0xf8)
155	0	return 5;
156	0	if ((c & 0xfe) == 0xfc)
157	0	return 6;
158
159	0	return 0;
160	0	}
161
162		/* decode one unicode char */
163		static int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar)
164	0	{
165	0	char32_t unichar;
166	0	size_t len, i;
167
168	0	assert(str);
169
170	0	len = utf8_encoded_expected_len(str[0]);
171
172	0	switch (len) {
173	0	case 1:
174	0	*ret_unichar = (char32_t)str[0];
175	0	return 0;
176	0	case 2:
177	0	unichar = str[0] & 0x1f;
178	0	break;
179	0	case 3:
180	0	unichar = (char32_t)str[0] & 0x0f;
181	0	break;
182	0	case 4:
183	0	unichar = (char32_t)str[0] & 0x07;
184	0	break;
185	0	case 5:
186	0	unichar = (char32_t)str[0] & 0x03;
187	0	break;
188	0	case 6:
189	0	unichar = (char32_t)str[0] & 0x01;
190	0	break;
191	0	default:
192	0	return -EINVAL;
193	0	}
194
195	0	for (i = 1; i < len; i++) {
196	0	if (((char32_t)str[i] & 0xc0) != 0x80)
197	0	return -EINVAL;
198
199	0	unichar <<= 6;
200	0	unichar \|= (char32_t)str[i] & 0x3f;
201	0	}
202
203	0	*ret_unichar = unichar;
204
205	0	return 0;
206	0	}
207
208		static size_t utf16_encode_unichar(char16_t *out, char32_t c)
209	0	{
210		/* Note that this encodes as little-endian. */
211
212	0	switch (c) {
213
214	0	case 0 ... 0xd7ffU:
215	0	case 0xe000U ... 0xffffU:
216	0	out[0] = htole16(c);
217	0	return 1;
218
219	0	case 0x10000U ... 0x10ffffU:
220		/* coverity[overflow_const:FALSE] */
221	0	c -= 0x10000U;
222	0	out[0] = htole16((c >> 10) + 0xd800U);
223	0	out[1] = htole16((c & 0x3ffU) + 0xdc00U);
224	0	return 2;
225
226	0	default: /* A surrogate (invalid) */
227	0	return 0;
228	0	}
229	0	}
230
231		/**
232		* crypt_utf8_to_utf16()
233		* @out: output buffer, should be @length + 1 long
234		* @s: string to convert
235		* @length: length of @s in bytes
236		*
237		* Converts a UTF8 encoded string to a UTF16LE encoded string.
238		*
239		* Returns: 0 on success, negative errno otherwise
240		*/
241		int crypt_utf8_to_utf16(char16_t **out, const char *s, size_t length)
242	0	{
243	0	char16_t *p;
244	0	size_t i;
245	0	int r;
246
247	0	assert(s);
248
249	0	p = *out;
250
251	0	for (i = 0; i < length;) {
252	0	char32_t unichar;
253	0	size_t e;
254
255	0	e = utf8_encoded_expected_len(s[i]);
256	0	if (e <= 1) /* Invalid and single byte characters are copied as they are */
257	0	goto copy;
258
259	0	if (i + e > length) /* sequence longer than input buffer, then copy as-is */
260	0	goto copy;
261
262	0	r = utf8_encoded_to_unichar(s + i, &unichar);
263	0	if (r < 0) /* sequence invalid, then copy as-is */
264	0	goto copy;
265
266	0	p += utf16_encode_unichar(p, unichar);
267	0	i += e;
268	0	continue;
269
270	0	copy:
271	0	*(p++) = htole16(s[i++]);
272	0	}
273
274	0	*p = 0;
275	0	return 0;
276	0	}
277
278		/**
279		* crypt_char16_strlen()
280		* @s: string to get length of
281		*
282		* Returns: number of 16-bit words in the string
283		*/
284	0	size_t crypt_char16_strlen(const char16_t *s) {
285	0	size_t n = 0;
286
287	0	assert(s);
288
289	0	while (*s != 0)
290	0	n++, s++;
291
292	0	return n;
293	0	}