Coverage Report

Created: 2025-11-25 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cryptsetup/lib/crypto_backend/utf8.c
Line
Count
Source
1
// SPDX-License-Identifier: LGPL-2.1-or-later
2
/*
3
 * UTF8/16 helpers, copied and adapted from systemd project.
4
 *
5
 * Copyright (C) 2010 Lennart Poettering
6
 *
7
 * cryptsetup related changes
8
 * Copyright (C) 2021-2025 Vojtech Trefny
9
10
 * Parts of the original systemd implementation are based on the GLIB utf8
11
 * validation functions.
12
 * gutf8.c - Operations on UTF-8 strings.
13
 *
14
 * Copyright (C) 1999 Tom Tromey
15
 * Copyright (C) 2000 Red Hat, Inc.
16
 */
17
18
#include <errno.h>
19
#include <endian.h>
20
21
#include "crypto_backend.h"
22
23
/* True when @c is any UTF-16 surrogate code unit (lead or trail, 0xd800-0xdfff). */
static inline bool utf16_is_surrogate(char16_t c)
{
	if (c < 0xd800U)
		return false;

	return c <= 0xdfffU;
}
27
28
/* True when @c is a trailing (low) UTF-16 surrogate, i.e. within 0xdc00-0xdfff. */
static inline bool utf16_is_trailing_surrogate(char16_t c)
{
	if (c < 0xdc00U)
		return false;

	return c <= 0xdfffU;
}
32
33
/*
 * Combine a lead/trail surrogate pair into the code point it encodes.
 * No validation is done here; callers check the surrogate ranges first.
 */
static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail)
{
	const char32_t high_bits = ((char32_t) lead - 0xd800U) << 10;
	const char32_t low_bits = (char32_t) trail - 0xdc00U;

	return high_bits + low_bits + 0x10000U;
}
37
38
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 form of @g into @out_utf8 without a terminating NUL.
 * When @out_utf8 is NULL nothing is written and only the length is computed.
 * Values of 0x200000 and above do not fit into four bytes; for those nothing
 * is written and 0 is returned.
 *
 * Returns: The length in bytes that the UTF-8 representation does or would
 *          occupy (1-4), or 0 if @g cannot be encoded.
 */
static size_t utf8_encode_unichar(char *out_utf8, char32_t g)
{
	size_t nbytes;

	/* Classify by magnitude first: 7, 11, 16 and 21 payload bits. */
	if (g < 0x80U)
		nbytes = 1;
	else if (g < 0x800U)
		nbytes = 2;
	else if (g < 0x10000U)
		nbytes = 3;
	else if (g < 0x200000U)
		nbytes = 4;
	else
		return 0;

	/* Length-only query. */
	if (!out_utf8)
		return nbytes;

	switch (nbytes) {
	case 1:
		out_utf8[0] = g & 0x7f;
		break;
	case 2:
		out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
		out_utf8[1] = 0x80 | (g & 0x3f);
		break;
	case 3:
		out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
		out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
		out_utf8[2] = 0x80 | (g & 0x3f);
		break;
	default: /* 4 */
		out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
		out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
		out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
		out_utf8[3] = 0x80 | (g & 0x3f);
		break;
	}

	return nbytes;
}
81
82
/**
 * crypt_utf16_to_utf8()
 * @out: output buffer, should be 2 * @length + 1 long
 * @s: string to convert
 * @length: length of @s in bytes
 *
 * Converts a UTF16LE encoded string to a UTF8 encoded string. The result is
 * written to the buffer *@out points to and is NUL-terminated; *@out itself
 * is not modified. Stray trailing surrogates and lead surrogates that are not
 * followed by a trailing surrogate are dropped from the output.
 *
 * Returns: 0 on success, negative errno otherwise
 */
int crypt_utf16_to_utf8(char **out, const char16_t *s, size_t length /* bytes! */)
{
	const uint8_t *pos, *end;
	char *dst;

	assert(s);
	assert(out);
	assert(*out);

	/* The documented output size is 2 * length + 1; refuse lengths where
	 * doubling would wrap around. */
	if (length * 2 < length)
		return -EOVERFLOW; /* overflow */

	pos = (const uint8_t *) s;
	end = pos + length;
	dst = *out;

	/* Consume the input one complete 16-bit unit at a time; an odd
	 * trailing byte is left unread. */
	while (pos + 1 < end) {
		char16_t unit, trail;

		/* see RFC 2781 section 2.2 */

		unit = pos[1] << 8 | pos[0];
		pos += 2;

		if (!utf16_is_surrogate(unit)) {
			dst += utf8_encode_unichar(dst, unit);
			continue;
		}

		if (utf16_is_trailing_surrogate(unit))
			continue; /* spurious trailing surrogate, ignore */

		/* Lead surrogate as the last unit: nothing to pair it with. */
		if (pos + 1 >= end)
			break;

		/* Peek at the following unit; only consume it when it really
		 * completes the pair, otherwise it is decoded on its own in
		 * the next iteration. */
		trail = pos[1] << 8 | pos[0];
		if (!utf16_is_trailing_surrogate(trail))
			continue; /* surrogate missing its trailing surrogate, ignore */

		pos += 2;
		dst += utf8_encode_unichar(dst, utf16_surrogate_pair_to_unichar(unit, trail));
	}

	*dst = 0;
	return 0;
}
142
143
/*
 * Number of bytes a UTF-8 sequence starting with lead byte @c is expected to
 * occupy (1-6, including the legacy 5 and 6 byte forms). Returns 0 when @c
 * cannot start a sequence (continuation byte 10xxxxxx, 0xfe or 0xff).
 */
static size_t utf8_encoded_expected_len(uint8_t c)
{
	size_t leading_ones = 0;
	uint8_t bit;

	/* The count of leading 1 bits in the first byte encodes the length. */
	for (bit = 0x80; bit != 0 && (c & bit) != 0; bit >>= 1)
		leading_ones++;

	if (leading_ones == 0)
		return 1; /* plain ASCII */

	if (leading_ones == 1 || leading_ones > 6)
		return 0; /* continuation byte or invalid lead byte */

	return leading_ones;
}
161
162
/*
 * Decode the single UTF-8 sequence starting at @str into *@ret_unichar.
 *
 * The sequence length is taken from the lead byte, so the caller must make
 * sure that many bytes are readable. Returns 0 on success, or -EINVAL when
 * the lead byte is invalid or a following byte is not a 10xxxxxx continuation
 * byte; *@ret_unichar is left untouched in that case. No check is made here
 * for overlong forms or for values beyond the Unicode range.
 */
static int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar)
{
	/* Payload bits carried by the lead byte, indexed by sequence length. */
	static const uint8_t lead_payload_mask[] = {
		0x00, 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01
	};
	char32_t value;
	size_t seq_len, pos;

	assert(str);

	seq_len = utf8_encoded_expected_len(str[0]);

	if (seq_len < 1 || seq_len > 6)
		return -EINVAL;

	if (seq_len == 1) {
		*ret_unichar = (char32_t)str[0];
		return 0;
	}

	value = (char32_t)str[0] & lead_payload_mask[seq_len];

	/* Every continuation byte contributes its low six bits. */
	for (pos = 1; pos < seq_len; pos++) {
		const char32_t byte = (char32_t)str[pos];

		if ((byte & 0xc0) != 0x80)
			return -EINVAL;

		value = (value << 6) | (byte & 0x3f);
	}

	*ret_unichar = value;

	return 0;
}
207
208
/*
 * Encode code point @c as UTF-16 into @out, stored little-endian.
 *
 * Returns the number of 16-bit units written: 1 for code points outside the
 * surrogate range up to 0xffff, 2 (a surrogate pair) for 0x10000-0x10ffff,
 * and 0 with nothing written for surrogate values and anything above
 * 0x10ffff.
 */
static size_t utf16_encode_unichar(char16_t *out, char32_t c)
{
	/* Note that this encodes as little-endian. */

	if (c <= 0xd7ffU || (c >= 0xe000U && c <= 0xffffU)) {
		out[0] = htole16(c);
		return 1;
	}

	if (c >= 0x10000U && c <= 0x10ffffU) {
		/* coverity[overflow_const:FALSE] */
		c -= 0x10000U;
		out[0] = htole16((c >> 10) + 0xd800U);
		out[1] = htole16((c & 0x3ffU) + 0xdc00U);
		return 2;
	}

	/* A surrogate or a value beyond the Unicode range (invalid) */
	return 0;
}
230
231
/**
 * crypt_utf8_to_utf16()
 * @out: output buffer, should be @length + 1 long (counted in 16-bit units)
 * @s: string to convert
 * @length: length of @s in bytes
 *
 * Converts a UTF8 encoded string to a UTF16LE encoded string. The result is
 * written to the buffer *@out points to and is terminated with a zero unit;
 * *@out itself is not modified.
 *
 * Bytes that do not form a decodable multi-byte sequence (invalid lead byte,
 * sequence cut off by the end of the input, bad continuation byte) are copied
 * one by one, each as a 16-bit unit holding the byte value 0x00-0xff.
 * Sequences that decode to a surrogate or to a value above 0x10ffff are
 * consumed without producing output.
 *
 * Returns: 0 on success, negative errno otherwise
 */
int crypt_utf8_to_utf16(char16_t **out, const char *s, size_t length)
{
	char16_t *p;
	size_t i;

	assert(s);
	assert(out);
	assert(*out);

	p = *out;

	for (i = 0; i < length;) {
		char32_t unichar;
		size_t e;

		e = utf8_encoded_expected_len(s[i]);

		/* Decode only real multi-byte sequences that fit entirely into
		 * the remaining input; "length - i" cannot wrap since i < length. */
		if (e > 1 && e <= length - i &&
		    utf8_encoded_to_unichar(s + i, &unichar) >= 0) {
			p += utf16_encode_unichar(p, unichar);
			i += e;
			continue;
		}

		/* Invalid and single byte characters are copied as they are.
		 * Go through unsigned char first: where plain char is signed,
		 * a byte >= 0x80 would otherwise be sign-extended to 0xffXX. */
		*(p++) = htole16((unsigned char) s[i++]);
	}

	*p = 0;
	return 0;
}
277
278
/**
 * crypt_char16_strlen()
 * @s: string to get length of
 *
 * Scans @s up to, but not including, the first zero unit.
 *
 * Returns: number of 16-bit words in the string
 */
size_t crypt_char16_strlen(const char16_t *s) {
	const char16_t *end;

	assert(s);

	for (end = s; *end != 0; end++)
		;

	return (size_t) (end - s);
}
}