Coverage Report

Created: 2025-07-23 06:43

/src/libidn2/lib/decode.c
Line
Count
Source (jump to first uncovered line)
1
/* decode.c - implementation of IDNA2008 decoding functions
2
   Copyright (C) 2011-2025 Simon Josefsson
3
4
   Libidn2 is free software: you can redistribute it and/or modify it
5
   under the terms of either:
6
7
     * the GNU Lesser General Public License as published by the Free
8
       Software Foundation; either version 3 of the License, or (at
9
       your option) any later version.
10
11
   or
12
13
     * the GNU General Public License as published by the Free
14
       Software Foundation; either version 2 of the License, or (at
15
       your option) any later version.
16
17
   or both in parallel, as here.
18
19
   This program is distributed in the hope that it will be useful,
20
   but WITHOUT ANY WARRANTY; without even the implied warranty of
21
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
   GNU General Public License for more details.
23
24
   You should have received copies of the GNU General Public License and
25
   the GNU Lesser General Public License along with this program.  If
26
   not, see <http://www.gnu.org/licenses/>.
27
*/
28
29
#include <config.h>
30
31
#include "idn2.h"
32
33
#include <errno.h>    /* errno */
34
#include <stdlib.h>   /* malloc, free */
35
36
#include <unitypes.h>
37
#include <uniconv.h>    /* u8_strconv_from_locale */
38
#include <unistr.h>   /* u8_to_u32, u32_cpy, ... */
39
40
/**
41
 * idn2_to_unicode_8z4z:
42
 * @input: Input zero-terminated UTF-8 string.
43
 * @output: Newly allocated UTF-32/UCS-4 output string.
44
 * @flags: Currently unused.
45
 *
46
 * Converts a possibly ACE encoded domain name in UTF-8 format into a
47
 * UTF-32 string (punycode decoding). The output buffer will be zero-terminated
48
 * and must be deallocated by the caller.
49
 *
50
 * @output may be NULL to test lookup of @input without allocating memory.
51
 *
52
 * Returns:
53
 *   %IDN2_OK: The conversion was successful.
54
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
55
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
56
 *   %IDN2_ENCODING_ERROR: Character conversion failed.
57
 *   %IDN2_MALLOC: Memory allocation failed.
58
 *
59
 * Since: 2.0.0
60
 **/
61
int
62
idn2_to_unicode_8z4z (const char *input, uint32_t **output,
63
          G_GNUC_UNUSED int flags)
64
3.26k
{
65
3.26k
  uint32_t *domain_u32;
66
3.26k
  int rc;
67
68
3.26k
  if (!input)
69
0
    {
70
0
      if (output)
71
0
  *output = NULL;
72
0
      return IDN2_OK;
73
0
    }
74
75
  /* split into labels and check */
76
3.26k
  uint32_t out_u32[IDN2_DOMAIN_MAX_LENGTH + 1];
77
3.26k
  size_t out_len = 0;
78
3.26k
  const char *e, *s;
79
80
12.5k
  for (e = s = input; *e; s = e)
81
10.8k
    {
82
10.8k
      uint32_t label_u32[IDN2_LABEL_MAX_LENGTH];
83
10.8k
      size_t label_len = IDN2_LABEL_MAX_LENGTH;
84
85
26.8M
      while (*e && *e != '.')
86
26.8M
  e++;
87
88
10.8k
      if (e - s >= 4 && (s[0] == 'x' || s[0] == 'X')
89
10.8k
    && (s[1] == 'n' || s[1] == 'N') && s[2] == '-' && s[3] == '-')
90
2.82k
  {
91
2.82k
    s += 4;
92
93
2.82k
    rc = idn2_punycode_decode ((char *) s, e - s,
94
2.82k
             label_u32, &label_len);
95
2.82k
    if (rc)
96
1.21k
      return rc;
97
98
1.61k
    if (out_len + label_len + (*e == '.') > IDN2_DOMAIN_MAX_LENGTH)
99
2
      return IDN2_TOO_BIG_DOMAIN;
100
101
1.61k
    u32_cpy (out_u32 + out_len, label_u32, label_len);
102
1.61k
  }
103
8.04k
      else
104
8.04k
  {
105
    /* convert UTF-8 input to UTF-32 */
106
8.04k
    if (!
107
8.04k
        (domain_u32 =
108
8.04k
         u8_to_u32 ((uint8_t *) s, e - s, NULL, &label_len)))
109
279
      {
110
279
        if (errno == ENOMEM)
111
0
    return IDN2_MALLOC;
112
279
        return IDN2_ENCODING_ERROR;
113
279
      }
114
115
7.76k
    if (label_len > IDN2_LABEL_MAX_LENGTH)
116
41
      {
117
41
        free (domain_u32);
118
41
        return IDN2_TOO_BIG_LABEL;
119
41
      }
120
121
7.72k
    if (out_len + label_len + (*e == '.') > IDN2_DOMAIN_MAX_LENGTH)
122
4
      {
123
4
        free (domain_u32);
124
4
        return IDN2_TOO_BIG_DOMAIN;
125
4
      }
126
127
7.71k
    u32_cpy (out_u32 + out_len, domain_u32, label_len);
128
7.71k
    free (domain_u32);
129
7.71k
  }
130
131
9.33k
      out_len += label_len;
132
9.33k
      if (*e)
133
7.69k
  {
134
7.69k
    out_u32[out_len++] = '.';
135
7.69k
    e++;
136
7.69k
  }
137
9.33k
    }
138
139
1.72k
  if (output)
140
1.72k
    {
141
1.72k
      uint32_t *_out;
142
143
1.72k
      out_u32[out_len] = 0;
144
145
1.72k
      _out = u32_cpy_alloc (out_u32, out_len + 1);
146
1.72k
      if (!_out)
147
0
  {
148
0
    if (errno == ENOMEM)
149
0
      return IDN2_MALLOC;
150
0
    return IDN2_ENCODING_ERROR;
151
0
  }
152
153
1.72k
      *output = _out;
154
1.72k
    }
155
156
1.72k
  return IDN2_OK;
157
1.72k
}
158
159
/**
160
 * idn2_to_unicode_4z4z:
161
 * @input: Input zero-terminated UTF-32 string.
162
 * @output: Newly allocated UTF-32 output string.
163
 * @flags: Currently unused.
164
 *
165
 * Converts a possibly ACE encoded domain name in UTF-32 format into a
166
 * UTF-32 string (punycode decoding). The output buffer will be zero-terminated
167
 * and must be deallocated by the caller.
168
 *
169
 * @output may be NULL to test lookup of @input without allocating memory.
170
 *
171
 * Returns:
172
 *   %IDN2_OK: The conversion was successful.
173
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
174
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
175
 *   %IDN2_ENCODING_ERROR: Character conversion failed.
176
 *   %IDN2_MALLOC: Memory allocation failed.
177
 *
178
 * Since: 2.0.0
179
 **/
180
int
181
idn2_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
182
0
{
183
0
  uint8_t *input_u8;
184
0
  uint32_t *output_u32;
185
0
  size_t length;
186
0
  int rc;
187
188
0
  if (!input)
189
0
    {
190
0
      if (output)
191
0
  *output = NULL;
192
0
      return IDN2_OK;
193
0
    }
194
195
0
  input_u8 = u32_to_u8 (input, u32_strlen (input) + 1, NULL, &length);
196
0
  if (!input_u8)
197
0
    {
198
0
      if (errno == ENOMEM)
199
0
  return IDN2_MALLOC;
200
0
      return IDN2_ENCODING_ERROR;
201
0
    }
202
203
0
  rc = idn2_to_unicode_8z4z ((char *) input_u8, &output_u32, flags);
204
0
  free (input_u8);
205
206
0
  if (rc == IDN2_OK)
207
0
    {
208
0
      if (output)
209
0
  *output = output_u32;
210
0
      else
211
0
  free (output_u32);
212
0
    }
213
214
0
  return rc;
215
0
}
216
217
/**
218
 * idn2_to_unicode_44i:
219
 * @in: Input array with UTF-32 code points.
220
 * @inlen: number of code points of input array
221
 * @out: output array with UTF-32 code points.
222
 * @outlen: on input, maximum size of output array with UTF-32 code points,
223
 *          on exit, actual size of output array with UTF-32 code points.
224
 * @flags: Currently unused.
225
 *
226
 * The ToUnicode operation takes a sequence of UTF-32 code points
227
 * that make up one domain label and returns a sequence of UTF-32
228
 * code points. If the input sequence is a label in ACE form, then the
229
 * result is an equivalent internationalized label that is not in ACE
230
 * form, otherwise the original sequence is returned unaltered.
231
 *
232
 * @output may be NULL to test lookup of @input without allocating memory.
233
 *
234
 * Returns:
235
 *   %IDN2_OK: The conversion was successful.
236
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
237
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
238
 *   %IDN2_ENCODING_ERROR: Character conversion failed.
239
 *   %IDN2_MALLOC: Memory allocation failed.
240
 *
241
 * Since: 2.0.0
242
 **/
243
int
244
idn2_to_unicode_44i (const uint32_t *in, size_t inlen, uint32_t *out,
245
         size_t *outlen, int flags)
246
0
{
247
0
  uint32_t *input_u32;
248
0
  uint32_t *output_u32;
249
0
  size_t len;
250
0
  int rc;
251
252
0
  if (!in)
253
0
    {
254
0
      if (outlen)
255
0
  *outlen = 0;
256
0
      return IDN2_OK;
257
0
    }
258
259
0
  input_u32 = (uint32_t *) malloc ((inlen + 1) * sizeof (uint32_t));
260
0
  if (!input_u32)
261
0
    return IDN2_MALLOC;
262
263
0
  u32_cpy (input_u32, in, inlen);
264
0
  input_u32[inlen] = 0;
265
266
0
  rc = idn2_to_unicode_4z4z (input_u32, &output_u32, flags);
267
0
  free (input_u32);
268
0
  if (rc != IDN2_OK)
269
0
    return rc;
270
271
0
  len = u32_strlen (output_u32);
272
0
  if (out && outlen)
273
0
    u32_cpy (out, output_u32, len < *outlen ? len : *outlen);
274
0
  free (output_u32);
275
276
0
  if (outlen)
277
0
    *outlen = len;
278
279
0
  return IDN2_OK;
280
0
}
281
282
/**
283
 * idn2_to_unicode_8z8z:
284
 * @input: Input zero-terminated UTF-8 string.
285
 * @output: Newly allocated UTF-8 output string.
286
 * @flags: Currently unused.
287
 *
288
 * Converts a possibly ACE encoded domain name in UTF-8 format into a
289
 * UTF-8 string (punycode decoding). The output buffer will be zero-terminated
290
 * and must be deallocated by the caller.
291
 *
292
 * @output may be NULL to test lookup of @input without allocating memory.
293
 *
294
 * Returns:
295
 *   %IDN2_OK: The conversion was successful.
296
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
297
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
298
 *   %IDN2_ENCODING_ERROR: Character conversion failed.
299
 *   %IDN2_MALLOC: Memory allocation failed.
300
 *
301
 * Since: 2.0.0
302
 **/
303
int
304
idn2_to_unicode_8z8z (const char *input, char **output, int flags)
305
3.26k
{
306
3.26k
  uint32_t *output_u32;
307
3.26k
  uint8_t *output_u8;
308
3.26k
  size_t length;
309
3.26k
  int rc;
310
311
3.26k
  rc = idn2_to_unicode_8z4z (input, &output_u32, flags);
312
3.26k
  if (rc != IDN2_OK || !input)
313
1.53k
    return rc;
314
315
1.72k
  output_u8 =
316
1.72k
    u32_to_u8 (output_u32, u32_strlen (output_u32) + 1, NULL, &length);
317
1.72k
  free (output_u32);
318
319
1.72k
  if (!output_u8)
320
72
    {
321
72
      if (errno == ENOMEM)
322
0
  return IDN2_MALLOC;
323
72
      return IDN2_ENCODING_ERROR;
324
72
    }
325
326
1.65k
  if (output)
327
1.65k
    *output = (char *) output_u8;
328
0
  else
329
0
    free (output_u8);
330
331
1.65k
  return IDN2_OK;
332
1.72k
}
333
334
/**
335
 * idn2_to_unicode_8zlz:
336
 * @input: Input zero-terminated UTF-8 string.
337
 * @output: Newly allocated output string in current locale's character set.
338
 * @flags: Currently unused.
339
 *
340
 * Converts a possibly ACE encoded domain name in UTF-8 format into a
341
 * string encoded in the current locale's character set (punycode
342
 * decoding). The output buffer will be zero-terminated and must be
343
 * deallocated by the caller.
344
 *
345
 * @output may be NULL to test lookup of @input without allocating memory.
346
 *
347
 * Returns:
348
 *   %IDN2_OK: The conversion was successful.
349
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
350
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
351
 *   %IDN2_ENCODING_ERROR: Character conversion failed.
352
 *   %IDN2_MALLOC: Memory allocation failed.
353
 *
354
 * Since: 2.0.0
355
 **/
356
int
357
idn2_to_unicode_8zlz (const char *input, char **output, int flags)
358
0
{
359
0
  int rc;
360
0
  uint8_t *output_u8, *output_l8;
361
0
  const char *encoding;
362
363
0
  rc = idn2_to_unicode_8z8z (input, (char **) &output_u8, flags);
364
0
  if (rc != IDN2_OK || !input)
365
0
    return rc;
366
367
0
  encoding = locale_charset ();
368
0
  output_l8 =
369
0
    (uint8_t *) u8_strconv_to_encoding (output_u8, encoding, iconveh_error);
370
371
0
  if (!output_l8)
372
0
    {
373
0
      if (errno == ENOMEM)
374
0
  rc = IDN2_MALLOC;
375
0
      else
376
0
  rc = IDN2_ENCODING_ERROR;
377
378
0
      free (output_l8);
379
0
    }
380
0
  else
381
0
    {
382
0
      if (output)
383
0
  *output = (char *) output_l8;
384
0
      else
385
0
  free (output_l8);
386
387
0
      rc = IDN2_OK;
388
0
    }
389
390
0
  free (output_u8);
391
392
0
  return rc;
393
0
}
394
395
/**
396
 * idn2_to_unicode_lzlz:
397
 * @input: Input zero-terminated string encoded in the current locale's character set.
398
 * @output: Newly allocated output string in current locale's character set.
399
 * @flags: Currently unused.
400
 *
401
 * Converts a possibly ACE encoded domain name in the locale's character
402
 * set into a string encoded in the current locale's character set (punycode
403
 * decoding). The output buffer will be zero-terminated and must be
404
 * deallocated by the caller.
405
 *
406
 * @output may be NULL to test lookup of @input without allocating memory.
407
 *
408
 * Returns:
409
 *   %IDN2_OK: The conversion was successful.
410
 *   %IDN2_TOO_BIG_DOMAIN: The domain is too long.
411
 *   %IDN2_TOO_BIG_LABEL: A label is would have been too long.
412
 *   %IDN2_ENCODING_ERROR: Output character conversion failed.
413
 *   %IDN2_ICONV_FAIL: Input character conversion failed.
414
 *   %IDN2_MALLOC: Memory allocation failed.
415
 *
416
 * Since: 2.0.0
417
 **/
418
int
419
idn2_to_unicode_lzlz (const char *input, char **output, int flags)
420
0
{
421
0
  uint8_t *input_l8;
422
0
  const char *encoding;
423
0
  int rc;
424
425
0
  if (!input)
426
0
    {
427
0
      if (output)
428
0
  *output = NULL;
429
0
      return IDN2_OK;
430
0
    }
431
432
0
  encoding = locale_charset ();
433
0
  input_l8 = u8_strconv_from_encoding (input, encoding, iconveh_error);
434
435
0
  if (!input_l8)
436
0
    {
437
0
      if (errno == ENOMEM)
438
0
  return IDN2_MALLOC;
439
0
      return IDN2_ICONV_FAIL;
440
0
    }
441
442
0
  rc = idn2_to_unicode_8zlz ((char *) input_l8, output, flags);
443
0
  free (input_l8);
444
445
0
  return rc;
446
0
}