Coverage Report

Created: 2026-06-07 07:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/samba/third_party/heimdal/lib/wind/utf8.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
3
 * (Royal Institute of Technology, Stockholm, Sweden).
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions
8
 * are met:
9
 *
10
 * 1. Redistributions of source code must retain the above copyright
11
 *    notice, this list of conditions and the following disclaimer.
12
 *
13
 * 2. Redistributions in binary form must reproduce the above copyright
14
 *    notice, this list of conditions and the following disclaimer in the
15
 *    documentation and/or other materials provided with the distribution.
16
 *
17
 * 3. Neither the name of the Institute nor the names of its contributors
18
 *    may be used to endorse or promote products derived from this software
19
 *    without specific prior written permission.
20
 *
21
 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31
 * SUCH DAMAGE.
32
 */
33
34
#include <config.h>
35
#include "windlocl.h"
36
37
static int
38
utf8toutf32(const unsigned char **pp, uint32_t *out)
39
0
{
40
0
    const unsigned char *p = *pp;
41
0
    uint32_t c = *p;
42
0
    uint32_t out_val;
43
44
0
    if (c & 0x80) {
45
0
  if ((c & 0xE0) == 0xC0) {
46
0
      const uint32_t c2 = *++p;
47
0
      if ((c2 & 0xC0) == 0x80) {
48
0
    out_val =  ((c  & 0x1F) << 6)
49
0
        | (c2 & 0x3F);
50
0
    if (out_val < 0x80) {
51
0
        return WIND_ERR_INVALID_UTF8;
52
0
    }
53
0
      } else {
54
0
    return WIND_ERR_INVALID_UTF8;
55
0
      }
56
0
  } else if ((c & 0xF0) == 0xE0) {
57
0
      const uint32_t c2 = *++p;
58
0
      if ((c2 & 0xC0) == 0x80) {
59
0
    const uint32_t c3 = *++p;
60
0
    if ((c3 & 0xC0) == 0x80) {
61
0
        out_val =   ((c  & 0x0F) << 12)
62
0
      | ((c2 & 0x3F) << 6)
63
0
      |  (c3 & 0x3F);
64
0
        if (out_val < 0x800) {
65
0
      return WIND_ERR_INVALID_UTF8;
66
0
        }
67
0
    } else {
68
0
        return WIND_ERR_INVALID_UTF8;
69
0
    }
70
0
      } else {
71
0
    return WIND_ERR_INVALID_UTF8;
72
0
      }
73
0
  } else if ((c & 0xF8) == 0xF0) {
74
0
      const uint32_t c2 = *++p;
75
0
      if ((c2 & 0xC0) == 0x80) {
76
0
    const uint32_t c3 = *++p;
77
0
    if ((c3 & 0xC0) == 0x80) {
78
0
        const uint32_t c4 = *++p;
79
0
        if ((c4 & 0xC0) == 0x80) {
80
0
      out_val =   ((c  & 0x07) << 18)
81
0
          | ((c2 & 0x3F) << 12)
82
0
          | ((c3 & 0x3F) <<  6)
83
0
          |  (c4 & 0x3F);
84
0
      if (out_val < 0x10000) {
85
0
          return WIND_ERR_INVALID_UTF8;
86
0
      }
87
0
        } else {
88
0
      return WIND_ERR_INVALID_UTF8;
89
0
        }
90
0
    } else {
91
0
        return WIND_ERR_INVALID_UTF8;
92
0
    }
93
0
      } else {
94
0
    return WIND_ERR_INVALID_UTF8;
95
0
      }
96
0
  } else {
97
0
      return WIND_ERR_INVALID_UTF8;
98
0
  }
99
0
    } else {
100
0
  out_val = c;
101
0
    }
102
103
    /* Allow unpaired surrogates (in the range 0xd800–0xdfff). */
104
105
0
    if (out_val > 0x10ffff) {
106
0
  return WIND_ERR_INVALID_UTF8;
107
0
    }
108
109
0
    *out = out_val;
110
0
    *pp = p;
111
112
0
    return 0;
113
0
}
114
115
/**
116
 * Convert an UTF-8 string to an UCS4 string.
117
 *
118
 * @param in an UTF-8 string to convert.
119
 * @param out the resulting UCS4 string, must be at least
120
 * wind_utf8ucs4_length() long.  If out is NULL, the function will
121
 * calculate the needed space for the out variable (just like
122
 * wind_utf8ucs4_length()).
123
 * @param out_len before processing out_len should be the length of
124
 * the out variable, after processing it will be the length of the out
125
 * string.
126
 *
127
 * @return returns 0 on success, an wind error code otherwise
128
 * @ingroup wind
129
 */
130
131
int
132
wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
133
0
{
134
0
    const unsigned char *p;
135
0
    size_t o = 0;
136
0
    int ret;
137
138
0
    for (p = (const unsigned char *)in; *p != '\0'; ++p) {
139
0
  uint32_t u;
140
141
0
  ret = utf8toutf32(&p, &u);
142
0
  if (ret)
143
0
      return ret;
144
145
0
  if (out) {
146
0
      if (o >= *out_len)
147
0
    return WIND_ERR_OVERRUN;
148
0
      out[o] = u;
149
0
  }
150
0
  o++;
151
0
    }
152
0
    *out_len = o;
153
0
    return 0;
154
0
}
155
156
/**
 * Calculate the length of the UCS4 string that results from converting
 * a UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len receives the number of UCS4 elements needed.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4_length(const char *in, size_t *out_len)
{
    /* A NULL output buffer makes wind_utf8ucs4() count without storing. */
    uint32_t *const no_output = NULL;

    return wind_utf8ucs4(in, no_output, out_len);
}
172
173
/*
 * Marker bits for the first byte of a UTF-8 sequence, indexed by
 * (sequence length - 1).  The entries above 0x7F do not fit in a plain
 * char where char is signed, so the table is explicitly unsigned.
 */
static const unsigned char first_char[4] =
    { 0x00, 0xC0, 0xE0, 0xF0 };
175
176
/**
177
 * Convert an UCS4 string to a UTF-8 string.
178
 *
179
 * @param in an UCS4 string to convert.
180
 * @param in_len the length input array.
181
182
 * @param out the resulting UTF-8 string, must be at least
183
 * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
184
 * out is NULL, the function will calculate the needed space for the
185
 * out variable (just like wind_ucs4utf8_length()).
186
187
 * @param out_len before processing out_len should be the length of
188
 * the out variable, after processing it will be the length of the out
189
 * string.
190
 *
191
 * @return returns 0 on success, an wind error code otherwise
192
 * @ingroup wind
193
 */
194
195
int
196
wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
197
0
{
198
0
    uint32_t ch;
199
0
    size_t i, len, o;
200
201
0
    for (o = 0, i = 0; i < in_len; i++) {
202
0
  ch = in[i];
203
204
0
  if (ch < 0x80) {
205
0
      len = 1;
206
0
  } else if (ch < 0x800) {
207
0
      len = 2;
208
0
  } else if (ch < 0x10000) {
209
0
      len = 3;
210
0
  } else if (ch <= 0x10FFFF) {
211
0
      len = 4;
212
0
  } else
213
0
      return WIND_ERR_INVALID_UTF32;
214
215
0
  o += len;
216
217
0
  if (out) {
218
0
      if (o >= *out_len)
219
0
    return WIND_ERR_OVERRUN;
220
221
0
      switch(len) {
222
0
      case 4:
223
0
    out[3] = (ch | 0x80) & 0xbf;
224
0
    ch = ch >> 6;
225
0
                HEIM_FALLTHROUGH;
226
0
      case 3:
227
0
    out[2] = (ch | 0x80) & 0xbf;
228
0
    ch = ch >> 6;
229
0
                HEIM_FALLTHROUGH;
230
0
      case 2:
231
0
    out[1] = (ch | 0x80) & 0xbf;
232
0
    ch = ch >> 6;
233
0
                HEIM_FALLTHROUGH;
234
0
      case 1:
235
0
    out[0] = ch | first_char[len - 1];
236
0
                HEIM_FALLTHROUGH;
237
0
            default:
238
0
                break;
239
0
      }
240
0
      out += len;
241
0
  }
242
0
    }
243
0
    if (out) {
244
0
  if (o + 1 >= *out_len)
245
0
      return WIND_ERR_OVERRUN;
246
0
  *out = '\0';
247
0
    }
248
0
    *out_len = o;
249
0
    return 0;
250
0
}
251
252
/**
 * Calculate the length of the UTF-8 string that results from converting
 * an UCS4 string.
 *
 * @param in the UCS4 string to measure.
 * @param in_len the number of elements in the UCS4 string.
 * @param out_len receives the length of the resulting UTF-8 string,
 * not counting a terminating NUL.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL output buffer makes wind_ucs4utf8() count without storing. */
    char *const no_output = NULL;

    return wind_ucs4utf8(in, in_len, no_output, out_len);
}
268
269
/**
270
 * Read in an UCS2 from a buffer.
271
 *
272
 * @param ptr The input buffer to read from.
273
 * @param len the length of the input buffer.
274
 * @param flags Flags to control the behavior of the function.
275
 * @param out the output UCS2, the array must be at least out/2 long.
276
 * @param out_len the output length
277
 *
278
 * @return returns 0 on success, an wind error code otherwise.
279
 * @ingroup wind
280
 */
281
282
int
283
wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
284
        uint16_t *out, size_t *out_len)
285
0
{
286
0
    const unsigned char *p = ptr;
287
0
    int little = ((*flags) & WIND_RW_LE);
288
0
    size_t olen = *out_len;
289
290
    /** if len is zero, flags are unchanged */
291
0
    if (len == 0) {
292
0
  *out_len = 0;
293
0
  return 0;
294
0
    }
295
296
    /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
297
0
    if (len & 1)
298
0
  return WIND_ERR_LENGTH_NOT_MOD2;
299
300
    /**
301
     * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
302
     * found, check is LE/BE flag is already and use that otherwise
303
     * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
304
     * the LE/BE flag and set the resulting LE/BE flag.
305
     */
306
0
    if ((*flags) & WIND_RW_BOM) {
307
0
  uint16_t bom = (p[0] << 8) + p[1];
308
0
  if (bom == 0xfffe || bom == 0xfeff) {
309
0
      little = (bom == 0xfffe);
310
0
      p += 2;
311
0
      len -= 2;
312
0
  } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
313
      /* little already set */
314
0
  } else
315
0
      return WIND_ERR_NO_BOM;
316
0
  *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
317
0
  *flags |= little ? WIND_RW_LE : WIND_RW_BE;
318
0
    }
319
320
0
    while (len) {
321
0
  if (olen < 1)
322
0
      return WIND_ERR_OVERRUN;
323
0
  if (little)
324
0
      *out = (p[1] << 8) + p[0];
325
0
  else
326
0
      *out = (p[0] << 8) + p[1];
327
0
  out++; p += 2; len -= 2; olen--;
328
0
    }
329
0
    *out_len -= olen;
330
0
    return 0;
331
0
}
332
333
/**
334
 * Write an UCS2 string to a buffer.
335
 *
336
 * @param in The input UCS2 string.
337
 * @param in_len the length of the input buffer.
338
 * @param flags Flags to control the behavior of the function.
339
 * @param ptr The input buffer to write to, the array must be at least
340
 * (in + 1) * 2 bytes long.
341
 * @param out_len the output length
342
 *
343
 * @return returns 0 on success, an wind error code otherwise.
344
 * @ingroup wind
345
 */
346
347
int
348
wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
349
         void *ptr, size_t *out_len)
350
0
{
351
0
    unsigned char *p = ptr;
352
0
    size_t len = *out_len;
353
354
    /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
355
0
    if (len & 1)
356
0
  return WIND_ERR_LENGTH_NOT_MOD2;
357
358
    /** On zero input length, flags are preserved */
359
0
    if (in_len == 0) {
360
0
  *out_len = 0;
361
0
  return 0;
362
0
    }
363
    /** If flags have WIND_RW_BOM set, the byte order mark is written
364
     * first to the output data */
365
0
    if ((*flags) & WIND_RW_BOM) {
366
0
  uint16_t bom = 0xfffe;
367
368
0
  if (len < 2)
369
0
      return WIND_ERR_OVERRUN;
370
371
0
  if ((*flags) & WIND_RW_LE) {
372
0
      p[0] = (bom     ) & 0xff;
373
0
      p[1] = (bom >> 8) & 0xff;
374
0
  } else {
375
0
      p[1] = (bom     ) & 0xff;
376
0
      p[0] = (bom >> 8) & 0xff;
377
0
  }
378
0
  len -= 2;
379
0
    }
380
381
0
    while (in_len) {
382
  /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
383
0
  if (len < 2)
384
0
      return WIND_ERR_OVERRUN;
385
0
  if ((*flags) & WIND_RW_LE) {
386
0
      p[0] = (in[0]     ) & 0xff;
387
0
      p[1] = (in[0] >> 8) & 0xff;
388
0
  } else {
389
0
      p[1] = (in[0]     ) & 0xff;
390
0
      p[0] = (in[0] >> 8) & 0xff;
391
0
  }
392
0
  len -= 2;
393
0
  in_len--;
394
0
  p += 2;
395
0
  in++;
396
0
    }
397
0
    *out_len -= len;
398
0
    return 0;
399
0
}
400
401
402
/**
403
 * Convert an UTF-8 string to an UCS2 string.
404
 *
405
 * @param in an UTF-8 string to convert.
406
 * @param out the resulting UCS2 string, must be at least
407
 * wind_utf8ucs2_length() long.  If out is NULL, the function will
408
 * calculate the needed space for the out variable (just like
409
 * wind_utf8ucs2_length()).
410
 * @param out_len before processing out_len should be the length of
411
 * the out variable, after processing it will be the length of the out
412
 * string.
413
 *
414
 * @return returns 0 on success, an wind error code otherwise
415
 * @ingroup wind
416
 */
417
418
int
419
wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
420
0
{
421
0
    const unsigned char *p;
422
0
    size_t o = 0;
423
0
    int ret;
424
425
0
    for (p = (const unsigned char *)in; *p != '\0'; ++p) {
426
0
  uint32_t u;
427
428
0
  ret = utf8toutf32(&p, &u);
429
0
  if (ret)
430
0
      return ret;
431
432
0
  if (u >= 0x10000) {
433
0
      if (out) {
434
0
    uint16_t high_ten_bits;
435
0
    uint16_t low_ten_bits;
436
437
0
    if (o + 2 > *out_len)
438
0
        return WIND_ERR_OVERRUN;
439
440
0
    u -= 0x10000;
441
0
    high_ten_bits = (u >> 10) & 0x3ff;
442
0
    low_ten_bits = u & 0x3ff;
443
444
0
    out[o] = 0xd800 | high_ten_bits;
445
0
    out[o+1] = 0xdc00 | low_ten_bits;
446
0
      }
447
0
      o += 2;
448
0
  } else {
449
0
      if (out) {
450
0
    if (o >= *out_len)
451
0
        return WIND_ERR_OVERRUN;
452
0
    out[o] = u;
453
0
      }
454
0
      o++;
455
0
  }
456
0
    }
457
0
    *out_len = o;
458
0
    return 0;
459
0
}
460
461
/**
 * Calculate the length of the UCS2 string that results from converting
 * a UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len receives the number of UCS2 elements needed.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs2_length(const char *in, size_t *out_len)
{
    /* A NULL output buffer makes wind_utf8ucs2() count without storing. */
    uint16_t *const no_output = NULL;

    return wind_utf8ucs2(in, no_output, out_len);
}
477
478
/**
479
 * Convert an UCS2 string to a UTF-8 string.
480
 *
481
 * @param in an UCS2 string to convert.
482
 * @param in_len the length of the in UCS2 string.
483
 * @param out the resulting UTF-8 string, must be at least
484
 * wind_ucs2utf8_length() long.  If out is NULL, the function will
485
 * calculate the needed space for the out variable (just like
486
 * wind_ucs2utf8_length()).
487
 * @param out_len before processing out_len should be the length of
488
 * the out variable, after processing it will be the length of the out
489
 * string.
490
 *
491
 * @return returns 0 on success, an wind error code otherwise
492
 * @ingroup wind
493
 */
494
495
int
496
wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
497
0
{
498
0
    uint32_t ch;
499
0
    size_t i, len, o;
500
501
0
    for (o = 0, i = 0; i < in_len; i++) {
502
0
  ch = in[i];
503
504
0
  if (ch < 0x80) {
505
0
      len = 1;
506
0
  } else if (ch < 0x800) {
507
0
      len = 2;
508
0
  } else if (ch < 0xd800 || ch >= 0xe000) {
509
0
      len = 3;
510
0
  } else if (ch < 0xdc00) {
511
      /* A high surrogate. */
512
0
      if (i < in_len - 1) {
513
0
    uint16_t ch2 = in[i + 1];
514
515
0
    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
516
0
        uint16_t high_ten_bits;
517
0
        uint16_t low_ten_bits;
518
519
        /* A surrogate pair. */
520
0
        high_ten_bits = ch & 0x3ff;
521
0
        low_ten_bits = ch2 & 0x3ff;
522
523
0
        ch = 0x10000 + ((uint32_t)high_ten_bits << 10 | low_ten_bits);
524
0
        len = 4;
525
0
        ++i;
526
0
    } else {
527
        /* An unpaired high surrogate. */
528
0
        len = 3;
529
0
    }
530
0
      } else {
531
    /* An unpaired high surrogate. */
532
0
    len = 3;
533
0
      }
534
0
  } else {
535
      /* An unpaired low surrogate. */
536
0
      len = 3;
537
0
  }
538
539
0
  o += len;
540
541
0
  if (out) {
542
0
      if (o >= *out_len)
543
0
    return WIND_ERR_OVERRUN;
544
545
0
      switch(len) {
546
0
      case 4:
547
0
    out[3] = (ch | 0x80) & 0xbf;
548
0
    ch = ch >> 6;
549
0
    HEIM_FALLTHROUGH;
550
0
      case 3:
551
0
    out[2] = (ch | 0x80) & 0xbf;
552
0
    ch = ch >> 6;
553
0
                HEIM_FALLTHROUGH;
554
0
      case 2:
555
0
    out[1] = (ch | 0x80) & 0xbf;
556
0
    ch = ch >> 6;
557
0
                HEIM_FALLTHROUGH;
558
0
      case 1:
559
0
    out[0] = ch | first_char[len - 1];
560
0
                HEIM_FALLTHROUGH;
561
0
            default:
562
0
                break;
563
0
      }
564
0
      out += len;
565
0
  }
566
0
    }
567
0
    if (out) {
568
0
  if (o >= *out_len)
569
0
      return WIND_ERR_OVERRUN;
570
0
  *out = '\0';
571
0
    }
572
0
    *out_len = o;
573
0
    return 0;
574
0
}
575
576
/**
 * Calculate the length of the UTF-8 string that results from converting
 * an UCS2 string.
 *
 * @param in the UCS2 string to measure.
 * @param in_len the number of elements in the UCS2 string.
 * @param out_len receives the length of the resulting UTF-8 string,
 * not counting a terminating NUL.
 *
 * @return returns 0 on success, an wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL output buffer makes wind_ucs2utf8() count without storing. */
    char *const no_output = NULL;

    return wind_ucs2utf8(in, in_len, no_output, out_len);
}