Coverage Report

Created: 2025-07-01 06:25

/src/nss/lib/util/utf8.c
Line
Count
Source (jump to first uncovered line)
1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
 * License, v. 2.0. If a copy of the MPL was not distributed with this
3
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
#include "seccomon.h"
6
#include "secport.h"
7
8
/*
9
 * From RFC 2044:
10
 *
11
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
12
 * 0000 0000-0000 007F   0xxxxxxx
13
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
14
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
15
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
16
 * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
17
 * 0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
18
 */
19
20
/*
21
 * From http://www.imc.org/draft-hoffman-utf16
22
 *
23
 * For U on [0x00010000,0x0010FFFF]:  Let U' = U - 0x00010000
24
 *
25
 * U' = yyyyyyyyyyxxxxxxxxxx
26
 * W1 = 110110yyyyyyyyyy
27
 * W2 = 110111xxxxxxxxxx
28
 */
29
30
/*
31
 * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit
32
 * character values.  If you wish to use this code for working with
33
 * host byte order values, define the following:
34
 *
35
 * #if IS_BIG_ENDIAN
36
 * #define L_0 0
37
 * #define L_1 1
38
 * #define L_2 2
39
 * #define L_3 3
40
 * #define H_0 0
41
 * #define H_1 1
42
 * #else / * not everyone has elif * /
43
 * #if IS_LITTLE_ENDIAN
44
 * #define L_0 3
45
 * #define L_1 2
46
 * #define L_2 1
47
 * #define L_3 0
48
 * #define H_0 1
49
 * #define H_1 0
50
 * #else
51
 * #error "PDP and NUXI support deferred"
52
 * #endif / * IS_LITTLE_ENDIAN * /
53
 * #endif / * IS_BIG_ENDIAN * /
54
 */
55
56
0
/*
 * Byte offsets of the four bytes of one UCS-4 character inside a buffer.
 * With these values L_0 addresses the most significant byte and L_3 the
 * least significant one (network byte order).
 */
#define L_0 0
57
0
#define L_1 1
58
0
#define L_2 2
59
0
#define L_3 3
60
0
/*
 * Byte offsets inside one 2-byte UCS-2 / UTF-16 code unit: H_0 addresses
 * the high byte and H_1 the low byte.
 */
#define H_0 0
61
0
#define H_1 1
62
63
0
/*
 * Failure value returned by sec_port_read_utf8.  The decoder rejects
 * every value above 0x10FFFF, so this can never be a decoded character.
 */
#define BAD_UTF8 ((PRUint32)-1)
64
65
/*
66
 * Parse a single UTF-8 character per the spec. in section 3.9 (D36)
67
 * of Unicode 4.0.0.
68
 *
69
 * Parameters:
70
 * index - Points to the byte offset in inBuf of character to read.  On success,
71
 *         updated to the offset of the following character.
72
 * inBuf - Input buffer, UTF-8 encoded
73
 * inbufLen - Length of input buffer, in bytes.
74
 *
75
 * Returns:
76
 * Success - The UCS4 encoded character
77
 * Failure - BAD_UTF8
78
 */
79
static PRUint32
80
sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen)
81
0
{
82
0
    PRUint32 result;
83
0
    unsigned int i = *index;
84
0
    int bytes_left;
85
0
    PRUint32 min_value;
86
87
0
    PORT_Assert(i < inBufLen);
88
89
0
    if ((inBuf[i] & 0x80) == 0x00) {
90
0
        result = inBuf[i++];
91
0
        bytes_left = 0;
92
0
        min_value = 0;
93
0
    } else if ((inBuf[i] & 0xE0) == 0xC0) {
94
0
        result = inBuf[i++] & 0x1F;
95
0
        bytes_left = 1;
96
0
        min_value = 0x80;
97
0
    } else if ((inBuf[i] & 0xF0) == 0xE0) {
98
0
        result = inBuf[i++] & 0x0F;
99
0
        bytes_left = 2;
100
0
        min_value = 0x800;
101
0
    } else if ((inBuf[i] & 0xF8) == 0xF0) {
102
0
        result = inBuf[i++] & 0x07;
103
0
        bytes_left = 3;
104
0
        min_value = 0x10000;
105
0
    } else {
106
0
        return BAD_UTF8;
107
0
    }
108
109
0
    while (bytes_left--) {
110
0
        if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80)
111
0
            return BAD_UTF8;
112
0
        result = (result << 6) | (inBuf[i++] & 0x3F);
113
0
    }
114
115
    /* Check for overlong sequences, surrogates, and outside unicode range */
116
0
    if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) {
117
0
        return BAD_UTF8;
118
0
    }
119
120
0
    *index = i;
121
0
    return result;
122
0
}
123
124
PRBool
125
sec_port_ucs4_utf8_conversion_function(
126
    PRBool toUnicode,
127
    unsigned char *inBuf,
128
    unsigned int inBufLen,
129
    unsigned char *outBuf,
130
    unsigned int maxOutBufLen,
131
    unsigned int *outBufLen)
132
0
{
133
0
    PORT_Assert((unsigned int *)NULL != outBufLen);
134
135
0
    if (toUnicode) {
136
0
        unsigned int i, len = 0;
137
138
0
        for (i = 0; i < inBufLen;) {
139
0
            if ((inBuf[i] & 0x80) == 0x00)
140
0
                i += 1;
141
0
            else if ((inBuf[i] & 0xE0) == 0xC0)
142
0
                i += 2;
143
0
            else if ((inBuf[i] & 0xF0) == 0xE0)
144
0
                i += 3;
145
0
            else if ((inBuf[i] & 0xF8) == 0xF0)
146
0
                i += 4;
147
0
            else
148
0
                return PR_FALSE;
149
150
0
            len += 4;
151
0
        }
152
153
0
        if (len > maxOutBufLen) {
154
0
            *outBufLen = len;
155
0
            return PR_FALSE;
156
0
        }
157
158
0
        len = 0;
159
160
0
        for (i = 0; i < inBufLen;) {
161
0
            PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
162
163
0
            if (ucs4 == BAD_UTF8)
164
0
                return PR_FALSE;
165
166
0
            outBuf[len + L_0] = 0x00;
167
0
            outBuf[len + L_1] = (unsigned char)(ucs4 >> 16);
168
0
            outBuf[len + L_2] = (unsigned char)(ucs4 >> 8);
169
0
            outBuf[len + L_3] = (unsigned char)ucs4;
170
171
0
            len += 4;
172
0
        }
173
174
0
        *outBufLen = len;
175
0
        return PR_TRUE;
176
0
    } else {
177
0
        unsigned int i, len = 0;
178
0
        PORT_Assert((inBufLen % 4) == 0);
179
0
        if ((inBufLen % 4) != 0) {
180
0
            *outBufLen = 0;
181
0
            return PR_FALSE;
182
0
        }
183
184
0
        for (i = 0; i < inBufLen; i += 4) {
185
0
            if ((inBuf[i + L_0] > 0x00) || (inBuf[i + L_1] > 0x10)) {
186
0
                *outBufLen = 0;
187
0
                return PR_FALSE;
188
0
            } else if (inBuf[i + L_1] >= 0x01)
189
0
                len += 4;
190
0
            else if (inBuf[i + L_2] >= 0x08)
191
0
                len += 3;
192
0
            else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80))
193
0
                len += 2;
194
0
            else
195
0
                len += 1;
196
0
        }
197
198
0
        if (len > maxOutBufLen) {
199
0
            *outBufLen = len;
200
0
            return PR_FALSE;
201
0
        }
202
203
0
        len = 0;
204
205
0
        for (i = 0; i < inBufLen; i += 4) {
206
0
            if (inBuf[i + L_1] >= 0x01) {
207
                /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
208
                /* 00000000 000abcde fghijklm nopqrstu ->
209
                   11110abc 10defghi 10jklmno 10pqrstu */
210
211
0
                outBuf[len + 0] = 0xF0 | ((inBuf[i + L_1] & 0x1C) >> 2);
212
0
                outBuf[len + 1] = 0x80 | ((inBuf[i + L_1] & 0x03) << 4) | ((inBuf[i + L_2] & 0xF0) >> 4);
213
0
                outBuf[len + 2] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
214
0
                outBuf[len + 3] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
215
216
0
                len += 4;
217
0
            } else if (inBuf[i + L_2] >= 0x08) {
218
                /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
219
                /* 00000000 00000000 abcdefgh ijklmnop ->
220
                   1110abcd 10efghij 10klmnop */
221
222
0
                outBuf[len + 0] = 0xE0 | ((inBuf[i + L_2] & 0xF0) >> 4);
223
0
                outBuf[len + 1] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
224
0
                outBuf[len + 2] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
225
226
0
                len += 3;
227
0
            } else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) {
228
                /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
229
                /* 00000000 00000000 00000abc defghijk ->
230
                   110abcde 10fghijk */
231
232
0
                outBuf[len + 0] = 0xC0 | ((inBuf[i + L_2] & 0x07) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
233
0
                outBuf[len + 1] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
234
235
0
                len += 2;
236
0
            } else {
237
                /* 0000 0000-0000 007F -> 0xxxxxx */
238
                /* 00000000 00000000 00000000 0abcdefg ->
239
                   0abcdefg */
240
241
0
                outBuf[len + 0] = (inBuf[i + L_3] & 0x7F);
242
243
0
                len += 1;
244
0
            }
245
0
        }
246
247
0
        *outBufLen = len;
248
0
        return PR_TRUE;
249
0
    }
250
0
}
251
252
PRBool
253
sec_port_ucs2_utf8_conversion_function(
254
    PRBool toUnicode,
255
    unsigned char *inBuf,
256
    unsigned int inBufLen,
257
    unsigned char *outBuf,
258
    unsigned int maxOutBufLen,
259
    unsigned int *outBufLen)
260
0
{
261
0
    PORT_Assert((unsigned int *)NULL != outBufLen);
262
263
0
    if (toUnicode) {
264
0
        unsigned int i, len = 0;
265
266
0
        for (i = 0; i < inBufLen;) {
267
0
            if ((inBuf[i] & 0x80) == 0x00) {
268
0
                i += 1;
269
0
                len += 2;
270
0
            } else if ((inBuf[i] & 0xE0) == 0xC0) {
271
0
                i += 2;
272
0
                len += 2;
273
0
            } else if ((inBuf[i] & 0xF0) == 0xE0) {
274
0
                i += 3;
275
0
                len += 2;
276
0
            } else if ((inBuf[i] & 0xF8) == 0xF0) {
277
0
                i += 4;
278
0
                len += 4;
279
0
            } else
280
0
                return PR_FALSE;
281
0
        }
282
283
0
        if (len > maxOutBufLen) {
284
0
            *outBufLen = len;
285
0
            return PR_FALSE;
286
0
        }
287
288
0
        len = 0;
289
290
0
        for (i = 0; i < inBufLen;) {
291
0
            PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
292
293
0
            if (ucs4 == BAD_UTF8)
294
0
                return PR_FALSE;
295
296
0
            if (ucs4 < 0x10000) {
297
0
                outBuf[len + H_0] = (unsigned char)(ucs4 >> 8);
298
0
                outBuf[len + H_1] = (unsigned char)ucs4;
299
0
                len += 2;
300
0
            } else {
301
0
                ucs4 -= 0x10000;
302
0
                outBuf[len + 0 + H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3));
303
0
                outBuf[len + 0 + H_1] = (unsigned char)(ucs4 >> 10);
304
0
                outBuf[len + 2 + H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3));
305
0
                outBuf[len + 2 + H_1] = (unsigned char)ucs4;
306
0
                len += 4;
307
0
            }
308
0
        }
309
310
0
        *outBufLen = len;
311
0
        return PR_TRUE;
312
0
    } else {
313
0
        unsigned int i, len = 0;
314
0
        PORT_Assert((inBufLen % 2) == 0);
315
0
        if ((inBufLen % 2) != 0) {
316
0
            *outBufLen = 0;
317
0
            return PR_FALSE;
318
0
        }
319
320
0
        for (i = 0; i < inBufLen; i += 2) {
321
0
            if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00))
322
0
                len += 1;
323
0
            else if (inBuf[i + H_0] < 0x08)
324
0
                len += 2;
325
0
            else if (((inBuf[i + H_0] & 0xFC) == 0xD8)) {
326
0
                if (((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)) {
327
0
                    i += 2;
328
0
                    len += 4;
329
0
                } else {
330
0
                    return PR_FALSE;
331
0
                }
332
0
            } else if ((inBuf[i + H_0] & 0xFC) == 0xDC) {
333
0
                return PR_FALSE;
334
0
            } else {
335
0
                len += 3;
336
0
            }
337
0
        }
338
339
0
        if (len > maxOutBufLen) {
340
0
            *outBufLen = len;
341
0
            return PR_FALSE;
342
0
        }
343
344
0
        len = 0;
345
346
0
        for (i = 0; i < inBufLen; i += 2) {
347
0
            if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) {
348
                /* 0000-007F -> 0xxxxxx */
349
                /* 00000000 0abcdefg -> 0abcdefg */
350
351
0
                outBuf[len] = inBuf[i + H_1] & 0x7F;
352
353
0
                len += 1;
354
0
            } else if (inBuf[i + H_0] < 0x08) {
355
                /* 0080-07FF -> 110xxxxx 10xxxxxx */
356
                /* 00000abc defghijk -> 110abcde 10fghijk */
357
358
0
                outBuf[len + 0] = 0xC0 | ((inBuf[i + H_0] & 0x07) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
359
0
                outBuf[len + 1] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
360
361
0
                len += 2;
362
0
            } else if ((inBuf[i + H_0] & 0xFC) == 0xD8) {
363
0
                int abcde, BCDE;
364
365
0
                PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC));
366
367
                /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
368
                /* 110110BC DEfghijk 110111lm nopqrstu ->
369
                   { Let abcde = BCDE + 1 }
370
                   11110abc 10defghi 10jklmno 10pqrstu */
371
372
0
                BCDE = ((inBuf[i + H_0] & 0x03) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
373
0
                abcde = BCDE + 1;
374
375
0
                outBuf[len + 0] = 0xF0 | ((abcde & 0x1C) >> 2);
376
0
                outBuf[len + 1] = 0x80 | ((abcde & 0x03) << 4) | ((inBuf[i + 0 + H_1] & 0x3C) >> 2);
377
0
                outBuf[len + 2] = 0x80 | ((inBuf[i + 0 + H_1] & 0x03) << 4) | ((inBuf[i + 2 + H_0] & 0x03) << 2) | ((inBuf[i + 2 + H_1] & 0xC0) >> 6);
378
0
                outBuf[len + 3] = 0x80 | ((inBuf[i + 2 + H_1] & 0x3F) >> 0);
379
380
0
                i += 2;
381
0
                len += 4;
382
0
            } else {
383
                /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
384
                /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */
385
386
0
                outBuf[len + 0] = 0xE0 | ((inBuf[i + H_0] & 0xF0) >> 4);
387
0
                outBuf[len + 1] = 0x80 | ((inBuf[i + H_0] & 0x0F) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
388
0
                outBuf[len + 2] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
389
390
0
                len += 3;
391
0
            }
392
0
        }
393
394
0
        *outBufLen = len;
395
0
        return PR_TRUE;
396
0
    }
397
0
}
398
399
PRBool
400
sec_port_iso88591_utf8_conversion_function(
401
    const unsigned char *inBuf,
402
    unsigned int inBufLen,
403
    unsigned char *outBuf,
404
    unsigned int maxOutBufLen,
405
    unsigned int *outBufLen)
406
0
{
407
0
    unsigned int i, len = 0;
408
409
0
    PORT_Assert((unsigned int *)NULL != outBufLen);
410
411
0
    for (i = 0; i < inBufLen; i++) {
412
0
        if ((inBuf[i] & 0x80) == 0x00)
413
0
            len += 1;
414
0
        else
415
0
            len += 2;
416
0
    }
417
418
0
    if (len > maxOutBufLen) {
419
0
        *outBufLen = len;
420
0
        return PR_FALSE;
421
0
    }
422
423
0
    len = 0;
424
425
0
    for (i = 0; i < inBufLen; i++) {
426
0
        if ((inBuf[i] & 0x80) == 0x00) {
427
            /* 00-7F -> 0xxxxxxx */
428
            /* 0abcdefg -> 0abcdefg */
429
430
0
            outBuf[len] = inBuf[i];
431
0
            len += 1;
432
0
        } else {
433
            /* 80-FF <- 110xxxxx 10xxxxxx */
434
            /* 00000000 abcdefgh -> 110000ab 10cdefgh */
435
436
0
            outBuf[len + 0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6);
437
0
            outBuf[len + 1] = 0x80 | ((inBuf[i] & 0x3F) >> 0);
438
439
0
            len += 2;
440
0
        }
441
0
    }
442
443
0
    *outBufLen = len;
444
0
    return PR_TRUE;
445
0
}