Coverage Report

Created: 2025-02-15 06:25

/src/wireshark/wsutil/unicode-utils.c
Line
Count
Source (jump to first uncovered line)
1
/* unicode-utils.c
2
 * Unicode utility routines
3
 *
4
 * Wireshark - Network traffic analyzer
5
 * By Gerald Combs <gerald@wireshark.org>
6
 * Copyright 2006 Gerald Combs
7
 *
8
 * SPDX-License-Identifier: GPL-2.0-or-later
9
 */
10
11
#include "config.h"
12
13
#include "unicode-utils.h"
14
15
/*
 * Lookup table indexed by a byte value, giving the length in bytes of the
 * UTF-8 sequence that starts with that byte.  An entry of 0 is used for
 * byte values that do not start a sequence: the continuation range
 * 0x80..0xbf, the overlong lead bytes 0xc0 and 0xc1, and 0xf5..0xff.
 */
const int ws_utf8_seqlen[256] = {
    /* 0x00..0x7f: single byte (7-bit ASCII) */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    /* 0x80..0xbf: continuation bytes, never the start of a sequence */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    /* 0xc0..0xdf: two bytes, except 0xc0 and 0xc1 */
    0,0,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

    /* 0xe0..0xef: three bytes */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,

    /* 0xf0..0xf4: four bytes; 0xf5..0xff: not a lead byte */
    4,4,4,4,4,0,0,0, 0,0,0,0,0,0,0,0,
};
33
34
/* Validate a run of bytes as UTF-8.
 *
 * Scans at most `length` bytes beginning at `start` and stops either at
 * the end of the input or right after the first ill-formed sequence.
 *
 * Returns the number of leading bytes that are well-formed UTF-8.
 * `*end` is set to the position just past the region that was checked:
 * the bytes between `start + <return value>` and `*end` (if any) are one
 * ill-formed sequence that the caller is expected to replace with a single
 * REPLACEMENT CHARACTER.
 *
 * Differs from GLib's g_utf8_validate_len() in two ways: NUL bytes are
 * accepted as valid UTF-8, and maximal subparts are treated as a unit.
 * (I.e., given 2 or 3 bytes which are a truncated version of a 3 or 4
 * byte UTF-8 character, where the next byte does not continue the
 * character, the whole set of 2 or 3 bytes is reported as one ill-formed
 * sequence; the byte that failed to continue it is left unconsumed.)
 */
static inline size_t
utf_8_validate(const uint8_t *start, ssize_t length, const uint8_t **end)
{
    const uint8_t *cursor = start;
    size_t accepted = 0;

    while (length > 0) {
        const uint8_t lead = *cursor;
        size_t seq_len;
        /* Allowed range for the next continuation byte. */
        uint8_t lo = 0x80;
        uint8_t hi = 0xbf;
        size_t i;

        /* 0xxxxxxx: ASCII (NUL included) is always valid. */
        if (lead < 0x80) {
            accepted++;
            cursor++;
            length--;
            continue;
        }

        /* Stray continuation bytes, the overlong leads 0xc0/0xc1 and
         * leads above 0xf4 are ill-formed on their own: consume just
         * that one byte. */
        if (lead < 0xc2 || lead > 0xf4) {
            *end = cursor + 1;
            return accepted;
        }

        if (lead < 0xe0) {          /* 110xxxxx, 2 byte char */
            seq_len = 2;
        } else if (lead < 0xf0) {   /* 1110xxxx, 3 byte char */
            seq_len = 3;
            if (lead == 0xe0) {
                lo = 0xa0;          /* reject overlong 3 byte forms */
            } else if (lead == 0xed) {
                hi = 0x9f;          /* reject UTF-16 surrogate halves */
            }
        } else {                    /* 11110xxx, 4 byte char */
            seq_len = 4;
            if (lead == 0xf0) {
                lo = 0x90;          /* reject overlong 4 byte forms */
            } else if (lead == 0xf4) {
                hi = 0x8f;          /* reject code points above U+10FFFF */
            }
        }

        /* Walk the continuation bytes.  On failure the offending byte is
         * not consumed, so that it starts the next validation pass. */
        for (i = 1; i < seq_len; i++) {
            cursor++;
            length--;
            if (length < 1 || *cursor < lo || *cursor > hi) {
                *end = cursor;
                return accepted;
            }
            /* Only the first continuation byte has a restricted range. */
            lo = 0x80;
            hi = 0xbf;
        }

        /* Step past the final byte of a complete, well-formed character. */
        cursor++;
        length--;
        accepted += seq_len;
    }

    *end = cursor;
    return accepted;
}
159
160
/*
161
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
162
 * referred to by the pointer and length as a UTF-8 string, and return a
163
 * pointer to a UTF-8 string, allocated using the wmem scope, with all
164
 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
165
 * according to the recommended "best practices" given in the Unicode
166
 * Standard and specified by W3C/WHATWG.
167
 *
168
 * Note that in conformance with the Unicode Standard, this treats three
169
 * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
170
 * and two byte overlong encodings of 7-bit ASCII characters as invalid and
171
 * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
172
 * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
173
 * be added later.
174
 *
175
 * Compared with g_utf8_make_valid(), this function does not consider
176
 * internal NUL bytes as invalid and replace them with replacment characters.
177
 * It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3
178
 * bytes which are a truncated version of a valid 3 or 4 byte character (but
179
 * the next byte does not continue the character) are replaced with a single
180
 * REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the
181
 * sequence with its own (3 octet) REPLACEMENT CHARACTER.
182
 *
183
 * XXX: length should probably be a size_t instead of a int in all
184
 * these encoding functions
185
 * XXX: the buffer returned can be of different length than the input,
186
 * and can have internal NULs as well (so that strlen doesn't give its
187
 * length). As with the other encoding functions, we should return the
188
 * length of the output buffer (or a wmem_strbuf_t directly) and an
189
 * indication of whether there was an invalid character (i.e.
190
 * REPLACEMENT CHARACTER was used.)
191
 */
192
wmem_strbuf_t *
193
ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
194
36.9k
{
195
36.9k
    wmem_strbuf_t *str;
196
197
36.9k
    str = wmem_strbuf_new_sized(scope, length+1);
198
199
    /* See the Unicode Standard conformance chapter at
200
     * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
201
     * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
202
     * U+FFFD Substitution of Maximal Subparts. */
203
204
385k
    while (length > 0) {
205
348k
        const uint8_t *prev = ptr;
206
348k
        size_t valid_bytes = utf_8_validate(prev, length, &ptr);
207
208
348k
        if (valid_bytes) {
209
116k
            wmem_strbuf_append_len(str, prev, valid_bytes);
210
116k
        }
211
348k
        length -= ptr - prev;
212
348k
        prev += valid_bytes;
213
348k
        if (ptr - prev) {
214
323k
            wmem_strbuf_append_unichar_repl(str);
215
323k
        }
216
348k
    }
217
218
36.9k
    return str;
219
36.9k
}
220
221
uint8_t *
222
ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
223
36.9k
{
224
36.9k
    wmem_strbuf_t *str = ws_utf8_make_valid_strbuf(scope, ptr, length);
225
36.9k
    return wmem_strbuf_finalize(str);
226
36.9k
}
227
228
#ifdef _WIN32
229
230
#include <strsafe.h>
231
232
/** @file
233
 * Unicode utilities (internal interface)
234
 *
235
 * We define UNICODE and _UNICODE under Windows.  This means that
236
 * Windows SDK routines expect UTF-16 strings, in contrast to newer
237
 * versions of Glib and GTK+ which expect UTF-8.  This module provides
238
 * convenience routines for converting between UTF-8 and UTF-16.
239
 */
240
241
#define INITIAL_UTFBUF_SIZE 128
242
243
/*
244
 * XXX - Should we use g_utf8_to_utf16() and g_utf16_to_utf8()
245
 * instead?  The goal of the functions below was to provide simple
246
 * wrappers for UTF-8 <-> UTF-16 conversion without making the
247
 * caller worry about freeing up memory afterward.
248
 */
249
250
/* Convert from UTF-8 to UTF-16. */
251
const wchar_t *
252
utf_8to16(const char *utf8str)
253
{
254
    static wchar_t *utf16buf[3];
255
    static int utf16buf_len[3];
256
    static int idx;
257
258
    if (utf8str == NULL)
259
        return NULL;
260
261
    idx = (idx + 1) % 3;
262
263
    /*
264
     * Allocate the buffer if it's not already allocated.
265
     */
266
    if (utf16buf[idx] == NULL) {
267
        utf16buf_len[idx] = INITIAL_UTFBUF_SIZE;
268
        utf16buf[idx] = g_malloc(utf16buf_len[idx] * sizeof(wchar_t));
269
    }
270
271
    while (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, NULL, 0) >= utf16buf_len[idx]) {
272
        /*
273
         * Double the buffer's size if it's not big enough.
274
         * The size of the buffer starts at 128, so doubling its size
275
         * adds at least another 128 bytes, which is more than enough
276
         * for one more character plus a terminating '\0'.
277
         */
278
        utf16buf_len[idx] *= 2;
279
        utf16buf[idx] = g_realloc(utf16buf[idx], utf16buf_len[idx] * sizeof(wchar_t));
280
    }
281
282
    if (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, utf16buf[idx], utf16buf_len[idx]) == 0)
283
        return NULL;
284
285
    return utf16buf[idx];
286
}
287
288
void
289
utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...)
290
{
291
    va_list ap;
292
    char* dst;
293
294
    va_start(ap,fmt);
295
    dst = ws_strdup_vprintf(fmt, ap);
296
    va_end(ap);
297
298
    StringCchPrintf(utf16buf, utf16buf_len, _T("%s"), utf_8to16(dst));
299
300
    g_free(dst);
301
}
302
303
/* Convert from UTF-16 to UTF-8. */
304
char *
305
utf_16to8(const wchar_t *utf16str)
306
{
307
    static char *utf8buf[3];
308
    static int utf8buf_len[3];
309
    static int idx;
310
311
    if (utf16str == NULL)
312
        return NULL;
313
314
    idx = (idx + 1) % 3;
315
316
    /*
317
     * Allocate the buffer if it's not already allocated.
318
    */
319
    if (utf8buf[idx] == NULL) {
320
        utf8buf_len[idx] = INITIAL_UTFBUF_SIZE;
321
        utf8buf[idx] = g_malloc(utf8buf_len[idx]);
322
    }
323
324
    while (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, NULL, 0, NULL, NULL) >= utf8buf_len[idx]) {
325
        /*
326
         * Double the buffer's size if it's not big enough.
327
         * The size of the buffer starts at 128, so doubling its size
328
         * adds at least another 128 bytes, which is more than enough
329
         * for one more character plus a terminating '\0'.
330
         */
331
        utf8buf_len[idx] *= 2;
332
        utf8buf[idx] = g_realloc(utf8buf[idx], utf8buf_len[idx]);
333
    }
334
335
    if (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, utf8buf[idx], utf8buf_len[idx], NULL, NULL) == 0)
336
        return NULL;
337
338
    return utf8buf[idx];
339
}
340
341
/*
 * Convert an argument list from UTF-16 to UTF-8.
 *
 * Allocates a new array of argc + 1 string pointers with g_malloc(),
 * fills the first argc slots with the g_utf16_to_utf8() conversion of the
 * corresponding wc_argv entry, and stores NULL in the final slot.
 */
char **
arg_list_utf_16to8(int argc, wchar_t *wc_argv[]) {
    char **utf8_args = (char **)g_malloc((argc + 1) * sizeof *utf8_args);
    int n = 0;

    while (n < argc) {
        utf8_args[n] = g_utf16_to_utf8(wc_argv[n], -1, NULL, NULL, NULL);
        n++;
    }
    utf8_args[argc] = NULL;

    return utf8_args;
}
354
355
#endif
356
357
/*
358
 * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
359
 *
360
 * Local variables:
361
 * c-basic-offset: 4
362
 * tab-width: 8
363
 * indent-tabs-mode: nil
364
 * End:
365
 *
366
 * vi: set shiftwidth=4 tabstop=8 expandtab:
367
 * :indentSize=4:tabSize=8:noTabs=true:
368
 */