/src/wireshark/wsutil/unicode-utils.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* unicode-utils.c |
2 | | * Unicode utility routines |
3 | | * |
4 | | * Wireshark - Network traffic analyzer |
5 | | * By Gerald Combs <gerald@wireshark.org> |
6 | | * Copyright 2006 Gerald Combs |
7 | | * |
8 | | * SPDX-License-Identifier: GPL-2.0-or-later |
9 | | */ |
10 | | |
11 | | #include "config.h" |
12 | | |
13 | | #include "unicode-utils.h" |
14 | | |
/*
 * Total length in bytes of a UTF-8 sequence, indexed by the value of its
 * first byte.  An entry of 0 marks a byte that cannot start a sequence:
 * the continuation bytes 0x80-0xbf, the lead bytes 0xc0 and 0xc1, and
 * everything from 0xf5 upward.
 */
const int ws_utf8_seqlen[256] = {
    /* 0x00-0x7f: one-byte sequences */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    /* 0x80-0xbf: never a first byte */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

    /* 0xc0-0xdf: 0xc0 and 0xc1 are rejected, the rest start two bytes */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    /* 0xe0-0xef: three-byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    /* 0xf0-0xff: 0xf0-0xf4 start four bytes, 0xf5-0xff are rejected */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
};
33 | | |
/*
 * Scan up to `length` bytes at `start` and stop at the first ill-formed
 * UTF-8 sequence or at the end of the input, whichever comes first.
 *
 * Returns the number of leading bytes that form well-formed UTF-8.
 * `*end` is set to the position just past the checked region: either the
 * end of the input, or the end of the first ill-formed maximal subpart.
 * The caller can therefore tell that something has to be replaced when
 * `*end - start` is larger than the return value.
 *
 * Differs from GLib's g_utf8_validate_len() in that NUL bytes count as
 * valid UTF-8, and in that a maximal subpart is consumed as a unit: when
 * 2 or 3 bytes are a truncated prefix of a 3 or 4 byte character and the
 * following byte does not continue it, those 2 or 3 bytes are skipped
 * together so that they yield one REPLACEMENT CHARACTER.
 *
 * A `length` of zero or less checks nothing: the result is 0 and `*end`
 * is `start`.
 */
static inline size_t
utf_8_validate(const uint8_t *start, ssize_t length, const uint8_t **end)
{
    const uint8_t *cur = start;
    ssize_t left = length;
    size_t good = 0;

    while (left > 0) {
        const uint8_t lead = *cur;
        /* Permitted range for the byte right after the lead byte; every
         * later trailing byte must be a plain 0x80..0xbf continuation. */
        uint8_t lo = 0x80;
        uint8_t hi = 0xbf;
        size_t seq_len;
        size_t pos;

        if (lead < 0x80) {
            /* 7-bit ASCII, including NUL. */
            good++;
            cur++;
            left--;
            continue;
        }

        /* From here on the lead byte is always consumed, valid or not. */
        cur++;
        left--;

        if (lead < 0xc2 || lead > 0xf4) {
            /* Stray continuation byte, 0xc0/0xc1, or 0xf5 and above. */
            break;
        }

        if (lead < 0xe0) {
            /* 110xxxxx, 2 byte char */
            seq_len = 2;
        } else if (lead < 0xf0) {
            /* 1110xxxx, 3 byte char */
            seq_len = 3;
            if (lead == 0xe0) {
                lo = 0xa0;      /* anything lower would be overlong */
            } else if (lead == 0xed) {
                hi = 0x9f;      /* anything higher is a UTF-16 surrogate */
            }
        } else {
            /* 11110xxx, 4 byte char; lead bytes above 0xf4 were rejected */
            seq_len = 4;
            if (lead == 0xf0) {
                lo = 0x90;      /* anything lower would be overlong */
            } else if (lead == 0xf4) {
                hi = 0x8f;      /* anything higher is beyond U+10FFFF */
            }
        }

        /* Walk the trailing bytes; stop in front of the first one that is
         * missing or out of range, leaving it for the next call. */
        for (pos = 1; pos < seq_len; pos++) {
            if (left < 1 || *cur < lo || *cur > hi) {
                break;
            }
            cur++;
            left--;
            lo = 0x80;
            hi = 0xbf;
        }

        if (pos < seq_len) {
            /* Truncated or ill-formed sequence. */
            break;
        }

        good += seq_len;
    }

    *end = cur;
    return good;
}
159 | | |
160 | | /* |
161 | | * Given a wmem scope, a pointer, and a length, treat the string of bytes |
162 | | * referred to by the pointer and length as a UTF-8 string, and return a |
163 | | * pointer to a UTF-8 string, allocated using the wmem scope, with all |
164 | | * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER |
165 | | * according to the recommended "best practices" given in the Unicode |
166 | | * Standard and specified by W3C/WHATWG. |
167 | | * |
168 | | * Note that in conformance with the Unicode Standard, this treats three |
169 | | * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired) |
170 | | * and two byte overlong encodings of 7-bit ASCII characters as invalid and |
171 | | * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard |
172 | | * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could |
173 | | * be added later. |
174 | | * |
175 | | * Compared with g_utf8_make_valid(), this function does not consider |
176 | | * internal NUL bytes as invalid and replace them with replacment characters. |
177 | | * It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3 |
178 | | * bytes which are a truncated version of a valid 3 or 4 byte character (but |
179 | | * the next byte does not continue the character) are replaced with a single |
180 | | * REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the |
181 | | * sequence with its own (3 octet) REPLACEMENT CHARACTER. |
182 | | * |
183 | | * XXX: length should probably be a size_t instead of a int in all |
184 | | * these encoding functions |
185 | | * XXX: the buffer returned can be of different length than the input, |
186 | | * and can have internal NULs as well (so that strlen doesn't give its |
187 | | * length). As with the other encoding functions, we should return the |
188 | | * length of the output buffer (or a wmem_strbuf_t directly) and an |
189 | | * indication of whether there was an invalid character (i.e. |
190 | | * REPLACEMENT CHARACTER was used.) |
191 | | */ |
192 | | wmem_strbuf_t * |
193 | | ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length) |
194 | 36.9k | { |
195 | 36.9k | wmem_strbuf_t *str; |
196 | | |
197 | 36.9k | str = wmem_strbuf_new_sized(scope, length+1); |
198 | | |
199 | | /* See the Unicode Standard conformance chapter at |
200 | | * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially |
201 | | * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and |
202 | | * U+FFFD Substitution of Maximal Subparts. */ |
203 | | |
204 | 385k | while (length > 0) { |
205 | 348k | const uint8_t *prev = ptr; |
206 | 348k | size_t valid_bytes = utf_8_validate(prev, length, &ptr); |
207 | | |
208 | 348k | if (valid_bytes) { |
209 | 116k | wmem_strbuf_append_len(str, prev, valid_bytes); |
210 | 116k | } |
211 | 348k | length -= ptr - prev; |
212 | 348k | prev += valid_bytes; |
213 | 348k | if (ptr - prev) { |
214 | 323k | wmem_strbuf_append_unichar_repl(str); |
215 | 323k | } |
216 | 348k | } |
217 | | |
218 | 36.9k | return str; |
219 | 36.9k | } |
220 | | |
221 | | uint8_t * |
222 | | ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length) |
223 | 36.9k | { |
224 | 36.9k | wmem_strbuf_t *str = ws_utf8_make_valid_strbuf(scope, ptr, length); |
225 | 36.9k | return wmem_strbuf_finalize(str); |
226 | 36.9k | } |
227 | | |
228 | | #ifdef _WIN32 |
229 | | |
230 | | #include <strsafe.h> |
231 | | |
232 | | /** @file |
233 | | * Unicode utilities (internal interface) |
234 | | * |
235 | | * We define UNICODE and _UNICODE under Windows. This means that |
236 | | * Windows SDK routines expect UTF-16 strings, in contrast to newer |
237 | | * versions of Glib and GTK+ which expect UTF-8. This module provides |
238 | | * convenience routines for converting between UTF-8 and UTF-16. |
239 | | */ |
240 | | |
241 | | #define INITIAL_UTFBUF_SIZE 128 |
242 | | |
243 | | /* |
244 | | * XXX - Should we use g_utf8_to_utf16() and g_utf16_to_utf8() |
245 | | * instead? The goal of the functions below was to provide simple |
246 | | * wrappers for UTF-8 <-> UTF-16 conversion without making the |
247 | | * caller worry about freeing up memory afterward. |
248 | | */ |
249 | | |
250 | | /* Convert from UTF-8 to UTF-16. */ |
251 | | const wchar_t * |
252 | | utf_8to16(const char *utf8str) |
253 | | { |
254 | | static wchar_t *utf16buf[3]; |
255 | | static int utf16buf_len[3]; |
256 | | static int idx; |
257 | | |
258 | | if (utf8str == NULL) |
259 | | return NULL; |
260 | | |
261 | | idx = (idx + 1) % 3; |
262 | | |
263 | | /* |
264 | | * Allocate the buffer if it's not already allocated. |
265 | | */ |
266 | | if (utf16buf[idx] == NULL) { |
267 | | utf16buf_len[idx] = INITIAL_UTFBUF_SIZE; |
268 | | utf16buf[idx] = g_malloc(utf16buf_len[idx] * sizeof(wchar_t)); |
269 | | } |
270 | | |
271 | | while (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, NULL, 0) >= utf16buf_len[idx]) { |
272 | | /* |
273 | | * Double the buffer's size if it's not big enough. |
274 | | * The size of the buffer starts at 128, so doubling its size |
275 | | * adds at least another 128 bytes, which is more than enough |
276 | | * for one more character plus a terminating '\0'. |
277 | | */ |
278 | | utf16buf_len[idx] *= 2; |
279 | | utf16buf[idx] = g_realloc(utf16buf[idx], utf16buf_len[idx] * sizeof(wchar_t)); |
280 | | } |
281 | | |
282 | | if (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, utf16buf[idx], utf16buf_len[idx]) == 0) |
283 | | return NULL; |
284 | | |
285 | | return utf16buf[idx]; |
286 | | } |
287 | | |
288 | | void |
289 | | utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...) |
290 | | { |
291 | | va_list ap; |
292 | | char* dst; |
293 | | |
294 | | va_start(ap,fmt); |
295 | | dst = ws_strdup_vprintf(fmt, ap); |
296 | | va_end(ap); |
297 | | |
298 | | StringCchPrintf(utf16buf, utf16buf_len, _T("%s"), utf_8to16(dst)); |
299 | | |
300 | | g_free(dst); |
301 | | } |
302 | | |
303 | | /* Convert from UTF-16 to UTF-8. */ |
304 | | char * |
305 | | utf_16to8(const wchar_t *utf16str) |
306 | | { |
307 | | static char *utf8buf[3]; |
308 | | static int utf8buf_len[3]; |
309 | | static int idx; |
310 | | |
311 | | if (utf16str == NULL) |
312 | | return NULL; |
313 | | |
314 | | idx = (idx + 1) % 3; |
315 | | |
316 | | /* |
317 | | * Allocate the buffer if it's not already allocated. |
318 | | */ |
319 | | if (utf8buf[idx] == NULL) { |
320 | | utf8buf_len[idx] = INITIAL_UTFBUF_SIZE; |
321 | | utf8buf[idx] = g_malloc(utf8buf_len[idx]); |
322 | | } |
323 | | |
324 | | while (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, NULL, 0, NULL, NULL) >= utf8buf_len[idx]) { |
325 | | /* |
326 | | * Double the buffer's size if it's not big enough. |
327 | | * The size of the buffer starts at 128, so doubling its size |
328 | | * adds at least another 128 bytes, which is more than enough |
329 | | * for one more character plus a terminating '\0'. |
330 | | */ |
331 | | utf8buf_len[idx] *= 2; |
332 | | utf8buf[idx] = g_realloc(utf8buf[idx], utf8buf_len[idx]); |
333 | | } |
334 | | |
335 | | if (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, utf8buf[idx], utf8buf_len[idx], NULL, NULL) == 0) |
336 | | return NULL; |
337 | | |
338 | | return utf8buf[idx]; |
339 | | } |
340 | | |
/*
 * Convert our argument list from UTF-16 to UTF-8.
 *
 * Returns a newly allocated array of argc + 1 pointers: one
 * g_utf16_to_utf8() result per input argument, followed by a NULL
 * terminator.
 */
char **
arg_list_utf_16to8(int argc, wchar_t *wc_argv[])
{
    /* One extra slot for the terminating NULL pointer. */
    char **utf8_args = g_malloc((argc + 1) * sizeof *utf8_args);
    int n = 0;

    while (n < argc) {
        utf8_args[n] = g_utf16_to_utf8(wc_argv[n], -1, NULL, NULL, NULL);
        n++;
    }
    utf8_args[argc] = NULL;

    return utf8_args;
}
354 | | |
355 | | #endif |
356 | | |
357 | | /* |
358 | | * Editor modelines - https://www.wireshark.org/tools/modelines.html |
359 | | * |
360 | | * Local variables: |
361 | | * c-basic-offset: 4 |
362 | | * tab-width: 8 |
363 | | * indent-tabs-mode: nil |
364 | | * End: |
365 | | * |
366 | | * vi: set shiftwidth=4 tabstop=8 expandtab: |
367 | | * :indentSize=4:tabSize=8:noTabs=true: |
368 | | */ |