/src/wireshark/wsutil/unicode-utils.c

Source (jump to first uncovered line)
/* unicode-utils.c
 * Unicode utility routines
 *
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 2006 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "config.h"

#include "unicode-utils.h"

/*
 * Lookup table indexed by the first byte of a UTF-8 sequence, giving the
 * total length in bytes (1 to 4) of the sequence that byte introduces.
 *
 * An entry of 0 marks a byte that cannot begin a well-formed sequence:
 * 0x80...0xbf (continuation bytes), 0xc0 and 0xc1 (these could only
 * introduce overlong two-byte forms), and 0xf5...0xff.
 *
 * The table only classifies the first byte; it says nothing about whether
 * the bytes that follow are valid continuation bytes.
 */
const int ws_utf8_seqlen[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x00...0x0f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x10...0x1f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x20...0x2f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x30...0x3f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x40...0x4f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x50...0x5f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x60...0x6f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x70...0x7f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x80...0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x90...0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0xa0...0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0xb0...0xbf */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xc0...0xcf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xd0...0xdf */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* 0xe0...0xef */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,  /* 0xf0...0xff */
};

/* Scan a byte buffer for a leading run of well-formed UTF-8.
 *
 * start:  first byte to examine.
 * length: number of bytes available at start; nothing is read when it
 *         is zero or negative.
 * end:    set to the byte immediately past the region that was checked.
 *
 * Returns the number of bytes, counted from start, that form complete
 * well-formed characters.  The scan stops at the first ill-formed
 * sequence or at the end of the buffer, whichever comes first.
 *
 * If *end lies beyond start + (return value), the bytes in between are
 * one ill-formed "maximal subpart" and are meant to be replaced by a
 * single REPLACEMENT CHARACTER: either one byte that cannot start a
 * character, or a lead byte together with the continuation bytes that
 * were acceptable before the sequence broke off or the buffer ran out.
 *
 * Unlike GLib's g_utf8_validate_len(), a NUL byte is accepted as valid,
 * and a truncated 3 or 4 byte character is reported as one unit instead
 * of byte by byte.
 */
static inline size_t
utf_8_validate(const uint8_t *start, ssize_t length, const uint8_t **end)
{
    const uint8_t *cur = start;
    size_t good = 0;

    while (length > 0) {
        const uint8_t lead = *cur;
        /* Allowed range for the byte right after the lead byte; only a
         * few lead bytes narrow it (see below). */
        uint8_t second_lo = 0x80;
        uint8_t second_hi = 0xbf;
        size_t seq_len;
        size_t idx;

        if (lead < 0x80) {
            /* Single-byte character, NUL included. */
            cur++;
            length--;
            good++;
            continue;
        }

        if (lead < 0xc2 || lead > 0xf4) {
            /* Not a usable lead byte: the ill-formed unit is this one
             * byte, so step over it and report. */
            *end = cur + 1;
            return good;
        }

        if (lead < 0xe0) {          /* 110xxxxx, 2 byte char */
            seq_len = 2;
        } else if (lead < 0xf0) {   /* 1110xxxx, 3 byte char */
            seq_len = 3;
            if (lead == 0xe0) {
                second_lo = 0xa0;   /* below this would be overlong */
            } else if (lead == 0xed) {
                second_hi = 0x9f;   /* above this is a UTF-16 surrogate */
            }
        } else {                    /* 11110xxx, 4 byte char, lead <= 0xf4 */
            seq_len = 4;
            if (lead == 0xf0) {
                second_lo = 0x90;   /* below this would be overlong */
            } else if (lead == 0xf4) {
                second_hi = 0x8f;   /* above this exceeds U+10FFFF */
            }
        }

        /* Walk the continuation bytes.  On any failure *end is left at
         * the offending position, which is NOT consumed, so the caller
         * resumes validation from that byte. */
        for (idx = 1; idx < seq_len; idx++) {
            const uint8_t lo = (idx == 1) ? second_lo : 0x80;
            const uint8_t hi = (idx == 1) ? second_hi : 0xbf;

            cur++;
            length--;
            /* The length test comes first so *cur is never read past
             * the end of the buffer. */
            if (length < 1 || *cur < lo || *cur > hi) {
                *end = cur;
                return good;
            }
        }

        /* Whole character accepted; step past its last byte. */
        cur++;
        length--;
        good += seq_len;
    }

    *end = cur;
    return good;
}

/*
 * Build a well-formed UTF-8 copy of a byte string.
 *
 * The length bytes at ptr are treated as UTF-8 and copied into a new
 * wmem_strbuf_t created in the given wmem scope, with every ill-formed
 * sequence replaced by the Unicode REPLACEMENT CHARACTER, following the
 * recommended "best practices" given in the Unicode Standard and
 * specified by W3C/WHATWG.
 *
 * In conformance with the Unicode Standard, three byte sequences that
 * correspond to UTF-16 surrogate halves (paired or unpaired) and two byte
 * overlong encodings of 7-bit ASCII characters are treated as invalid and
 * get a REPLACEMENT CHARACTER. Explicit support for nonstandard derivative
 * encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could be
 * added later.
 *
 * Differences from g_utf8_make_valid():
 *  - internal NUL bytes are kept as they are rather than being treated as
 *    invalid and replaced;
 *  - maximal subparts are replaced as a unit: 2 or 3 bytes that are a
 *    truncated version of a valid 3 or 4 byte character (where the next
 *    byte does not continue the character) yield a single REPLACEMENT
 *    CHARACTER, whereas the GLib function replaces each byte of the
 *    sequence with its own (3 octet) REPLACEMENT CHARACTER.
 *
 * XXX: length should probably be a size_t rather than a signed type in
 * all these encoding functions.
 * XXX: the output can differ in length from the input and can contain
 * internal NULs (so strlen doesn't give its length). As with the other
 * encoding functions, we should also return an indication of whether
 * there was an invalid character (i.e. REPLACEMENT CHARACTER was used.)
 *
 * NOTE(review): a length below -1 makes the length+1 size hint negative;
 * how wmem_strbuf_new_sized() treats that is not visible here - confirm.
 */
wmem_strbuf_t *
ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
{
    wmem_strbuf_t *out = wmem_strbuf_new_sized(scope, length+1);

    /* See the Unicode Standard conformance chapter at
     * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
     * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
     * U+FFFD Substitution of Maximal Subparts. */

    while (length > 0) {
        const uint8_t *chunk = ptr;
        /* Advances ptr past everything it examined: a well-formed run,
         * optionally followed by one ill-formed unit. */
        const size_t good = utf_8_validate(chunk, length, &ptr);

        if (good != 0) {
            wmem_strbuf_append_len(out, chunk, good);
        }
        /* Bytes consumed beyond the well-formed run are a single
         * ill-formed unit, which gets exactly one replacement. */
        if (chunk + good != ptr) {
            wmem_strbuf_append_unichar_repl(out);
        }
        length -= ptr - chunk;
    }

    return out;
}

/*
 * Convenience wrapper around ws_utf8_make_valid_strbuf(): sanitizes the
 * length bytes at ptr in the same way and returns the buffer produced by
 * wmem_strbuf_finalize() on the result instead of the strbuf itself.
 *
 * The output may contain internal NULs and may differ in length from the
 * input; this interface does not report the output length.
 */
uint8_t *
ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
{
    return wmem_strbuf_finalize(ws_utf8_make_valid_strbuf(scope, ptr, length));
}

#ifdef _WIN32

#include <strsafe.h>

/** @file
 * Unicode utilities (internal interface)
 *
 * We define UNICODE and _UNICODE under Windows.  This means that
 * Windows SDK routines expect UTF-16 strings, in contrast to newer
 * versions of Glib and GTK+ which expect UTF-8.  This module provides
 * convenience routines for converting between UTF-8 and UTF-16.
 */

#define INITIAL_UTFBUF_SIZE 128

/*
 * XXX - Should we use g_utf8_to_utf16() and g_utf16_to_utf8()
 * instead?  The goal of the functions below was to provide simple
 * wrappers for UTF-8 <-> UTF-16 conversion without making the
 * caller worry about freeing up memory afterward.
 */

/* Convert from UTF-8 to UTF-16.
 *
 * utf8str: NUL-terminated UTF-8 string, or NULL.
 *
 * Returns a pointer to the converted, NUL-terminated UTF-16 string, or
 * NULL if utf8str is NULL or the conversion fails.
 *
 * The result is stored in one of three static buffers that are used in
 * rotation, so the caller must not free it.  A returned pointer stays
 * usable across the next two calls with a non-NULL argument; the third
 * such call reuses (and may reallocate) its buffer.  The rotation state
 * is plain static data with no locking in this function.
 */
const wchar_t *
utf_8to16(const char *utf8str)
{
    static wchar_t *utf16buf[3];
    static int utf16buf_len[3];   /* capacity of each buffer, in wchar_t units */
    static int idx;
    int needed;

    if (utf8str == NULL)
        return NULL;

    idx = (idx + 1) % 3;

    /*
     * Allocate the buffer if it's not already allocated.
     */
    if (utf16buf[idx] == NULL) {
        utf16buf_len[idx] = INITIAL_UTFBUF_SIZE;
        utf16buf[idx] = g_malloc(utf16buf_len[idx] * sizeof(wchar_t));
    }

    /*
     * Ask once how many wchar_t units (terminating NUL included) the
     * result needs.  The answer depends only on the input, so there is
     * no reason to rescan the string on every growth step.
     */
    needed = MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, NULL, 0);
    if (needed == 0)
        return NULL;    /* conversion error */

    if (needed >= utf16buf_len[idx]) {
        /*
         * Keep doubling the capacity until the result fits, then
         * reallocate a single time.
         */
        while (needed >= utf16buf_len[idx])
            utf16buf_len[idx] *= 2;
        utf16buf[idx] = g_realloc(utf16buf[idx], utf16buf_len[idx] * sizeof(wchar_t));
    }

    if (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, utf16buf[idx], utf16buf_len[idx]) == 0)
        return NULL;

    return utf16buf[idx];
}

/*
 * printf-style formatting into a caller-supplied TCHAR buffer.
 *
 * fmt and its arguments are first expanded into a temporary UTF-8 string
 * with ws_strdup_vprintf(); that string is converted with utf_8to16() and
 * written to utf16buf through StringCchPrintf(), with utf16buf_len passed
 * as the destination size.  The temporary string is freed before
 * returning.  Nothing is reported back to the caller.
 */
void
utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...)
{
    va_list args;
    char *formatted;
    const wchar_t *wide;

    va_start(args, fmt);
    formatted = ws_strdup_vprintf(fmt, args);
    va_end(args);

    /* utf_8to16() hands back one of its static buffers; do not free it. */
    wide = utf_8to16(formatted);
    StringCchPrintf(utf16buf, utf16buf_len, _T("%s"), wide);

    g_free(formatted);
}

/* Convert from UTF-16 to UTF-8.
 *
 * utf16str: NUL-terminated UTF-16 string, or NULL.
 *
 * Returns a pointer to the converted, NUL-terminated UTF-8 string, or
 * NULL if utf16str is NULL or the conversion fails.
 *
 * The result is stored in one of three static buffers that are used in
 * rotation, so the caller must not free it.  A returned pointer stays
 * usable across the next two calls with a non-NULL argument; the third
 * such call reuses (and may reallocate) its buffer.  The rotation state
 * is plain static data with no locking in this function.
 */
char *
utf_16to8(const wchar_t *utf16str)
{
    static char *utf8buf[3];
    static int utf8buf_len[3];    /* capacity of each buffer, in bytes */
    static int idx;
    int needed;

    if (utf16str == NULL)
        return NULL;

    idx = (idx + 1) % 3;

    /*
     * Allocate the buffer if it's not already allocated.
     */
    if (utf8buf[idx] == NULL) {
        utf8buf_len[idx] = INITIAL_UTFBUF_SIZE;
        utf8buf[idx] = g_malloc(utf8buf_len[idx]);
    }

    /*
     * Ask once how many bytes (terminating NUL included) the result
     * needs.  The answer depends only on the input, so there is no
     * reason to rescan the string on every growth step.
     */
    needed = WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, NULL, 0, NULL, NULL);
    if (needed == 0)
        return NULL;    /* conversion error */

    if (needed >= utf8buf_len[idx]) {
        /*
         * Keep doubling the capacity until the result fits, then
         * reallocate a single time.
         */
        while (needed >= utf8buf_len[idx])
            utf8buf_len[idx] *= 2;
        utf8buf[idx] = g_realloc(utf8buf[idx], utf8buf_len[idx]);
    }

    if (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, utf8buf[idx], utf8buf_len[idx], NULL, NULL) == 0)
        return NULL;

    return utf8buf[idx];
}

/* Convert our argument list from UTF-16 to UTF-8.
 *
 * argc:    number of entries in wc_argv.
 * wc_argv: the UTF-16 argument strings.
 *
 * Returns a g_malloc()ed array of argc + 1 pointers: one
 * g_utf16_to_utf8() result per argument, in the same order, followed by
 * a NULL terminator.  Conversion results are stored unchecked.
 */
char **
arg_list_utf_16to8(int argc, wchar_t *wc_argv[]) {
    char **utf8_args = (char **)g_malloc((argc + 1) * sizeof *utf8_args);
    int pos = 0;

    while (pos < argc) {
        utf8_args[pos] = g_utf16_to_utf8(wc_argv[pos], -1, NULL, NULL, NULL);
        pos++;
    }
    utf8_args[argc] = NULL;

    return utf8_args;
}

#endif

/*
 * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 *
 * Local variables:
 * c-basic-offset: 4
 * tab-width: 8
 * indent-tabs-mode: nil
 * End:
 *
 * vi: set shiftwidth=4 tabstop=8 expandtab:
 * :indentSize=4:tabSize=8:noTabs=true:
 */

Coverage Report

Created: 2025-02-15 06:25

Line	Count	Source (jump to first uncovered line)
1		/* unicode-utils.c
2		* Unicode utility routines
3		*
4		* Wireshark - Network traffic analyzer
5		* By Gerald Combs <gerald@wireshark.org>
6		* Copyright 2006 Gerald Combs
7		*
8		* SPDX-License-Identifier: GPL-2.0-or-later
9		*/
10
11		#include "config.h"
12
13		#include "unicode-utils.h"
14
15		const int ws_utf8_seqlen[256] = {
16		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00...0x0f */
17		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10...0x1f */
18		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20...0x2f */
19		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30...0x3f */
20		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40...0x4f */
21		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50...0x5f */
22		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60...0x6f */
23		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70...0x7f */
24		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80...0x8f */
25		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90...0x9f */
26		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0...0xaf */
27		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0...0xbf */
28		0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xc0...0xcf */
29		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xd0...0xdf */
30		3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xe0...0xef */
31		4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xf0...0xff */
32		};
33
34		/* Given a pointer and a length, validates a string of bytes as UTF-8.
35		* Returns the number of valid bytes, and a pointer immediately past
36		* the checked region.
37		*
38		* Differs from Glib's g_utf8_validate_len in that null bytes are
39		* considered valid UTF-8, and that maximal subparts are replaced as
40		* a unit. (I.e., given a sequence of 2 or 3 bytes which are a
41		* truncated version of a 3 or 4 byte UTF-8 character, but the next
42		* byte does not continue the character, the set of 2 or 3 bytes
43		* are replaced with one REPLACMENT CHARACTER.)
44		*/
45		static inline size_t
46		utf_8_validate(const uint8_t start, ssize_t length, const uint8_t *end)
47	348k	{
48	348k	const uint8_t *ptr = start;
49	348k	uint8_t ch;
50	348k	size_t unichar_len, valid_bytes = 0;
51
52	1.12M	while (length > 0) {
53
54	1.09M	ch = *ptr;
55
56	1.09M	if (ch < 0x80) {
57	768k	valid_bytes++;
58	768k	ptr++;
59	768k	length--;
60	768k	continue;
61	768k	}
62
63	329k	ch = *ptr;
64
65	329k	if (ch < 0xc2 \|\| ch > 0xf4) {
66	220k	ptr++;
67	220k	length--;
68	220k	*end = ptr;
69	220k	return valid_bytes;
70	220k	}
71
72	109k	if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
73	55.2k	unichar_len = 2;
74	55.2k	} else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
75	36.8k	unichar_len = 3;
76	36.8k	ptr++;
77	36.8k	length--;
78	36.8k	if (length < 1) {
79	876	*end = ptr;
80	876	return valid_bytes;
81	876	}
82	36.0k	switch (ch) {
83	3.87k	case 0xe0:
84	3.87k	if (ptr < 0xa0 \|\| ptr > 0xbf) {
85	3.38k	*end = ptr;
86	3.38k	return valid_bytes;
87	3.38k	}
88	493	break;
89	5.45k	case 0xed:
90	5.45k	if (ptr < 0x80 \|\| ptr > 0x9f) {
91	4.88k	*end = ptr;
92	4.88k	return valid_bytes;
93	4.88k	}
94	569	break;
95	26.6k	default:
96	26.6k	if (ptr < 0x80 \|\| ptr > 0xbf) {
97	23.9k	*end = ptr;
98	23.9k	return valid_bytes;
99	23.9k	}
100	36.0k	}
101	36.0k	} else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
102	17.5k	unichar_len = 4;
103	17.5k	ptr++;
104	17.5k	length--;
105	17.5k	if (length < 1) {
106	346	*end = ptr;
107	346	return valid_bytes;
108	346	}
109	17.1k	switch (ch) {
110	5.27k	case 0xf0:
111	5.27k	if (ptr < 0x90 \|\| ptr > 0xbf) {
112	4.64k	*end = ptr;
113	4.64k	return valid_bytes;
114	4.64k	}
115	627	break;
116	3.87k	case 0xf4:
117	3.87k	if (ptr < 0x80 \|\| ptr > 0x8f) {
118	3.13k	*end = ptr;
119	3.13k	return valid_bytes;
120	3.13k	}
121	747	break;
122	8.03k	default:
123	8.03k	if (ptr < 0x80 \|\| ptr > 0xbf) {
124	4.16k	*end = ptr;
125	4.16k	return valid_bytes;
126	4.16k	}
127	17.1k	}
128	5.24k	ptr++;
129	5.24k	length--;
130	5.24k	if (length < 1) {
131	101	*end = ptr;
132	101	return valid_bytes;
133	101	}
134	5.14k	if (ptr < 0x80 \|\| ptr > 0xbf) {
135	2.62k	*end = ptr;
136	2.62k	return valid_bytes;
137	2.62k	}
138	5.14k	}
139
140	61.5k	ptr++;
141	61.5k	length--;
142	61.5k	if (length < 1) {
143	728	*end = ptr;
144	728	return valid_bytes;
145	728	}
146	60.8k	if (ptr < 0x80 \|\| ptr > 0xbf) {
147	54.1k	*end = ptr;
148	54.1k	return valid_bytes;
149	54.1k	} else {
150	6.66k	ptr++;
151	6.66k	length--;
152	6.66k	valid_bytes += unichar_len;
153	6.66k	}
154
155	60.8k	}
156	25.6k	*end = ptr;
157	25.6k	return valid_bytes;
158	348k	}
159
160		/*
161		* Given a wmem scope, a pointer, and a length, treat the string of bytes
162		* referred to by the pointer and length as a UTF-8 string, and return a
163		* pointer to a UTF-8 string, allocated using the wmem scope, with all
164		* ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
165		* according to the recommended "best practices" given in the Unicode
166		* Standard and specified by W3C/WHATWG.
167		*
168		* Note that in conformance with the Unicode Standard, this treats three
169		* byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
170		* and two byte overlong encodings of 7-bit ASCII characters as invalid and
171		* substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
172		* derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
173		* be added later.
174		*
175		* Compared with g_utf8_make_valid(), this function does not consider
176		* internal NUL bytes as invalid and replace them with replacment characters.
177		* It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3
178		* bytes which are a truncated version of a valid 3 or 4 byte character (but
179		* the next byte does not continue the character) are replaced with a single
180		* REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the
181		* sequence with its own (3 octet) REPLACEMENT CHARACTER.
182		*
183		* XXX: length should probably be a size_t instead of a int in all
184		* these encoding functions
185		* XXX: the buffer returned can be of different length than the input,
186		* and can have internal NULs as well (so that strlen doesn't give its
187		* length). As with the other encoding functions, we should return the
188		* length of the output buffer (or a wmem_strbuf_t directly) and an
189		* indication of whether there was an invalid character (i.e.
190		* REPLACEMENT CHARACTER was used.)
191		*/
192		wmem_strbuf_t *
193		ws_utf8_make_valid_strbuf(wmem_allocator_t scope, const uint8_t ptr, ssize_t length)
194	36.9k	{
195	36.9k	wmem_strbuf_t *str;
196
197	36.9k	str = wmem_strbuf_new_sized(scope, length+1);
198
199		/* See the Unicode Standard conformance chapter at
200		* https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
201		* Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
202		* U+FFFD Substitution of Maximal Subparts. */
203
204	385k	while (length > 0) {
205	348k	const uint8_t *prev = ptr;
206	348k	size_t valid_bytes = utf_8_validate(prev, length, &ptr);
207
208	348k	if (valid_bytes) {
209	116k	wmem_strbuf_append_len(str, prev, valid_bytes);
210	116k	}
211	348k	length -= ptr - prev;
212	348k	prev += valid_bytes;
213	348k	if (ptr - prev) {
214	323k	wmem_strbuf_append_unichar_repl(str);
215	323k	}
216	348k	}
217
218	36.9k	return str;
219	36.9k	}
220
221		uint8_t *
222		ws_utf8_make_valid(wmem_allocator_t scope, const uint8_t ptr, ssize_t length)
223	36.9k	{
224	36.9k	wmem_strbuf_t *str = ws_utf8_make_valid_strbuf(scope, ptr, length);
225	36.9k	return wmem_strbuf_finalize(str);
226	36.9k	}
227
228		#ifdef _WIN32
229
230		#include <strsafe.h>
231
232		/** @file
233		* Unicode utilities (internal interface)
234		*
235		* We define UNICODE and _UNICODE under Windows. This means that
236		* Windows SDK routines expect UTF-16 strings, in contrast to newer
237		* versions of Glib and GTK+ which expect UTF-8. This module provides
238		* convenience routines for converting between UTF-8 and UTF-16.
239		*/
240
241		#define INITIAL_UTFBUF_SIZE 128
242
243		/*
244		* XXX - Should we use g_utf8_to_utf16() and g_utf16_to_utf8()
245		* instead? The goal of the functions below was to provide simple
246		* wrappers for UTF-8 <-> UTF-16 conversion without making the
247		* caller worry about freeing up memory afterward.
248		*/
249
250		/* Convert from UTF-8 to UTF-16. */
251		const wchar_t *
252		utf_8to16(const char *utf8str)
253		{
254		static wchar_t *utf16buf[3];
255		static int utf16buf_len[3];
256		static int idx;
257
258		if (utf8str == NULL)
259		return NULL;
260
261		idx = (idx + 1) % 3;
262
263		/*
264		* Allocate the buffer if it's not already allocated.
265		*/
266		if (utf16buf[idx] == NULL) {
267		utf16buf_len[idx] = INITIAL_UTFBUF_SIZE;
268		utf16buf[idx] = g_malloc(utf16buf_len[idx] * sizeof(wchar_t));
269		}
270
271		while (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, NULL, 0) >= utf16buf_len[idx]) {
272		/*
273		* Double the buffer's size if it's not big enough.
274		* The size of the buffer starts at 128, so doubling its size
275		* adds at least another 128 bytes, which is more than enough
276		* for one more character plus a terminating '\0'.
277		*/
278		utf16buf_len[idx] *= 2;
279		utf16buf[idx] = g_realloc(utf16buf[idx], utf16buf_len[idx] * sizeof(wchar_t));
280		}
281
282		if (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, utf16buf[idx], utf16buf_len[idx]) == 0)
283		return NULL;
284
285		return utf16buf[idx];
286		}
287
288		void
289		utf_8to16_snprintf(TCHAR utf16buf, int utf16buf_len, const char fmt, ...)
290		{
291		va_list ap;
292		char* dst;
293
294		va_start(ap,fmt);
295		dst = ws_strdup_vprintf(fmt, ap);
296		va_end(ap);
297
298		StringCchPrintf(utf16buf, utf16buf_len, _T("%s"), utf_8to16(dst));
299
300		g_free(dst);
301		}
302
303		/* Convert from UTF-16 to UTF-8. */
304		char *
305		utf_16to8(const wchar_t *utf16str)
306		{
307		static char *utf8buf[3];
308		static int utf8buf_len[3];
309		static int idx;
310
311		if (utf16str == NULL)
312		return NULL;
313
314		idx = (idx + 1) % 3;
315
316		/*
317		* Allocate the buffer if it's not already allocated.
318		*/
319		if (utf8buf[idx] == NULL) {
320		utf8buf_len[idx] = INITIAL_UTFBUF_SIZE;
321		utf8buf[idx] = g_malloc(utf8buf_len[idx]);
322		}
323
324		while (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, NULL, 0, NULL, NULL) >= utf8buf_len[idx]) {
325		/*
326		* Double the buffer's size if it's not big enough.
327		* The size of the buffer starts at 128, so doubling its size
328		* adds at least another 128 bytes, which is more than enough
329		* for one more character plus a terminating '\0'.
330		*/
331		utf8buf_len[idx] *= 2;
332		utf8buf[idx] = g_realloc(utf8buf[idx], utf8buf_len[idx]);
333		}
334
335		if (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, utf8buf[idx], utf8buf_len[idx], NULL, NULL) == 0)
336		return NULL;
337
338		return utf8buf[idx];
339		}
340
341		/* Convert our argument list from UTF-16 to UTF-8. */
342		char **
343		arg_list_utf_16to8(int argc, wchar_t *wc_argv[]) {
344		char **argv;
345		int i;
346
347		argv = (char *)g_malloc((argc + 1) sizeof(char *));
348		for (i = 0; i < argc; i++) {
349		argv[i] = g_utf16_to_utf8(wc_argv[i], -1, NULL, NULL, NULL);
350		}
351		argv[argc] = NULL;
352		return argv;
353		}
354
355		#endif
356
357		/*
358		* Editor modelines - https://www.wireshark.org/tools/modelines.html
359		*
360		* Local variables:
361		* c-basic-offset: 4
362		* tab-width: 8
363		* indent-tabs-mode: nil
364		* End:
365		*
366		* vi: set shiftwidth=4 tabstop=8 expandtab:
367		* :indentSize=4:tabSize=8:noTabs=true:
368		*/