/src/dbus-broker/subprojects/libcutf8-1/src/c-utf8.c

Source
/*
 * UTF-8 Implementation
 *
 * For highlevel documentation of the API see the header file and the docbook
 * comments. This implementation is inspired in part by Rust's std::str.
 *
 * So far only validation helpers are implemented, as those seem the most
 * critical.
 */

#include <c-stdaux.h>
#include <stddef.h>
#include <stdint.h>
#include "c-utf8.h"

/*
 * Per-byte bit patterns replicated across a whole machine word. They are
 * spelled as 64-bit literals and cast to size_t, so on 32-bit machines the
 * upper half is simply truncated away.
 */
#define C_UTF8_ASCII_MASK ((size_t)UINT64_C(0x8080808080808080))
#define C_UTF8_ASCII_SUB ((size_t)UINT64_C(0x0101010101010101))

/*
 * Returns 1 if every byte of @word lies in the range 0x01..0x7F, i.e. the
 * word contains neither a NUL byte nor a byte with its MSB set. Returns 0
 * otherwise.
 */
static inline int c_utf8_word_is_ascii(size_t word) {
        /*
         * Subtracting 0x01 from each byte makes a 0x00 byte borrow and end
         * up with its MSB set. OR-ing the original word back in additionally
         * keeps the MSB of every byte that had it set to begin with.
         */
        const size_t borrowed = word - C_UTF8_ASCII_SUB;
        const size_t suspicious = (borrowed | word) & C_UTF8_ASCII_MASK;

        return !suspicious;
}

/**
 * c_utf8_verify_ascii() - verify that a string is ASCII encoded
 * @strp:               pointer to string to verify
 * @lenp:               pointer to length of string, or NULL
 *
 * Up to the first *@lenp bytes of the string pointed to by @strp are
 * verified to be ASCII encoded (bytes 0x01 through 0x7F). On return, @strp
 * is updated to point to the first non-ASCII byte or the first NUL byte of
 * the string (or just past the verified bytes if the length was exhausted),
 * and *@lenp is updated to the number of bytes remaining from there.
 *
 * If @lenp is NULL the string is scanned until the first NUL or non-ASCII
 * byte, without any upper bound on its length.
 */
_c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) {
        const char *str = *strp;
        size_t len = lenp ? *lenp : (size_t)-1;

        while (len > 0 && c_load_8(str, 0) < 128) {
                /*
                 * Test the alignment on a uintptr_t. Going through
                 * `unsigned long` truncates the address on platforms where
                 * long is narrower than a pointer, which would make the
                 * comparison fail and silently disable the fast path.
                 */
                if (((uintptr_t)str & (sizeof(size_t) - 1)) == 0) {
                        /*
                         * If the string is aligned to a word boundary, scan two
                         * words at a time for any NUL or non-ASCII characters.
                         *
                         * We do two words at a time to take advantage of the
                         * compiler being able to use SIMD instructions where
                         * available.
                         *
                         * The second word is only loaded if the first one is
                         * free of NUL and non-ASCII bytes.
                         */
                        while (len >= 2 * sizeof(size_t)) {
                                if (!c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, 0)) ||
                                    !c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, sizeof(size_t))))
                                        break;

                                str += 2 * sizeof(size_t);
                                len -= 2 * sizeof(size_t);
                        }

                        /*
                         * Find the actual end of the ASCII-portion of the
                         * string, one byte at a time. This also handles the
                         * tail that is too short for the word-wise scan.
                         */
                        while (len > 0 && c_load_8(str, 0) < 128) {
                                if (_c_unlikely_(c_load_8(str, 0) == 0x00))
                                        goto out;
                                ++str;
                                --len;
                        }
                } else {
                        /*
                         * The string was not aligned, scan one character at a time until
                         * it is.
                         */
                        if (_c_unlikely_(c_load_8(str, 0) == 0x00))
                                goto out;
                        ++str;
                        --len;
                }
        }

out:
        /* Report the stop position; the length only if the caller gave one. */
        *strp = str;
        if (lenp)
                *lenp = len;
}

/* True if @_x is a UTF-8 continuation byte, i.e. of the form 10xxxxxx. */
#define C_UTF8_CHAR_IS_TAIL(_x)         (((_x) & 0xC0) == 0x80)

/**
 * c_utf8_verify() - verify that a string is UTF-8 encoded
 * @strp:               pointer to string to verify
 * @lenp:               pointer to length of string, or NULL
 *
 * Verifies up to the first *@lenp bytes of the string pointed to by @strp
 * as UTF-8. On return, @strp points to the first byte that is not part of
 * a well-formed UTF-8 sequence or to the first NUL byte of the string, and
 * *@lenp holds the number of bytes remaining from that position.
 *
 * If @lenp is NULL the string is scanned until the first invalid or NUL
 * byte, without any upper bound on its length.
 */
_c_public_ void c_utf8_verify(const char **strp, size_t *lenp) {
        const char *cursor = *strp;
        size_t remaining = lenp ? *lenp : (size_t)-1;

        /* See Unicode 10.0.0, Chapter 3, Section D92 */

        while (remaining > 0) {
                /* Total length of the sequence announced by the lead byte. */
                size_t width = 0;
                /*
                 * Allowed range of the second byte. By default this is the
                 * plain continuation-byte range; a few lead bytes narrow it.
                 */
                unsigned int second_min = 0x80;
                unsigned int second_max = 0xBF;

                /* Classify the lead byte; validation happens below. */
                switch (c_load_8(cursor, 0)) {
                case 0x00:
                        goto out;
                case 0x01 ... 0x7F:
                        /*
                         * Hand a run of ASCII to the optimized ASCII scanner
                         * and re-examine whatever byte it stopped at.
                         */
                        c_utf8_verify_ascii(&cursor, &remaining);
                        continue;
                case 0xC2 ... 0xDF:
                        width = 2;
                        break;
                case 0xE0:
                        second_min = 0xA0;
                        width = 3;
                        break;
                case 0xE1 ... 0xEC:
                case 0xEE ... 0xEF:
                        width = 3;
                        break;
                case 0xED:
                        second_max = 0x9F;
                        width = 3;
                        break;
                case 0xF0:
                        second_min = 0x90;
                        width = 4;
                        break;
                case 0xF1 ... 0xF3:
                        width = 4;
                        break;
                case 0xF4:
                        second_max = 0x8F;
                        width = 4;
                        break;
                default:
                        goto out;
                }

                /* The whole sequence must fit into the remaining length. */
                if (_c_unlikely_(remaining < width))
                        goto out;

                /*
                 * Bytes are inspected strictly front to back, and the scan
                 * stops at the first mismatch, so no byte is loaded past an
                 * offending one.
                 */
                if (_c_unlikely_(c_load_8(cursor, 1) < second_min ||
                                 c_load_8(cursor, 1) > second_max))
                        goto out;

                for (size_t i = 2; i < width; ++i) {
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(cursor, i))))
                                goto out;
                }

                cursor += width;
                remaining -= width;
        }

out:
        *strp = cursor;
        if (lenp)
                *lenp = remaining;
}

Coverage Report

Created: 2026-01-17 06:15

Line	Count	Source
1		/*
2		* UTF-8 Implementation
3		*
4		* For highlevel documentation of the API see the header file and the docbook
5		* comments. This implementation is inspired in part by Rust's std::str.
6		*
7		* So far only validation helpers are implemented, as those seem the most
8		* critical.
9		*/
10
11		#include <c-stdaux.h>
12		#include <stddef.h>
13		#include <stdint.h>
14		#include "c-utf8.h"
15
16		/* The following constants are truncated on 32-bit machines */
17	168k	#define C_UTF8_ASCII_MASK ((size_t)UINT64_C(0x8080808080808080))
18	168k	#define C_UTF8_ASCII_SUB ((size_t)UINT64_C(0x0101010101010101))
19
20	168k	static inline int c_utf8_word_is_ascii(size_t word) {
21		/* True unless any byte is NULL or has the MSB set. */
22	168k	return ((((word - C_UTF8_ASCII_SUB) \| word) & C_UTF8_ASCII_MASK) == 0);
23	168k	}
24
25		/**
26		* c_utf8_verify_ascii() - verify that a string is ASCII encoded
27		* @strp: pointer to string to verify
28		* @lenp: pointer to length of string
29		*
30		* Up to the first @lenp bytes of the string pointed to by @strp is
31		* verified to be ASCII encoded, and @strp and @lenp are updated to
32		* point to the first non-ASCII character or the first NULL of the
33		* string, and the remaining number of bytes of the string,
34		* respectively.
35		*
36		* If @lenp is NULL the string is scanned until the first invalid
37		* byte, without any upper bound on its length.
38		*/
39	314k	_c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) {
40	314k	const char *str = *strp;
41	314k	size_t len = lenp ? *lenp : (size_t)-1;
42
43	1.09M	while (len > 0 && c_load_8(str, 0) < 128) {
44	778k	if ((void*)c_align_to((unsigned long)str, sizeof(size_t)) == str) {
45		/*
46		* If the string is aligned to a word boundary, scan two
47		* words at a time for any NULL or non-ASCII characters.
48		*
49		* We do two words at a time to take advantage of the
50		* compiler being able to use SIMD instructions where
51		* available.
52		*/
53	139k	while (len >= 2 * sizeof(size_t)) {
54	138k	if (!c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, 0)) \|\|
55	29.5k	!c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, sizeof(size_t))))
56	129k	break;
57
58	8.77k	str += 2 * sizeof(size_t);
59	8.77k	len -= 2 * sizeof(size_t);
60	8.77k	}
61
62
63		/*
64		* Find the actual end of the ASCII-portion of the string.
65		*/
66	646k	while (len > 0 && c_load_8(str, 0) < 128) {
67	515k	if (_c_unlikely_(c_load_8(str, 0) == 0x00))
68	35	goto out;
69	515k	++str;
70	515k	--len;
71	515k	}
72	648k	} else {
73		/*
74		* The string was not aligned, scan one character at a time until
75		* it is.
76		*/
77	648k	if (_c_unlikely_(c_load_8(str, 0) == 0x00))
78	16	goto out;
79	648k	++str;
80	648k	--len;
81	648k	}
82	778k	}
83
84	314k	out:
85	314k	*strp = str;
86	314k	if (lenp)
87	314k	*lenp = len;
88	314k	}
89
90		#define C_UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80)
91
92		/**
93		* c_utf8_verify() - verify that a string is UTF-8 encoded
94		* @strp: pointer to string to verify
95		* @lenp: pointer to length of string, or NULL
96		*
97		* Up to the first @lenp bytes of the string pointed to by @strp is
98		* verified to be UTF-8 encoded, and @strp and @lenp are updated to
99		* point to the first non-UTF-8 character or the first NULL of the
100		* string, and the remaining number of bytes of the string,
101		* respectively.
102		*
103		* If @lenp is NULL the string is scanned until the first invalid
104		* byte, without any upper bound on its length.
105		*/
106	5.54k	_c_public_ void c_utf8_verify(const char **strp, size_t *lenp) {
107	5.54k	const char *str = *strp;
108	5.54k	size_t len = lenp ? *lenp : (size_t)-1;
109
110		/* See Unicode 10.0.0, Chapter 3, Section D92 */
111
112	1.15M	while (len > 0) {
113	1.15M	switch (c_load_8(str, 0)) {
114	92	case 0x00:
115	92	goto out;
116	314k	case 0x01 ... 0x7F:
117		/*
118		* Special-case and optimize the ASCII case.
119		*/
120	314k	c_utf8_verify_ascii((const char **)&str, &len);
121
122	314k	break;
123	577k	case 0xC2 ... 0xDF:
124	577k	if (_c_unlikely_(len < 2))
125	69	goto out;
126	577k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
127	73	goto out;
128
129	576k	str += 2;
130	576k	len -= 2;
131
132	576k	break;
133	13.3k	case 0xE0:
134	13.3k	if (_c_unlikely_(len < 3))
135	5	goto out;
136	13.3k	if (_c_unlikely_(c_load_8(str, 1) < 0xA0 \|\| c_load_8(str, 1) > 0xBF))
137	15	goto out;
138	13.3k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
139	8	goto out;
140
141	13.3k	str += 3;
142	13.3k	len -= 3;
143
144	13.3k	break;
145	156k	case 0xE1 ... 0xEC:
146	156k	if (_c_unlikely_(len < 3))
147	30	goto out;
148	156k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
149	34	goto out;
150	156k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
151	24	goto out;
152
153	156k	str += 3;
154	156k	len -= 3;
155
156	156k	break;
157	14.2k	case 0xED:
158	14.2k	if (_c_unlikely_(len < 3))
159	7	goto out;
160	14.2k	if (_c_unlikely_(c_load_8(str, 1) < 0x80 \|\| c_load_8(str, 1) > 0x9F))
161	17	goto out;
162	14.2k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
163	9	goto out;
164
165	14.2k	str += 3;
166	14.2k	len -= 3;
167
168	14.2k	break;
169	7.68k	case 0xEE ... 0xEF:
170	7.68k	if (_c_unlikely_(len < 3))
171	6	goto out;
172	7.67k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
173	16	goto out;
174	7.65k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
175	11	goto out;
176
177	7.64k	str += 3;
178	7.64k	len -= 3;
179
180	7.64k	break;
181	8.03k	case 0xF0:
182	8.03k	if (_c_unlikely_(len < 4))
183	5	goto out;
184	8.03k	if (_c_unlikely_(c_load_8(str, 1) < 0x90 \|\| c_load_8(str, 1) > 0xBF))
185	20	goto out;
186	8.01k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
187	8	goto out;
188	8.00k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
189	10	goto out;
190
191	7.99k	str += 4;
192	7.99k	len -= 4;
193
194	7.99k	break;
195	56.2k	case 0xF1 ... 0xF3:
196	56.2k	if (_c_unlikely_(len < 4))
197	17	goto out;
198	56.2k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
199	11	goto out;
200	56.2k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
201	8	goto out;
202	56.2k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
203	16	goto out;
204
205	56.2k	str += 4;
206	56.2k	len -= 4;
207
208	56.2k	break;
209	2.38k	case 0xF4:
210	2.38k	if (_c_unlikely_(len < 4))
211	3	goto out;
212	2.38k	if (_c_unlikely_(c_load_8(str, 1) < 0x80 \|\| c_load_8(str, 1) > 0x8F))
213	22	goto out;
214	2.36k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
215	11	goto out;
216	2.35k	if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
217	11	goto out;
218
219	2.34k	str += 4;
220	2.34k	len -= 4;
221
222	2.34k	break;
223	87	default:
224	87	goto out;
225	1.15M	}
226	1.15M	}
227
228	5.54k	out:
229	5.54k	*strp = str;
230	5.54k	if (lenp)
231	5.54k	*lenp = len;
232	5.54k	}