Coverage Report

Created: 2026-01-17 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dbus-broker/subprojects/libcutf8-1/src/c-utf8.c
Line
Count
Source
1
/*
2
 * UTF-8 Implementation
3
 *
4
 * For high-level documentation of the API see the header file and the docbook
5
 * comments. This implementation is inspired in part by Rust's std::str.
6
 *
7
 * So far only validation helpers are implemented, as those seem the most
8
 * critical.
9
 */
10
11
#include <c-stdaux.h>
12
#include <stddef.h>
13
#include <stdint.h>
14
#include "c-utf8.h"
15
16
/* The following constants are truncated on 32-bit machines */
17
168k
#define C_UTF8_ASCII_MASK ((size_t)UINT64_C(0x8080808080808080))
#define C_UTF8_ASCII_SUB ((size_t)UINT64_C(0x0101010101010101))

/*
 * Check a whole machine word for "plain" ASCII content in one go.
 *
 * Returns non-zero if every byte of @word lies in the range 0x01..0x7F, and
 * zero if at least one byte is 0x00 or has its most significant bit set.
 *
 * Subtracting 0x01 from every byte makes a zero byte wrap around and end up
 * with its top bit set. OR-ing the original word back in additionally keeps
 * the top bit of every byte that already had it set. A borrow can only spill
 * into a neighbouring byte if some byte was zero, in which case the result is
 * already "not ASCII", so no false positives are possible.
 */
static inline int c_utf8_word_is_ascii(size_t word) {
        const size_t borrowed = word - C_UTF8_ASCII_SUB;
        const size_t flagged = (borrowed | word) & C_UTF8_ASCII_MASK;

        return !flagged;
}
24
25
/**
26
 * c_utf8_verify_ascii() - verify that a string is ASCII encoded
27
 * @strp:               pointer to string to verify
28
 * @lenp:               pointer to length of string
29
 *
30
 * Up to the first @lenp bytes of the string pointed to by @strp is
31
 * verified to be ASCII encoded, and @strp and @lenp are updated to
32
 * point to the first non-ASCII character or the first NULL of the
33
 * string, and the remaining number of bytes of the string,
34
 * respectively.
35
 *
36
 * If @lenp is NULL the string is scanned until the first invalid
37
 * byte, without any upper bound on its length.
38
 */
39
314k
_c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) {
40
314k
        const char *str = *strp;
41
314k
        size_t len = lenp ? *lenp : (size_t)-1;
42
43
1.09M
        while (len > 0 && c_load_8(str, 0) < 128) {
44
778k
                if ((void*)c_align_to((unsigned long)str, sizeof(size_t)) == str) {
45
                        /*
46
                         * If the string is aligned to a word boundary, scan two
47
                         * words at a time for any NULL or non-ASCII characters.
48
                         *
49
                         * We do two words at a time to take advantage of the
50
                         * compiler being able to use SIMD instructions where
51
                         * available.
52
                         */
53
139k
                        while (len >= 2 * sizeof(size_t)) {
54
138k
                                if (!c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, 0)) ||
55
29.5k
                                    !c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, sizeof(size_t))))
56
129k
                                        break;
57
58
8.77k
                                str += 2 * sizeof(size_t);
59
8.77k
                                len -= 2 * sizeof(size_t);
60
8.77k
                        }
61
62
63
                        /*
64
                         * Find the actual end of the ASCII-portion of the string.
65
                         */
66
646k
                        while (len > 0 && c_load_8(str, 0) < 128) {
67
515k
                                if (_c_unlikely_(c_load_8(str, 0) == 0x00))
68
35
                                        goto out;
69
515k
                                ++str;
70
515k
                                --len;
71
515k
                        }
72
648k
                } else {
73
                        /*
74
                         * The string was not aligned, scan one character at a time until
75
                         * it is.
76
                         */
77
648k
                        if (_c_unlikely_(c_load_8(str, 0) == 0x00))
78
16
                                goto out;
79
648k
                        ++str;
80
648k
                        --len;
81
648k
                }
82
778k
        }
83
84
314k
out:
85
314k
        *strp = str;
86
314k
        if (lenp)
87
314k
                *lenp = len;
88
314k
}
89
90
#define C_UTF8_CHAR_IS_TAIL(_x)         (((_x) & 0xC0) == 0x80)
91
92
/**
93
 * c_utf8_verify() - verify that a string is UTF-8 encoded
94
 * @strp:               pointer to string to verify
95
 * @lenp:               pointer to length of string, or NULL
96
 *
97
 * Up to the first @lenp bytes of the string pointed to by @strp is
98
 * verified to be UTF-8 encoded, and @strp and @lenp are updated to
99
 * point to the first non-UTF-8 character or the first NULL of the
100
 * string, and the remaining number of bytes of the string,
101
 * respectively.
102
 *
103
 * If @lenp is NULL the string is scanned until the first invalid
104
 * byte, without any upper bound on its length.
105
 */
106
5.54k
_c_public_ void c_utf8_verify(const char **strp, size_t *lenp) {
107
5.54k
        const char *str = *strp;
108
5.54k
        size_t len = lenp ? *lenp : (size_t)-1;
109
110
        /* See Unicode 10.0.0, Chapter 3, Section D92 */
111
112
1.15M
        while (len > 0) {
113
1.15M
                switch (c_load_8(str, 0)) {
114
92
                case 0x00:
115
92
                        goto out;
116
314k
                case 0x01 ... 0x7F:
117
                        /*
118
                         * Special-case and optimize the ASCII case.
119
                         */
120
314k
                        c_utf8_verify_ascii((const char **)&str, &len);
121
122
314k
                        break;
123
577k
                case 0xC2 ... 0xDF:
124
577k
                        if (_c_unlikely_(len < 2))
125
69
                                goto out;
126
577k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
127
73
                                goto out;
128
129
576k
                        str += 2;
130
576k
                        len -= 2;
131
132
576k
                        break;
133
13.3k
                case 0xE0:
134
13.3k
                        if (_c_unlikely_(len < 3))
135
5
                                goto out;
136
13.3k
                        if (_c_unlikely_(c_load_8(str, 1) < 0xA0 || c_load_8(str, 1) > 0xBF))
137
15
                                goto out;
138
13.3k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
139
8
                                goto out;
140
141
13.3k
                        str += 3;
142
13.3k
                        len -= 3;
143
144
13.3k
                        break;
145
156k
                case 0xE1 ... 0xEC:
146
156k
                        if (_c_unlikely_(len < 3))
147
30
                                goto out;
148
156k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
149
34
                                goto out;
150
156k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
151
24
                                goto out;
152
153
156k
                        str += 3;
154
156k
                        len -= 3;
155
156
156k
                        break;
157
14.2k
                case 0xED:
158
14.2k
                        if (_c_unlikely_(len < 3))
159
7
                                goto out;
160
14.2k
                        if (_c_unlikely_(c_load_8(str, 1) < 0x80 || c_load_8(str, 1) > 0x9F))
161
17
                                goto out;
162
14.2k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
163
9
                                goto out;
164
165
14.2k
                        str += 3;
166
14.2k
                        len -= 3;
167
168
14.2k
                        break;
169
7.68k
                case 0xEE ... 0xEF:
170
7.68k
                        if (_c_unlikely_(len < 3))
171
6
                                goto out;
172
7.67k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
173
16
                                goto out;
174
7.65k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
175
11
                                goto out;
176
177
7.64k
                        str += 3;
178
7.64k
                        len -= 3;
179
180
7.64k
                        break;
181
8.03k
                case 0xF0:
182
8.03k
                        if (_c_unlikely_(len < 4))
183
5
                                goto out;
184
8.03k
                        if (_c_unlikely_(c_load_8(str, 1) < 0x90 || c_load_8(str, 1) > 0xBF))
185
20
                                goto out;
186
8.01k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
187
8
                                goto out;
188
8.00k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
189
10
                                goto out;
190
191
7.99k
                        str += 4;
192
7.99k
                        len -= 4;
193
194
7.99k
                        break;
195
56.2k
                case 0xF1 ... 0xF3:
196
56.2k
                        if (_c_unlikely_(len < 4))
197
17
                                goto out;
198
56.2k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1))))
199
11
                                goto out;
200
56.2k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
201
8
                                goto out;
202
56.2k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
203
16
                                goto out;
204
205
56.2k
                        str += 4;
206
56.2k
                        len -= 4;
207
208
56.2k
                        break;
209
2.38k
                case 0xF4:
210
2.38k
                        if (_c_unlikely_(len < 4))
211
3
                                goto out;
212
2.38k
                        if (_c_unlikely_(c_load_8(str, 1) < 0x80 || c_load_8(str, 1) > 0x8F))
213
22
                                goto out;
214
2.36k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2))))
215
11
                                goto out;
216
2.35k
                        if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3))))
217
11
                                goto out;
218
219
2.34k
                        str += 4;
220
2.34k
                        len -= 4;
221
222
2.34k
                        break;
223
87
                default:
224
87
                        goto out;
225
1.15M
                }
226
1.15M
        }
227
228
5.54k
out:
229
5.54k
        *strp = str;
230
5.54k
        if (lenp)
231
5.54k
                *lenp = len;
232
5.54k
}