/src/dbus-broker/subprojects/libcutf8-1/src/c-utf8.c
Line | Count | Source |
1 | | /* |
2 | | * UTF-8 Implementation |
3 | | * |
4 | | * For highlevel documentation of the API see the header file and the docbook |
5 | | * comments. This implementation is inspired in part by Rust's std::str. |
6 | | * |
7 | | * So far only validation helpers are implemented, as those seem the most |
8 | | * critical. |
9 | | */ |
10 | | |
11 | | #include <c-stdaux.h> |
12 | | #include <stddef.h> |
13 | | #include <stdint.h> |
14 | | #include "c-utf8.h" |
15 | | |
/*
 * Per-byte bit patterns replicated across one machine word. Both constants
 * are spelled as 64-bit values and get truncated by the cast on 32-bit
 * machines.
 */
#define C_UTF8_ASCII_MASK ((size_t)UINT64_C(0x8080808080808080))
#define C_UTF8_ASCII_SUB ((size_t)UINT64_C(0x0101010101010101))

/*
 * Check all bytes of @word at once. Returns 1 if every byte is a non-NUL
 * ASCII character (0x01..0x7F), and 0 if any byte is NUL or has its MSB set.
 *
 * Subtracting 0x01 from each byte makes a 0x00 byte wrap around and set its
 * MSB; OR-ing with the original word additionally keeps the MSB of every
 * byte that already had it set. If no MSB survives the mask, the word is
 * clean.
 */
static inline int c_utf8_word_is_ascii(size_t word) {
        const size_t borrowed = word - C_UTF8_ASCII_SUB;
        const size_t high_bits = (borrowed | word) & C_UTF8_ASCII_MASK;

        return !high_bits;
}
24 | | |
25 | | /** |
26 | | * c_utf8_verify_ascii() - verify that a string is ASCII encoded |
27 | | * @strp: pointer to string to verify |
28 | | * @lenp: pointer to length of string |
29 | | * |
30 | | * Up to the first @lenp bytes of the string pointed to by @strp is |
31 | | * verified to be ASCII encoded, and @strp and @lenp are updated to |
32 | | * point to the first non-ASCII character or the first NULL of the |
33 | | * string, and the remaining number of bytes of the string, |
34 | | * respectively. |
35 | | * |
36 | | * If @lenp is NULL the string is scanned until the first invalid |
37 | | * byte, without any upper bound on its length. |
38 | | */ |
39 | 314k | _c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) { |
40 | 314k | const char *str = *strp; |
41 | 314k | size_t len = lenp ? *lenp : (size_t)-1; |
42 | | |
43 | 1.09M | while (len > 0 && c_load_8(str, 0) < 128) { |
44 | 778k | if ((void*)c_align_to((unsigned long)str, sizeof(size_t)) == str) { |
45 | | /* |
46 | | * If the string is aligned to a word boundary, scan two |
47 | | * words at a time for any NULL or non-ASCII characters. |
48 | | * |
49 | | * We do two words at a time to take advantage of the |
50 | | * compiler being able to use SIMD instructions where |
51 | | * available. |
52 | | */ |
53 | 139k | while (len >= 2 * sizeof(size_t)) { |
54 | 138k | if (!c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, 0)) || |
55 | 29.5k | !c_utf8_word_is_ascii(c_load(size_t, le, aligned, str, sizeof(size_t)))) |
56 | 129k | break; |
57 | | |
58 | 8.77k | str += 2 * sizeof(size_t); |
59 | 8.77k | len -= 2 * sizeof(size_t); |
60 | 8.77k | } |
61 | | |
62 | | |
63 | | /* |
64 | | * Find the actual end of the ASCII-portion of the string. |
65 | | */ |
66 | 646k | while (len > 0 && c_load_8(str, 0) < 128) { |
67 | 515k | if (_c_unlikely_(c_load_8(str, 0) == 0x00)) |
68 | 35 | goto out; |
69 | 515k | ++str; |
70 | 515k | --len; |
71 | 515k | } |
72 | 648k | } else { |
73 | | /* |
74 | | * The string was not aligned, scan one character at a time until |
75 | | * it is. |
76 | | */ |
77 | 648k | if (_c_unlikely_(c_load_8(str, 0) == 0x00)) |
78 | 16 | goto out; |
79 | 648k | ++str; |
80 | 648k | --len; |
81 | 648k | } |
82 | 778k | } |
83 | | |
84 | 314k | out: |
85 | 314k | *strp = str; |
86 | 314k | if (lenp) |
87 | 314k | *lenp = len; |
88 | 314k | } |
89 | | |
90 | | #define C_UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80) |
91 | | |
92 | | /** |
93 | | * c_utf8_verify() - verify that a string is UTF-8 encoded |
94 | | * @strp: pointer to string to verify |
95 | | * @lenp: pointer to length of string, or NULL |
96 | | * |
97 | | * Up to the first @lenp bytes of the string pointed to by @strp is |
98 | | * verified to be UTF-8 encoded, and @strp and @lenp are updated to |
99 | | * point to the first non-UTF-8 character or the first NULL of the |
100 | | * string, and the remaining number of bytes of the string, |
101 | | * respectively. |
102 | | * |
103 | | * If @lenp is NULL the string is scanned until the first invalid |
104 | | * byte, without any upper bound on its length. |
105 | | */ |
106 | 5.54k | _c_public_ void c_utf8_verify(const char **strp, size_t *lenp) { |
107 | 5.54k | const char *str = *strp; |
108 | 5.54k | size_t len = lenp ? *lenp : (size_t)-1; |
109 | | |
110 | | /* See Unicode 10.0.0, Chapter 3, Section D92 */ |
111 | | |
112 | 1.15M | while (len > 0) { |
113 | 1.15M | switch (c_load_8(str, 0)) { |
114 | 92 | case 0x00: |
115 | 92 | goto out; |
116 | 314k | case 0x01 ... 0x7F: |
117 | | /* |
118 | | * Special-case and optimize the ASCII case. |
119 | | */ |
120 | 314k | c_utf8_verify_ascii((const char **)&str, &len); |
121 | | |
122 | 314k | break; |
123 | 577k | case 0xC2 ... 0xDF: |
124 | 577k | if (_c_unlikely_(len < 2)) |
125 | 69 | goto out; |
126 | 577k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1)))) |
127 | 73 | goto out; |
128 | | |
129 | 576k | str += 2; |
130 | 576k | len -= 2; |
131 | | |
132 | 576k | break; |
133 | 13.3k | case 0xE0: |
134 | 13.3k | if (_c_unlikely_(len < 3)) |
135 | 5 | goto out; |
136 | 13.3k | if (_c_unlikely_(c_load_8(str, 1) < 0xA0 || c_load_8(str, 1) > 0xBF)) |
137 | 15 | goto out; |
138 | 13.3k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
139 | 8 | goto out; |
140 | | |
141 | 13.3k | str += 3; |
142 | 13.3k | len -= 3; |
143 | | |
144 | 13.3k | break; |
145 | 156k | case 0xE1 ... 0xEC: |
146 | 156k | if (_c_unlikely_(len < 3)) |
147 | 30 | goto out; |
148 | 156k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1)))) |
149 | 34 | goto out; |
150 | 156k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
151 | 24 | goto out; |
152 | | |
153 | 156k | str += 3; |
154 | 156k | len -= 3; |
155 | | |
156 | 156k | break; |
157 | 14.2k | case 0xED: |
158 | 14.2k | if (_c_unlikely_(len < 3)) |
159 | 7 | goto out; |
160 | 14.2k | if (_c_unlikely_(c_load_8(str, 1) < 0x80 || c_load_8(str, 1) > 0x9F)) |
161 | 17 | goto out; |
162 | 14.2k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
163 | 9 | goto out; |
164 | | |
165 | 14.2k | str += 3; |
166 | 14.2k | len -= 3; |
167 | | |
168 | 14.2k | break; |
169 | 7.68k | case 0xEE ... 0xEF: |
170 | 7.68k | if (_c_unlikely_(len < 3)) |
171 | 6 | goto out; |
172 | 7.67k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1)))) |
173 | 16 | goto out; |
174 | 7.65k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
175 | 11 | goto out; |
176 | | |
177 | 7.64k | str += 3; |
178 | 7.64k | len -= 3; |
179 | | |
180 | 7.64k | break; |
181 | 8.03k | case 0xF0: |
182 | 8.03k | if (_c_unlikely_(len < 4)) |
183 | 5 | goto out; |
184 | 8.03k | if (_c_unlikely_(c_load_8(str, 1) < 0x90 || c_load_8(str, 1) > 0xBF)) |
185 | 20 | goto out; |
186 | 8.01k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
187 | 8 | goto out; |
188 | 8.00k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3)))) |
189 | 10 | goto out; |
190 | | |
191 | 7.99k | str += 4; |
192 | 7.99k | len -= 4; |
193 | | |
194 | 7.99k | break; |
195 | 56.2k | case 0xF1 ... 0xF3: |
196 | 56.2k | if (_c_unlikely_(len < 4)) |
197 | 17 | goto out; |
198 | 56.2k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 1)))) |
199 | 11 | goto out; |
200 | 56.2k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
201 | 8 | goto out; |
202 | 56.2k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3)))) |
203 | 16 | goto out; |
204 | | |
205 | 56.2k | str += 4; |
206 | 56.2k | len -= 4; |
207 | | |
208 | 56.2k | break; |
209 | 2.38k | case 0xF4: |
210 | 2.38k | if (_c_unlikely_(len < 4)) |
211 | 3 | goto out; |
212 | 2.38k | if (_c_unlikely_(c_load_8(str, 1) < 0x80 || c_load_8(str, 1) > 0x8F)) |
213 | 22 | goto out; |
214 | 2.36k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 2)))) |
215 | 11 | goto out; |
216 | 2.35k | if (_c_unlikely_(!C_UTF8_CHAR_IS_TAIL(c_load_8(str, 3)))) |
217 | 11 | goto out; |
218 | | |
219 | 2.34k | str += 4; |
220 | 2.34k | len -= 4; |
221 | | |
222 | 2.34k | break; |
223 | 87 | default: |
224 | 87 | goto out; |
225 | 1.15M | } |
226 | 1.15M | } |
227 | | |
228 | 5.54k | out: |
229 | 5.54k | *strp = str; |
230 | 5.54k | if (lenp) |
231 | 5.54k | *lenp = len; |
232 | 5.54k | } |