/src/pcre2/src/pcre2_valid_utf.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2020 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | /* This module contains an internal function for validating UTF character |
43 | | strings. This file is also #included by the pcre2test program, which uses |
44 | | macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes |
45 | | with the library. In this case, PCRE2_PCRE2TEST is defined. */ |
46 | | |
47 | | |
48 | | #ifndef PCRE2_PCRE2TEST /* We're compiling the library */ |
49 | | #include "pcre2_internal.h" |
50 | | #endif /* PCRE2_PCRE2TEST */ |
51 | | |
52 | | |
53 | | |
54 | | #ifndef SUPPORT_UNICODE |
55 | | /************************************************* |
56 | | * Dummy function when Unicode is not supported * |
57 | | *************************************************/ |
58 | | |
59 | | /* This function should never be called when Unicode is not supported. */ |
60 | | |
61 | | int |
62 | | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
63 | | { |
64 | | (void)string; |
65 | | (void)length; |
66 | | (void)erroroffset; |
67 | | return 0; |
68 | | } |
69 | | #else /* UTF is supported */ |
70 | | |
71 | | |
72 | | |
73 | | /************************************************* |
74 | | * Validate a UTF string * |
75 | | *************************************************/ |
76 | | |
77 | | /* This function is called (optionally) at the start of compile or match, to |
78 | | check that a supposed UTF string is actually valid. The early check means |
79 | | that subsequent code can assume it is dealing with a valid string. The check |
80 | | can be turned off for maximum performance, but the consequences of supplying an |
81 | | invalid string are then undefined. |
82 | | |
83 | | Arguments: |
84 | | string points to the string |
85 | | length length of string |
86 | | erroroffset pointer to an error position offset variable |
87 | | |
88 | | Returns: == 0 if the string is a valid UTF string |
89 | | != 0 otherwise, setting the offset of the bad character |
90 | | */ |
91 | | |
92 | | int |
93 | | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
94 | 933k | { |
95 | 933k | PCRE2_SPTR p; |
96 | 933k | uint32_t c; |
97 | | |
98 | | /* ----------------- Check a UTF-8 string ----------------- */ |
99 | | |
100 | | #if PCRE2_CODE_UNIT_WIDTH == 8 |
101 | | |
102 | | /* Originally, this function checked according to RFC 2279, allowing for values |
103 | | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were |
104 | | in the canonical format. Once somebody had pointed out RFC 3629 to me (it |
105 | | obsoletes 2279), additional restrictions were applied. The values are now |
106 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
107 | | subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
108 | | characters is still checked. Error returns are as follows: |
109 | | |
110 | | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string |
111 | | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string |
112 | | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string |
113 | | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string |
114 | | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string |
115 | | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
116 | | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
117 | | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
118 | | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
119 | | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
120 | | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
121 | | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
122 | | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
123 | | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
124 | | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence |
125 | | PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence |
126 | | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence |
127 | | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) |
128 | | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) |
129 | | PCRE2_ERROR_UTF8_ERR20 Isolated byte with 0x80 bit set (not within UTF-8 character) |
130 | | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
131 | | */ |
132 | | |
133 | 17.1M | for (p = string; length > 0; p++) |
134 | 16.7M | { |
135 | 16.7M | uint32_t ab, d; |
136 | | |
137 | 16.7M | c = *p; |
138 | 16.7M | length--; |
139 | | |
140 | 16.7M | if (c < 128) continue; /* ASCII character */ |
141 | | |
142 | 582k | if (c < 0xc0) /* Isolated 10xx xxxx byte */ |
143 | 3.71k | { |
144 | 3.71k | *erroroffset = (PCRE2_SIZE)(p - string); |
145 | 3.71k | return PCRE2_ERROR_UTF8_ERR20; |
146 | 3.71k | } |
147 | | |
148 | 578k | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ |
149 | 2.04k | { |
150 | 2.04k | *erroroffset = (PCRE2_SIZE)(p - string); |
151 | 2.04k | return PCRE2_ERROR_UTF8_ERR21; |
152 | 2.04k | } |
153 | | |
154 | 576k | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ |
155 | 576k | if (length < ab) /* Missing bytes */ |
156 | 873 | { |
157 | 873 | *erroroffset = (PCRE2_SIZE)(p - string); |
158 | 873 | switch(ab - length) |
159 | 873 | { |
160 | 392 | case 1: return PCRE2_ERROR_UTF8_ERR1; |
161 | 254 | case 2: return PCRE2_ERROR_UTF8_ERR2; |
162 | 154 | case 3: return PCRE2_ERROR_UTF8_ERR3; |
163 | 42 | case 4: return PCRE2_ERROR_UTF8_ERR4; |
164 | 31 | case 5: return PCRE2_ERROR_UTF8_ERR5; |
165 | 873 | } |
166 | 873 | } |
167 | 576k | length -= ab; /* Length remaining */ |
168 | | |
169 | | /* Check top bits in the second byte */ |
170 | | |
171 | 576k | if (((d = *(++p)) & 0xc0) != 0x80) |
172 | 2.52k | { |
173 | 2.52k | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
174 | 2.52k | return PCRE2_ERROR_UTF8_ERR6; |
175 | 2.52k | } |
176 | | |
177 | | /* For each length, check that the remaining bytes start with the 0x80 bit |
178 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the |
179 | | excluded range 0xd800 to 0xdfff. */ |
180 | | |
181 | 573k | switch (ab) |
182 | 573k | { |
183 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte |
184 | | for xx00 000x (overlong sequence). */ |
185 | | |
186 | 174k | case 1: if ((c & 0x3e) == 0) |
187 | 44 | { |
188 | 44 | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
189 | 44 | return PCRE2_ERROR_UTF8_ERR15; |
190 | 44 | } |
191 | 174k | break; |
192 | | |
193 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
194 | | for 1110 0000, xx0x xxxx (overlong sequence) or |
195 | | 1110 1101, 101x xxxx (0xd800 - 0xdfff) */ |
196 | | |
197 | 271k | case 2: |
198 | 271k | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
199 | 103 | { |
200 | 103 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
201 | 103 | return PCRE2_ERROR_UTF8_ERR7; |
202 | 103 | } |
203 | 271k | if (c == 0xe0 && (d & 0x20) == 0) |
204 | 9 | { |
205 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
206 | 9 | return PCRE2_ERROR_UTF8_ERR16; |
207 | 9 | } |
208 | 271k | if (c == 0xed && d >= 0xa0) |
209 | 9 | { |
210 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
211 | 9 | return PCRE2_ERROR_UTF8_ERR14; |
212 | 9 | } |
213 | 271k | break; |
214 | | |
215 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
216 | | bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
217 | | character greater than 0x0010ffff (f4 8f bf bf) */ |
218 | | |
219 | 271k | case 3: |
220 | 127k | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
221 | 52 | { |
222 | 52 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
223 | 52 | return PCRE2_ERROR_UTF8_ERR7; |
224 | 52 | } |
225 | 127k | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
226 | 42 | { |
227 | 42 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
228 | 42 | return PCRE2_ERROR_UTF8_ERR8; |
229 | 42 | } |
230 | 127k | if (c == 0xf0 && (d & 0x30) == 0) |
231 | 9 | { |
232 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
233 | 9 | return PCRE2_ERROR_UTF8_ERR17; |
234 | 9 | } |
235 | 127k | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) |
236 | 19 | { |
237 | 19 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
238 | 19 | return PCRE2_ERROR_UTF8_ERR13; |
239 | 19 | } |
240 | 127k | break; |
241 | | |
242 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
243 | | rejected by the length test below. However, we do the appropriate tests |
244 | | here so that overlong sequences get diagnosed, and also in case there is |
245 | | ever an option for handling these larger code points. */ |
246 | | |
247 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
248 | | 1111 1000, xx00 0xxx */ |
249 | | |
250 | 127k | case 4: |
251 | 63 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
252 | 12 | { |
253 | 12 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
254 | 12 | return PCRE2_ERROR_UTF8_ERR7; |
255 | 12 | } |
256 | 51 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
257 | 13 | { |
258 | 13 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
259 | 13 | return PCRE2_ERROR_UTF8_ERR8; |
260 | 13 | } |
261 | 38 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
262 | 9 | { |
263 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
264 | 9 | return PCRE2_ERROR_UTF8_ERR9; |
265 | 9 | } |
266 | 29 | if (c == 0xf8 && (d & 0x38) == 0) |
267 | 11 | { |
268 | 11 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
269 | 11 | return PCRE2_ERROR_UTF8_ERR18; |
270 | 11 | } |
271 | 18 | break; |
272 | | |
273 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
274 | | 1111 1100, xx00 00xx. */ |
275 | | |
276 | 73 | case 5: |
277 | 73 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
278 | 18 | { |
279 | 18 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
280 | 18 | return PCRE2_ERROR_UTF8_ERR7; |
281 | 18 | } |
282 | 55 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
283 | 9 | { |
284 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
285 | 9 | return PCRE2_ERROR_UTF8_ERR8; |
286 | 9 | } |
287 | 46 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
288 | 10 | { |
289 | 10 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
290 | 10 | return PCRE2_ERROR_UTF8_ERR9; |
291 | 10 | } |
292 | 36 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ |
293 | 9 | { |
294 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; |
295 | 9 | return PCRE2_ERROR_UTF8_ERR10; |
296 | 9 | } |
297 | 27 | if (c == 0xfc && (d & 0x3c) == 0) |
298 | 9 | { |
299 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; |
300 | 9 | return PCRE2_ERROR_UTF8_ERR19; |
301 | 9 | } |
302 | 18 | break; |
303 | 573k | } |
304 | | |
305 | | /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are |
306 | | excluded by RFC 3629. The pointer p is currently at the last byte of the |
307 | | character. */ |
308 | | |
309 | 573k | if (ab > 3) |
310 | 36 | { |
311 | 36 | *erroroffset = (PCRE2_SIZE)(p - string) - ab; |
312 | 36 | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; |
313 | 36 | } |
314 | 573k | } |
315 | 384k | return 0; |
316 | | |
317 | | |
318 | | /* ----------------- Check a UTF-16 string ----------------- */ |
319 | | |
320 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
321 | | |
322 | | /* There's not so much work, nor so many errors, for UTF-16. |
323 | | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string |
324 | | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate |
325 | | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate |
326 | | */ |
327 | | |
328 | 20.8M | for (p = string; length > 0; p++) |
329 | 20.4M | { |
330 | 20.4M | c = *p; |
331 | 20.4M | length--; |
332 | | |
333 | 20.4M | if ((c & 0xf800) != 0xd800) |
334 | 20.2M | { |
335 | | /* Normal UTF-16 code point. Neither high nor low surrogate. */ |
336 | 20.2M | } |
337 | 113k | else if ((c & 0x0400) == 0) |
338 | 111k | { |
339 | | /* High surrogate. Must be followed by a low surrogate. */ |
340 | 111k | if (length == 0) |
341 | 212 | { |
342 | 212 | *erroroffset = (PCRE2_SIZE)(p - string); |
343 | 212 | return PCRE2_ERROR_UTF16_ERR1; |
344 | 212 | } |
345 | 111k | p++; |
346 | 111k | length--; |
347 | 111k | if ((*p & 0xfc00) != 0xdc00) |
348 | 1.07k | { |
349 | 1.07k | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
350 | 1.07k | return PCRE2_ERROR_UTF16_ERR2; |
351 | 1.07k | } |
352 | 111k | } |
353 | 1.53k | else |
354 | 1.53k | { |
355 | | /* Isolated low surrogate. Always an error. */ |
356 | 1.53k | *erroroffset = (PCRE2_SIZE)(p - string); |
357 | 1.53k | return PCRE2_ERROR_UTF16_ERR3; |
358 | 1.53k | } |
359 | 20.4M | } |
360 | 397k | return 0; |
361 | | |
362 | | |
363 | | |
364 | | /* ----------------- Check a UTF-32 string ----------------- */ |
365 | | |
366 | | #else |
367 | | |
368 | | /* There is very little to do for a UTF-32 string. |
369 | | PCRE2_ERROR_UTF32_ERR1 Surrogate character |
370 | | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff |
371 | | */ |
372 | | |
373 | 1.77M | for (p = string; length > 0; length--, p++) |
374 | 1.64M | { |
375 | 1.64M | c = *p; |
376 | 1.64M | if ((c & 0xfffff800u) != 0xd800u) |
377 | 1.64M | { |
378 | | /* Normal UTF-32 code point. Neither high nor low surrogate. */ |
379 | 1.64M | if (c > 0x10ffffu) |
380 | 9.91k | { |
381 | 9.91k | *erroroffset = (PCRE2_SIZE)(p - string); |
382 | 9.91k | return PCRE2_ERROR_UTF32_ERR2; |
383 | 9.91k | } |
384 | 1.64M | } |
385 | 32 | else |
386 | 32 | { |
387 | | /* A surrogate */ |
388 | 32 | *erroroffset = (PCRE2_SIZE)(p - string); |
389 | 32 | return PCRE2_ERROR_UTF32_ERR1; |
390 | 32 | } |
391 | 1.64M | } |
392 | 129k | return 0; |
393 | | #endif /* CODE_UNIT_WIDTH */ |
394 | 933k | } Line | Count | Source | 94 | 394k | { | 95 | 394k | PCRE2_SPTR p; | 96 | 394k | uint32_t c; | 97 | | | 98 | | /* ----------------- Check a UTF-8 string ----------------- */ | 99 | | | 100 | 394k | #if PCRE2_CODE_UNIT_WIDTH == 8 | 101 | | | 102 | | /* Originally, this function checked according to RFC 2279, allowing for values | 103 | | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were | 104 | | in the canonical format. Once somebody had pointed out RFC 3629 to me (it | 105 | | obsoletes 2279), additional restrictions were applied. The values are now | 106 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the | 107 | | subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte | 108 | | characters is still checked. Error returns are as follows: | 109 | | | 110 | | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string | 111 | | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string | 112 | | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string | 113 | | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string | 114 | | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string | 115 | | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 | 116 | | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 | 117 | | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 | 118 | | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 | 119 | | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 | 120 | | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 | 121 | | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 | 122 | | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted | 123 | | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted | 124 | | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence | 125 | | PCRE2_ERROR_UTF8_ERR16 
Overlong 3-byte sequence | 126 | | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence | 127 | | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) | 128 | | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) | 129 | | PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) | 130 | | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff | 131 | | */ | 132 | | | 133 | 17.1M | for (p = string; length > 0; p++) | 134 | 16.7M | { | 135 | 16.7M | uint32_t ab, d; | 136 | | | 137 | 16.7M | c = *p; | 138 | 16.7M | length--; | 139 | | | 140 | 16.7M | if (c < 128) continue; /* ASCII character */ | 141 | | | 142 | 582k | if (c < 0xc0) /* Isolated 10xx xxxx byte */ | 143 | 3.71k | { | 144 | 3.71k | *erroroffset = (PCRE2_SIZE)(p - string); | 145 | 3.71k | return PCRE2_ERROR_UTF8_ERR20; | 146 | 3.71k | } | 147 | | | 148 | 578k | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ | 149 | 2.04k | { | 150 | 2.04k | *erroroffset = (PCRE2_SIZE)(p - string); | 151 | 2.04k | return PCRE2_ERROR_UTF8_ERR21; | 152 | 2.04k | } | 153 | | | 154 | 576k | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ | 155 | 576k | if (length < ab) /* Missing bytes */ | 156 | 873 | { | 157 | 873 | *erroroffset = (PCRE2_SIZE)(p - string); | 158 | 873 | switch(ab - length) | 159 | 873 | { | 160 | 392 | case 1: return PCRE2_ERROR_UTF8_ERR1; | 161 | 254 | case 2: return PCRE2_ERROR_UTF8_ERR2; | 162 | 154 | case 3: return PCRE2_ERROR_UTF8_ERR3; | 163 | 42 | case 4: return PCRE2_ERROR_UTF8_ERR4; | 164 | 31 | case 5: return PCRE2_ERROR_UTF8_ERR5; | 165 | 873 | } | 166 | 873 | } | 167 | 576k | length -= ab; /* Length remaining */ | 168 | | | 169 | | /* Check top bits in the second byte */ | 170 | | | 171 | 576k | if (((d = *(++p)) & 0xc0) != 0x80) | 172 | 2.52k | { | 173 | 2.52k | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 174 | 2.52k | return PCRE2_ERROR_UTF8_ERR6; | 175 | 2.52k | } | 176 | | | 177 | | /* For each length, check that the 
remaining bytes start with the 0x80 bit | 178 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the | 179 | | excluded range 0xd800 to 0xdfff. */ | 180 | | | 181 | 573k | switch (ab) | 182 | 573k | { | 183 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte | 184 | | for for xx00 000x (overlong sequence). */ | 185 | | | 186 | 174k | case 1: if ((c & 0x3e) == 0) | 187 | 44 | { | 188 | 44 | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 189 | 44 | return PCRE2_ERROR_UTF8_ERR15; | 190 | 44 | } | 191 | 174k | break; | 192 | | | 193 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes | 194 | | for 1110 0000, xx0x xxxx (overlong sequence) or | 195 | | 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ | 196 | | | 197 | 271k | case 2: | 198 | 271k | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 199 | 103 | { | 200 | 103 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 201 | 103 | return PCRE2_ERROR_UTF8_ERR7; | 202 | 103 | } | 203 | 271k | if (c == 0xe0 && (d & 0x20) == 0) | 204 | 9 | { | 205 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 206 | 9 | return PCRE2_ERROR_UTF8_ERR16; | 207 | 9 | } | 208 | 271k | if (c == 0xed && d >= 0xa0) | 209 | 9 | { | 210 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 211 | 9 | return PCRE2_ERROR_UTF8_ERR14; | 212 | 9 | } | 213 | 271k | break; | 214 | | | 215 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. 
Then check first 2 | 216 | | bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a | 217 | | character greater than 0x0010ffff (f4 8f bf bf) */ | 218 | | | 219 | 271k | case 3: | 220 | 127k | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 221 | 52 | { | 222 | 52 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 223 | 52 | return PCRE2_ERROR_UTF8_ERR7; | 224 | 52 | } | 225 | 127k | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 226 | 42 | { | 227 | 42 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 228 | 42 | return PCRE2_ERROR_UTF8_ERR8; | 229 | 42 | } | 230 | 127k | if (c == 0xf0 && (d & 0x30) == 0) | 231 | 9 | { | 232 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 233 | 9 | return PCRE2_ERROR_UTF8_ERR17; | 234 | 9 | } | 235 | 127k | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) | 236 | 19 | { | 237 | 19 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 238 | 19 | return PCRE2_ERROR_UTF8_ERR13; | 239 | 19 | } | 240 | 127k | break; | 241 | | | 242 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be | 243 | | rejected by the length test below. However, we do the appropriate tests | 244 | | here so that overlong sequences get diagnosed, and also in case there is | 245 | | ever an option for handling these larger code points. */ | 246 | | | 247 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. 
Then check for | 248 | | 1111 1000, xx00 0xxx */ | 249 | | | 250 | 127k | case 4: | 251 | 63 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 252 | 12 | { | 253 | 12 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 254 | 12 | return PCRE2_ERROR_UTF8_ERR7; | 255 | 12 | } | 256 | 51 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 257 | 13 | { | 258 | 13 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 259 | 13 | return PCRE2_ERROR_UTF8_ERR8; | 260 | 13 | } | 261 | 38 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 262 | 9 | { | 263 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 264 | 9 | return PCRE2_ERROR_UTF8_ERR9; | 265 | 9 | } | 266 | 29 | if (c == 0xf8 && (d & 0x38) == 0) | 267 | 11 | { | 268 | 11 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 269 | 11 | return PCRE2_ERROR_UTF8_ERR18; | 270 | 11 | } | 271 | 18 | break; | 272 | | | 273 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for | 274 | | 1111 1100, xx00 00xx. */ | 275 | | | 276 | 73 | case 5: | 277 | 73 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 278 | 18 | { | 279 | 18 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 280 | 18 | return PCRE2_ERROR_UTF8_ERR7; | 281 | 18 | } | 282 | 55 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 283 | 9 | { | 284 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 285 | 9 | return PCRE2_ERROR_UTF8_ERR8; | 286 | 9 | } | 287 | 46 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 288 | 10 | { | 289 | 10 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 290 | 10 | return PCRE2_ERROR_UTF8_ERR9; | 291 | 10 | } | 292 | 36 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ | 293 | 9 | { | 294 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 295 | 9 | return PCRE2_ERROR_UTF8_ERR10; | 296 | 9 | } | 297 | 27 | if (c == 0xfc && (d & 0x3c) == 0) | 298 | 9 | { | 299 | 9 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 300 | 9 | return PCRE2_ERROR_UTF8_ERR19; | 301 | 9 | } | 302 | 18 | break; | 303 | 573k | } | 304 | | | 305 | | /* Character 
is valid under RFC 2279, but 4-byte and 5-byte characters are | 306 | | excluded by RFC 3629. The pointer p is currently at the last byte of the | 307 | | character. */ | 308 | | | 309 | 573k | if (ab > 3) | 310 | 36 | { | 311 | 36 | *erroroffset = (PCRE2_SIZE)(p - string) - ab; | 312 | 36 | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; | 313 | 36 | } | 314 | 573k | } | 315 | 384k | return 0; | 316 | | | 317 | | | 318 | | /* ----------------- Check a UTF-16 string ----------------- */ | 319 | | | 320 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 | 321 | | | 322 | | /* There's not so much work, nor so many errors, for UTF-16. | 323 | | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string | 324 | | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate | 325 | | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate | 326 | | */ | 327 | | | 328 | | for (p = string; length > 0; p++) | 329 | | { | 330 | | c = *p; | 331 | | length--; | 332 | | | 333 | | if ((c & 0xf800) != 0xd800) | 334 | | { | 335 | | /* Normal UTF-16 code point. Neither high nor low surrogate. */ | 336 | | } | 337 | | else if ((c & 0x0400) == 0) | 338 | | { | 339 | | /* High surrogate. Must be a followed by a low surrogate. */ | 340 | | if (length == 0) | 341 | | { | 342 | | *erroroffset = (PCRE2_SIZE)(p - string); | 343 | | return PCRE2_ERROR_UTF16_ERR1; | 344 | | } | 345 | | p++; | 346 | | length--; | 347 | | if ((*p & 0xfc00) != 0xdc00) | 348 | | { | 349 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 350 | | return PCRE2_ERROR_UTF16_ERR2; | 351 | | } | 352 | | } | 353 | | else | 354 | | { | 355 | | /* Isolated low surrogate. Always an error. */ | 356 | | *erroroffset = (PCRE2_SIZE)(p - string); | 357 | | return PCRE2_ERROR_UTF16_ERR3; | 358 | | } | 359 | | } | 360 | | return 0; | 361 | | | 362 | | | 363 | | | 364 | | /* ----------------- Check a UTF-32 string ----------------- */ | 365 | | | 366 | | #else | 367 | | | 368 | | /* There is very little to do for a UTF-32 string. 
| 369 | | PCRE2_ERROR_UTF32_ERR1 Surrogate character | 370 | | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff | 371 | | */ | 372 | | | 373 | | for (p = string; length > 0; length--, p++) | 374 | | { | 375 | | c = *p; | 376 | | if ((c & 0xfffff800u) != 0xd800u) | 377 | | { | 378 | | /* Normal UTF-32 code point. Neither high nor low surrogate. */ | 379 | | if (c > 0x10ffffu) | 380 | | { | 381 | | *erroroffset = (PCRE2_SIZE)(p - string); | 382 | | return PCRE2_ERROR_UTF32_ERR2; | 383 | | } | 384 | | } | 385 | | else | 386 | | { | 387 | | /* A surrogate */ | 388 | | *erroroffset = (PCRE2_SIZE)(p - string); | 389 | | return PCRE2_ERROR_UTF32_ERR1; | 390 | | } | 391 | | } | 392 | | return 0; | 393 | | #endif /* CODE_UNIT_WIDTH */ | 394 | 394k | } |
Line | Count | Source | 94 | 139k | { | 95 | 139k | PCRE2_SPTR p; | 96 | 139k | uint32_t c; | 97 | | | 98 | | /* ----------------- Check a UTF-8 string ----------------- */ | 99 | | | 100 | | #if PCRE2_CODE_UNIT_WIDTH == 8 | 101 | | | 102 | | /* Originally, this function checked according to RFC 2279, allowing for values | 103 | | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were | 104 | | in the canonical format. Once somebody had pointed out RFC 3629 to me (it | 105 | | obsoletes 2279), additional restrictions were applied. The values are now | 106 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the | 107 | | subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte | 108 | | characters is still checked. Error returns are as follows: | 109 | | | 110 | | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string | 111 | | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string | 112 | | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string | 113 | | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string | 114 | | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string | 115 | | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 | 116 | | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 | 117 | | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 | 118 | | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 | 119 | | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 | 120 | | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 | 121 | | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 | 122 | | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted | 123 | | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted | 124 | | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence | 125 | | PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence | 
126 | | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence | 127 | | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) | 128 | | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) | 129 | | PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) | 130 | | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff | 131 | | */ | 132 | | | 133 | | for (p = string; length > 0; p++) | 134 | | { | 135 | | uint32_t ab, d; | 136 | | | 137 | | c = *p; | 138 | | length--; | 139 | | | 140 | | if (c < 128) continue; /* ASCII character */ | 141 | | | 142 | | if (c < 0xc0) /* Isolated 10xx xxxx byte */ | 143 | | { | 144 | | *erroroffset = (PCRE2_SIZE)(p - string); | 145 | | return PCRE2_ERROR_UTF8_ERR20; | 146 | | } | 147 | | | 148 | | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ | 149 | | { | 150 | | *erroroffset = (PCRE2_SIZE)(p - string); | 151 | | return PCRE2_ERROR_UTF8_ERR21; | 152 | | } | 153 | | | 154 | | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ | 155 | | if (length < ab) /* Missing bytes */ | 156 | | { | 157 | | *erroroffset = (PCRE2_SIZE)(p - string); | 158 | | switch(ab - length) | 159 | | { | 160 | | case 1: return PCRE2_ERROR_UTF8_ERR1; | 161 | | case 2: return PCRE2_ERROR_UTF8_ERR2; | 162 | | case 3: return PCRE2_ERROR_UTF8_ERR3; | 163 | | case 4: return PCRE2_ERROR_UTF8_ERR4; | 164 | | case 5: return PCRE2_ERROR_UTF8_ERR5; | 165 | | } | 166 | | } | 167 | | length -= ab; /* Length remaining */ | 168 | | | 169 | | /* Check top bits in the second byte */ | 170 | | | 171 | | if (((d = *(++p)) & 0xc0) != 0x80) | 172 | | { | 173 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 174 | | return PCRE2_ERROR_UTF8_ERR6; | 175 | | } | 176 | | | 177 | | /* For each length, check that the remaining bytes start with the 0x80 bit | 178 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the | 179 | | excluded range 0xd800 to 0xdfff. 
*/ | 180 | | | 181 | | switch (ab) | 182 | | { | 183 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte | 184 | | for for xx00 000x (overlong sequence). */ | 185 | | | 186 | | case 1: if ((c & 0x3e) == 0) | 187 | | { | 188 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 189 | | return PCRE2_ERROR_UTF8_ERR15; | 190 | | } | 191 | | break; | 192 | | | 193 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes | 194 | | for 1110 0000, xx0x xxxx (overlong sequence) or | 195 | | 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ | 196 | | | 197 | | case 2: | 198 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 199 | | { | 200 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 201 | | return PCRE2_ERROR_UTF8_ERR7; | 202 | | } | 203 | | if (c == 0xe0 && (d & 0x20) == 0) | 204 | | { | 205 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 206 | | return PCRE2_ERROR_UTF8_ERR16; | 207 | | } | 208 | | if (c == 0xed && d >= 0xa0) | 209 | | { | 210 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 211 | | return PCRE2_ERROR_UTF8_ERR14; | 212 | | } | 213 | | break; | 214 | | | 215 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. 
Then check first 2 | 216 | | bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a | 217 | | character greater than 0x0010ffff (f4 8f bf bf) */ | 218 | | | 219 | | case 3: | 220 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 221 | | { | 222 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 223 | | return PCRE2_ERROR_UTF8_ERR7; | 224 | | } | 225 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 226 | | { | 227 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 228 | | return PCRE2_ERROR_UTF8_ERR8; | 229 | | } | 230 | | if (c == 0xf0 && (d & 0x30) == 0) | 231 | | { | 232 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 233 | | return PCRE2_ERROR_UTF8_ERR17; | 234 | | } | 235 | | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) | 236 | | { | 237 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 238 | | return PCRE2_ERROR_UTF8_ERR13; | 239 | | } | 240 | | break; | 241 | | | 242 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be | 243 | | rejected by the length test below. However, we do the appropriate tests | 244 | | here so that overlong sequences get diagnosed, and also in case there is | 245 | | ever an option for handling these larger code points. */ | 246 | | | 247 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. 
Then check for | 248 | | 1111 1000, xx00 0xxx */ | 249 | | | 250 | | case 4: | 251 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 252 | | { | 253 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 254 | | return PCRE2_ERROR_UTF8_ERR7; | 255 | | } | 256 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 257 | | { | 258 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 259 | | return PCRE2_ERROR_UTF8_ERR8; | 260 | | } | 261 | | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 262 | | { | 263 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 264 | | return PCRE2_ERROR_UTF8_ERR9; | 265 | | } | 266 | | if (c == 0xf8 && (d & 0x38) == 0) | 267 | | { | 268 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 269 | | return PCRE2_ERROR_UTF8_ERR18; | 270 | | } | 271 | | break; | 272 | | | 273 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for | 274 | | 1111 1100, xx00 00xx. */ | 275 | | | 276 | | case 5: | 277 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 278 | | { | 279 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 280 | | return PCRE2_ERROR_UTF8_ERR7; | 281 | | } | 282 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 283 | | { | 284 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 285 | | return PCRE2_ERROR_UTF8_ERR8; | 286 | | } | 287 | | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 288 | | { | 289 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 290 | | return PCRE2_ERROR_UTF8_ERR9; | 291 | | } | 292 | | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ | 293 | | { | 294 | | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 295 | | return PCRE2_ERROR_UTF8_ERR10; | 296 | | } | 297 | | if (c == 0xfc && (d & 0x3c) == 0) | 298 | | { | 299 | | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 300 | | return PCRE2_ERROR_UTF8_ERR19; | 301 | | } | 302 | | break; | 303 | | } | 304 | | | 305 | | /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are | 306 | | excluded by RFC 3629. 
The pointer p is currently at the last byte of the | 307 | | character. */ | 308 | | | 309 | | if (ab > 3) | 310 | | { | 311 | | *erroroffset = (PCRE2_SIZE)(p - string) - ab; | 312 | | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; | 313 | | } | 314 | | } | 315 | | return 0; | 316 | | | 317 | | | 318 | | /* ----------------- Check a UTF-16 string ----------------- */ | 319 | | | 320 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 | 321 | | | 322 | | /* There's not so much work, nor so many errors, for UTF-16. | 323 | | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string | 324 | | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate | 325 | | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate | 326 | | */ | 327 | | | 328 | | for (p = string; length > 0; p++) | 329 | | { | 330 | | c = *p; | 331 | | length--; | 332 | | | 333 | | if ((c & 0xf800) != 0xd800) | 334 | | { | 335 | | /* Normal UTF-16 code point. Neither high nor low surrogate. */ | 336 | | } | 337 | | else if ((c & 0x0400) == 0) | 338 | | { | 339 | | /* High surrogate. Must be followed by a low surrogate. */ | 340 | | if (length == 0) | 341 | | { | 342 | | *erroroffset = (PCRE2_SIZE)(p - string); | 343 | | return PCRE2_ERROR_UTF16_ERR1; | 344 | | } | 345 | | p++; | 346 | | length--; | 347 | | if ((*p & 0xfc00) != 0xdc00) | 348 | | { | 349 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 350 | | return PCRE2_ERROR_UTF16_ERR2; | 351 | | } | 352 | | } | 353 | | else | 354 | | { | 355 | | /* Isolated low surrogate. Always an error. */ | 356 | | *erroroffset = (PCRE2_SIZE)(p - string); | 357 | | return PCRE2_ERROR_UTF16_ERR3; | 358 | | } | 359 | | } | 360 | | return 0; | 361 | | | 362 | | | 363 | | | 364 | | /* ----------------- Check a UTF-32 string ----------------- */ | 365 | | | 366 | | #else | 367 | | | 368 | | /* There is very little to do for a UTF-32 string. 
| 369 | | PCRE2_ERROR_UTF32_ERR1 Surrogate character | 370 | | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff | 371 | | */ | 372 | | | 373 | 1.77M | for (p = string; length > 0; length--, p++) | 374 | 1.64M | { | 375 | 1.64M | c = *p; | 376 | 1.64M | if ((c & 0xfffff800u) != 0xd800u) | 377 | 1.64M | { | 378 | | /* Normal UTF-32 code point. Neither high nor low surrogate. */ | 379 | 1.64M | if (c > 0x10ffffu) | 380 | 9.91k | { | 381 | 9.91k | *erroroffset = (PCRE2_SIZE)(p - string); | 382 | 9.91k | return PCRE2_ERROR_UTF32_ERR2; | 383 | 9.91k | } | 384 | 1.64M | } | 385 | 32 | else | 386 | 32 | { | 387 | | /* A surrogate */ | 388 | 32 | *erroroffset = (PCRE2_SIZE)(p - string); | 389 | 32 | return PCRE2_ERROR_UTF32_ERR1; | 390 | 32 | } | 391 | 1.64M | } | 392 | 129k | return 0; | 393 | 139k | #endif /* CODE_UNIT_WIDTH */ | 394 | 139k | } |
Line | Count | Source | 94 | 399k | { | 95 | 399k | PCRE2_SPTR p; | 96 | 399k | uint32_t c; | 97 | | | 98 | | /* ----------------- Check a UTF-8 string ----------------- */ | 99 | | | 100 | | #if PCRE2_CODE_UNIT_WIDTH == 8 | 101 | | | 102 | | /* Originally, this function checked according to RFC 2279, allowing for values | 103 | | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were | 104 | | in the canonical format. Once somebody had pointed out RFC 3629 to me (it | 105 | | obsoletes 2279), additional restrictions were applied. The values are now | 106 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the | 107 | | subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte | 108 | | characters is still checked. Error returns are as follows: | 109 | | | 110 | | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string | 111 | | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string | 112 | | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string | 113 | | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string | 114 | | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string | 115 | | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 | 116 | | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 | 117 | | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 | 118 | | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 | 119 | | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 | 120 | | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 | 121 | | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 | 122 | | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted | 123 | | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted | 124 | | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence | 125 | | PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence | 
126 | | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence | 127 | | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) | 128 | | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) | 129 | | PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) | 130 | | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff | 131 | | */ | 132 | | | 133 | | for (p = string; length > 0; p++) | 134 | | { | 135 | | uint32_t ab, d; | 136 | | | 137 | | c = *p; | 138 | | length--; | 139 | | | 140 | | if (c < 128) continue; /* ASCII character */ | 141 | | | 142 | | if (c < 0xc0) /* Isolated 10xx xxxx byte */ | 143 | | { | 144 | | *erroroffset = (PCRE2_SIZE)(p - string); | 145 | | return PCRE2_ERROR_UTF8_ERR20; | 146 | | } | 147 | | | 148 | | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ | 149 | | { | 150 | | *erroroffset = (PCRE2_SIZE)(p - string); | 151 | | return PCRE2_ERROR_UTF8_ERR21; | 152 | | } | 153 | | | 154 | | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ | 155 | | if (length < ab) /* Missing bytes */ | 156 | | { | 157 | | *erroroffset = (PCRE2_SIZE)(p - string); | 158 | | switch(ab - length) | 159 | | { | 160 | | case 1: return PCRE2_ERROR_UTF8_ERR1; | 161 | | case 2: return PCRE2_ERROR_UTF8_ERR2; | 162 | | case 3: return PCRE2_ERROR_UTF8_ERR3; | 163 | | case 4: return PCRE2_ERROR_UTF8_ERR4; | 164 | | case 5: return PCRE2_ERROR_UTF8_ERR5; | 165 | | } | 166 | | } | 167 | | length -= ab; /* Length remaining */ | 168 | | | 169 | | /* Check top bits in the second byte */ | 170 | | | 171 | | if (((d = *(++p)) & 0xc0) != 0x80) | 172 | | { | 173 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 174 | | return PCRE2_ERROR_UTF8_ERR6; | 175 | | } | 176 | | | 177 | | /* For each length, check that the remaining bytes start with the 0x80 bit | 178 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the | 179 | | excluded range 0xd800 to 0xdfff. 
*/ | 180 | | | 181 | | switch (ab) | 182 | | { | 183 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte | 184 | | for xx00 000x (overlong sequence). */ | 185 | | | 186 | | case 1: if ((c & 0x3e) == 0) | 187 | | { | 188 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 189 | | return PCRE2_ERROR_UTF8_ERR15; | 190 | | } | 191 | | break; | 192 | | | 193 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes | 194 | | for 1110 0000, xx0x xxxx (overlong sequence) or | 195 | | 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ | 196 | | | 197 | | case 2: | 198 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 199 | | { | 200 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 201 | | return PCRE2_ERROR_UTF8_ERR7; | 202 | | } | 203 | | if (c == 0xe0 && (d & 0x20) == 0) | 204 | | { | 205 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 206 | | return PCRE2_ERROR_UTF8_ERR16; | 207 | | } | 208 | | if (c == 0xed && d >= 0xa0) | 209 | | { | 210 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 211 | | return PCRE2_ERROR_UTF8_ERR14; | 212 | | } | 213 | | break; | 214 | | | 215 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. 
Then check first 2 | 216 | | bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a | 217 | | character greater than 0x0010ffff (f4 8f bf bf) */ | 218 | | | 219 | | case 3: | 220 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 221 | | { | 222 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 223 | | return PCRE2_ERROR_UTF8_ERR7; | 224 | | } | 225 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 226 | | { | 227 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 228 | | return PCRE2_ERROR_UTF8_ERR8; | 229 | | } | 230 | | if (c == 0xf0 && (d & 0x30) == 0) | 231 | | { | 232 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 233 | | return PCRE2_ERROR_UTF8_ERR17; | 234 | | } | 235 | | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) | 236 | | { | 237 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 238 | | return PCRE2_ERROR_UTF8_ERR13; | 239 | | } | 240 | | break; | 241 | | | 242 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be | 243 | | rejected by the length test below. However, we do the appropriate tests | 244 | | here so that overlong sequences get diagnosed, and also in case there is | 245 | | ever an option for handling these larger code points. */ | 246 | | | 247 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. 
Then check for | 248 | | 1111 1000, xx00 0xxx */ | 249 | | | 250 | | case 4: | 251 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 252 | | { | 253 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 254 | | return PCRE2_ERROR_UTF8_ERR7; | 255 | | } | 256 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 257 | | { | 258 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 259 | | return PCRE2_ERROR_UTF8_ERR8; | 260 | | } | 261 | | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 262 | | { | 263 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 264 | | return PCRE2_ERROR_UTF8_ERR9; | 265 | | } | 266 | | if (c == 0xf8 && (d & 0x38) == 0) | 267 | | { | 268 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 269 | | return PCRE2_ERROR_UTF8_ERR18; | 270 | | } | 271 | | break; | 272 | | | 273 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for | 274 | | 1111 1100, xx00 00xx. */ | 275 | | | 276 | | case 5: | 277 | | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ | 278 | | { | 279 | | *erroroffset = (PCRE2_SIZE)(p - string) - 2; | 280 | | return PCRE2_ERROR_UTF8_ERR7; | 281 | | } | 282 | | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ | 283 | | { | 284 | | *erroroffset = (PCRE2_SIZE)(p - string) - 3; | 285 | | return PCRE2_ERROR_UTF8_ERR8; | 286 | | } | 287 | | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ | 288 | | { | 289 | | *erroroffset = (PCRE2_SIZE)(p - string) - 4; | 290 | | return PCRE2_ERROR_UTF8_ERR9; | 291 | | } | 292 | | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ | 293 | | { | 294 | | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 295 | | return PCRE2_ERROR_UTF8_ERR10; | 296 | | } | 297 | | if (c == 0xfc && (d & 0x3c) == 0) | 298 | | { | 299 | | *erroroffset = (PCRE2_SIZE)(p - string) - 5; | 300 | | return PCRE2_ERROR_UTF8_ERR19; | 301 | | } | 302 | | break; | 303 | | } | 304 | | | 305 | | /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are | 306 | | excluded by RFC 3629. 
The pointer p is currently at the last byte of the | 307 | | character. */ | 308 | | | 309 | | if (ab > 3) | 310 | | { | 311 | | *erroroffset = (PCRE2_SIZE)(p - string) - ab; | 312 | | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; | 313 | | } | 314 | | } | 315 | | return 0; | 316 | | | 317 | | | 318 | | /* ----------------- Check a UTF-16 string ----------------- */ | 319 | | | 320 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 | 321 | | | 322 | | /* There's not so much work, nor so many errors, for UTF-16. | 323 | | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string | 324 | | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate | 325 | | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate | 326 | | */ | 327 | | | 328 | 20.8M | for (p = string; length > 0; p++) | 329 | 20.4M | { | 330 | 20.4M | c = *p; | 331 | 20.4M | length--; | 332 | | | 333 | 20.4M | if ((c & 0xf800) != 0xd800) | 334 | 20.2M | { | 335 | | /* Normal UTF-16 code point. Neither high nor low surrogate. */ | 336 | 20.2M | } | 337 | 113k | else if ((c & 0x0400) == 0) | 338 | 111k | { | 339 | | /* High surrogate. Must be followed by a low surrogate. */ | 340 | 111k | if (length == 0) | 341 | 212 | { | 342 | 212 | *erroroffset = (PCRE2_SIZE)(p - string); | 343 | 212 | return PCRE2_ERROR_UTF16_ERR1; | 344 | 212 | } | 345 | 111k | p++; | 346 | 111k | length--; | 347 | 111k | if ((*p & 0xfc00) != 0xdc00) | 348 | 1.07k | { | 349 | 1.07k | *erroroffset = (PCRE2_SIZE)(p - string) - 1; | 350 | 1.07k | return PCRE2_ERROR_UTF16_ERR2; | 351 | 1.07k | } | 352 | 111k | } | 353 | 1.53k | else | 354 | 1.53k | { | 355 | | /* Isolated low surrogate. Always an error. 
*/ | 356 | 1.53k | *erroroffset = (PCRE2_SIZE)(p - string); | 357 | 1.53k | return PCRE2_ERROR_UTF16_ERR3; | 358 | 1.53k | } | 359 | 20.4M | } | 360 | 397k | return 0; | 361 | | | 362 | | | 363 | | | 364 | | /* ----------------- Check a UTF-32 string ----------------- */ | 365 | | | 366 | | #else | 367 | | | 368 | | /* There is very little to do for a UTF-32 string. | 369 | | PCRE2_ERROR_UTF32_ERR1 Surrogate character | 370 | | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff | 371 | | */ | 372 | | | 373 | | for (p = string; length > 0; length--, p++) | 374 | | { | 375 | | c = *p; | 376 | | if ((c & 0xfffff800u) != 0xd800u) | 377 | | { | 378 | | /* Normal UTF-32 code point. Neither high nor low surrogate. */ | 379 | | if (c > 0x10ffffu) | 380 | | { | 381 | | *erroroffset = (PCRE2_SIZE)(p - string); | 382 | | return PCRE2_ERROR_UTF32_ERR2; | 383 | | } | 384 | | } | 385 | | else | 386 | | { | 387 | | /* A surrogate */ | 388 | | *erroroffset = (PCRE2_SIZE)(p - string); | 389 | | return PCRE2_ERROR_UTF32_ERR1; | 390 | | } | 391 | | } | 392 | | return 0; | 393 | | #endif /* CODE_UNIT_WIDTH */ | 394 | 399k | } |
|
395 | | #endif /* SUPPORT_UNICODE */ |
396 | | |
397 | | /* End of pcre2_valid_utf.c */ |