/src/php-src/ext/pcre/pcre2lib/pcre2_valid_utf.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2020 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | /* This module contains an internal function for validating UTF character |
43 | | strings. This file is also #included by the pcre2test program, which uses |
44 | | macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes |
45 | | with the library. In this case, PCRE2_PCRE2TEST is defined. */ |
46 | | |
47 | | #ifndef PCRE2_PCRE2TEST /* We're compiling the library */ |
48 | | #ifdef HAVE_CONFIG_H |
49 | | #include "config.h" |
50 | | #endif |
51 | | #include "pcre2_internal.h" |
52 | | #endif /* PCRE2_PCRE2TEST */ |
53 | | |
54 | | |
55 | | #ifndef SUPPORT_UNICODE |
56 | | /************************************************* |
57 | | * Dummy function when Unicode is not supported * |
58 | | *************************************************/ |
59 | | |
60 | | /* This function should never be called when Unicode is not supported. */ |
61 | | |
62 | | int |
63 | | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
64 | | { |
65 | | (void)string; |
66 | | (void)length; |
67 | | (void)erroroffset; |
68 | | return 0; |
69 | | } |
70 | | #else /* UTF is supported */ |
71 | | |
72 | | |
73 | | |
74 | | /************************************************* |
75 | | * Validate a UTF string * |
76 | | *************************************************/ |
77 | | |
78 | | /* This function is called (optionally) at the start of compile or match, to |
79 | | check that a supposed UTF string is actually valid. The early check means |
80 | | that subsequent code can assume it is dealing with a valid string. The check |
81 | | can be turned off for maximum performance, but the consequences of supplying an |
82 | | invalid string are then undefined. |
83 | | |
84 | | Arguments: |
85 | | string points to the string |
86 | | length length of string |
87 | | erroroffset pointer to an error position offset variable |
88 | | |
89 | | Returns: == 0 if the string is a valid UTF string |
90 | | != 0 otherwise, setting the offset of the bad character |
91 | | */ |
92 | | |
93 | | int |
94 | | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
95 | 1.78k | { |
96 | 1.78k | PCRE2_SPTR p; |
97 | 1.78k | uint32_t c; |
98 | | |
99 | | /* ----------------- Check a UTF-8 string ----------------- */ |
100 | | |
101 | 1.78k | #if PCRE2_CODE_UNIT_WIDTH == 8 |
102 | | |
103 | | /* Originally, this function checked according to RFC 2279, allowing for values |
104 | | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were |
105 | | in the canonical format. Once somebody had pointed out RFC 3629 to me (it |
106 | | obsoletes 2279), additional restrictions were applied. The values are now |
107 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
108 | | subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
109 | | characters is still checked. Error returns are as follows: |
110 | | |
111 | | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string |
112 | | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string |
113 | | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string |
114 | | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string |
115 | | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string |
116 | | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
117 | | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
118 | | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
119 | | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
120 | | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
121 | | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
122 | | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
123 | | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
124 | | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
125 | | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence |
126 | | PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence |
127 | | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence |
128 | | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) |
129 | | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) |
130 | | PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) |
131 | | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
132 | | */ |
133 | | |
134 | 262k | for (p = string; length > 0; p++) |
135 | 261k | { |
136 | 261k | uint32_t ab, d; |
137 | | |
138 | 261k | c = *p; |
139 | 261k | length--; |
140 | | |
141 | 261k | if (c < 128) continue; /* ASCII character */ |
142 | | |
143 | 2.56k | if (c < 0xc0) /* Isolated 10xx xxxx byte */ |
144 | 89 | { |
145 | 89 | *erroroffset = (PCRE2_SIZE)(p - string); |
146 | 89 | return PCRE2_ERROR_UTF8_ERR20; |
147 | 89 | } |
148 | | |
149 | 2.47k | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ |
150 | 29 | { |
151 | 29 | *erroroffset = (PCRE2_SIZE)(p - string); |
152 | 29 | return PCRE2_ERROR_UTF8_ERR21; |
153 | 29 | } |
154 | | |
155 | 2.44k | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ |
156 | 2.44k | if (length < ab) /* Missing bytes */ |
157 | 5 | { |
158 | 5 | *erroroffset = (PCRE2_SIZE)(p - string); |
159 | 5 | switch(ab - length) |
160 | 5 | { |
161 | 2 | case 1: return PCRE2_ERROR_UTF8_ERR1; |
162 | 0 | case 2: return PCRE2_ERROR_UTF8_ERR2; |
163 | 3 | case 3: return PCRE2_ERROR_UTF8_ERR3; |
164 | 0 | case 4: return PCRE2_ERROR_UTF8_ERR4; |
165 | 0 | case 5: return PCRE2_ERROR_UTF8_ERR5; |
166 | 5 | } |
167 | 5 | } |
168 | 2.44k | length -= ab; /* Length remaining */ |
169 | | |
170 | | /* Check top bits in the second byte */ |
171 | | |
172 | 2.44k | if (((d = *(++p)) & 0xc0) != 0x80) |
173 | 30 | { |
174 | 30 | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
175 | 30 | return PCRE2_ERROR_UTF8_ERR6; |
176 | 30 | } |
177 | | |
178 | | /* For each length, check that the remaining bytes start with the 0x80 bit |
179 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the |
180 | | excluded range 0xd800 to 0xdfff. */ |
181 | | |
182 | 2.41k | switch (ab) |
183 | 2.41k | { |
184 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte |
185 | | for xx00 000x (overlong sequence). */ |
186 | | |
187 | 656 | case 1: if ((c & 0x3e) == 0) |
188 | 0 | { |
189 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
190 | 0 | return PCRE2_ERROR_UTF8_ERR15; |
191 | 0 | } |
192 | 656 | break; |
193 | | |
194 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
195 | | for 1110 0000, xx0x xxxx (overlong sequence) or |
196 | | 1110 1101, 101x xxxx (0xd800 - 0xdfff) */ |
197 | | |
198 | 1.12k | case 2: |
199 | 1.12k | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
200 | 3 | { |
201 | 3 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
202 | 3 | return PCRE2_ERROR_UTF8_ERR7; |
203 | 3 | } |
204 | 1.12k | if (c == 0xe0 && (d & 0x20) == 0) |
205 | 3 | { |
206 | 3 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
207 | 3 | return PCRE2_ERROR_UTF8_ERR16; |
208 | 3 | } |
209 | 1.12k | if (c == 0xed && d >= 0xa0) |
210 | 0 | { |
211 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
212 | 0 | return PCRE2_ERROR_UTF8_ERR14; |
213 | 0 | } |
214 | 1.12k | break; |
215 | | |
216 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
217 | | bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
218 | | character greater than 0x0010ffff (f4 8f bf bf) */ |
219 | | |
220 | 1.12k | case 3: |
221 | 625 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
222 | 5 | { |
223 | 5 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
224 | 5 | return PCRE2_ERROR_UTF8_ERR7; |
225 | 5 | } |
226 | 620 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
227 | 5 | { |
228 | 5 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
229 | 5 | return PCRE2_ERROR_UTF8_ERR8; |
230 | 5 | } |
231 | 615 | if (c == 0xf0 && (d & 0x30) == 0) |
232 | 0 | { |
233 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
234 | 0 | return PCRE2_ERROR_UTF8_ERR17; |
235 | 0 | } |
236 | 615 | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) |
237 | 3 | { |
238 | 3 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
239 | 3 | return PCRE2_ERROR_UTF8_ERR13; |
240 | 3 | } |
241 | 612 | break; |
242 | | |
243 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
244 | | rejected by the length test below. However, we do the appropriate tests |
245 | | here so that overlong sequences get diagnosed, and also in case there is |
246 | | ever an option for handling these larger code points. */ |
247 | | |
248 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
249 | | 1111 1000, xx00 0xxx */ |
250 | | |
251 | 612 | case 4: |
252 | 2 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
253 | 2 | { |
254 | 2 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
255 | 2 | return PCRE2_ERROR_UTF8_ERR7; |
256 | 2 | } |
257 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
258 | 0 | { |
259 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
260 | 0 | return PCRE2_ERROR_UTF8_ERR8; |
261 | 0 | } |
262 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
263 | 0 | { |
264 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
265 | 0 | return PCRE2_ERROR_UTF8_ERR9; |
266 | 0 | } |
267 | 0 | if (c == 0xf8 && (d & 0x38) == 0) |
268 | 0 | { |
269 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
270 | 0 | return PCRE2_ERROR_UTF8_ERR18; |
271 | 0 | } |
272 | 0 | break; |
273 | | |
274 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
275 | | 1111 1100, xx00 00xx. */ |
276 | | |
277 | 0 | case 5: |
278 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
279 | 0 | { |
280 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 2; |
281 | 0 | return PCRE2_ERROR_UTF8_ERR7; |
282 | 0 | } |
283 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
284 | 0 | { |
285 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 3; |
286 | 0 | return PCRE2_ERROR_UTF8_ERR8; |
287 | 0 | } |
288 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
289 | 0 | { |
290 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 4; |
291 | 0 | return PCRE2_ERROR_UTF8_ERR9; |
292 | 0 | } |
293 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ |
294 | 0 | { |
295 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; |
296 | 0 | return PCRE2_ERROR_UTF8_ERR10; |
297 | 0 | } |
298 | 0 | if (c == 0xfc && (d & 0x3c) == 0) |
299 | 0 | { |
300 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - 5; |
301 | 0 | return PCRE2_ERROR_UTF8_ERR19; |
302 | 0 | } |
303 | 0 | break; |
304 | 2.41k | } |
305 | | |
306 | | /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are |
307 | | excluded by RFC 3629. The pointer p is currently at the last byte of the |
308 | | character. */ |
309 | | |
310 | 2.39k | if (ab > 3) |
311 | 0 | { |
312 | 0 | *erroroffset = (PCRE2_SIZE)(p - string) - ab; |
313 | 0 | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; |
314 | 0 | } |
315 | 2.39k | } |
316 | 1.61k | return 0; |
317 | | |
318 | | |
319 | | /* ----------------- Check a UTF-16 string ----------------- */ |
320 | | |
321 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
322 | | |
323 | | /* There's not so much work, nor so many errors, for UTF-16. |
324 | | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string |
325 | | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate |
326 | | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate |
327 | | */ |
328 | | |
329 | | for (p = string; length > 0; p++) |
330 | | { |
331 | | c = *p; |
332 | | length--; |
333 | | |
334 | | if ((c & 0xf800) != 0xd800) |
335 | | { |
336 | | /* Normal UTF-16 code point. Neither high nor low surrogate. */ |
337 | | } |
338 | | else if ((c & 0x0400) == 0) |
339 | | { |
340 | | /* High surrogate. Must be followed by a low surrogate. */ |
341 | | if (length == 0) |
342 | | { |
343 | | *erroroffset = (PCRE2_SIZE)(p - string); |
344 | | return PCRE2_ERROR_UTF16_ERR1; |
345 | | } |
346 | | p++; |
347 | | length--; |
348 | | if ((*p & 0xfc00) != 0xdc00) |
349 | | { |
350 | | *erroroffset = (PCRE2_SIZE)(p - string) - 1; |
351 | | return PCRE2_ERROR_UTF16_ERR2; |
352 | | } |
353 | | } |
354 | | else |
355 | | { |
356 | | /* Isolated low surrogate. Always an error. */ |
357 | | *erroroffset = (PCRE2_SIZE)(p - string); |
358 | | return PCRE2_ERROR_UTF16_ERR3; |
359 | | } |
360 | | } |
361 | | return 0; |
362 | | |
363 | | |
364 | | |
365 | | /* ----------------- Check a UTF-32 string ----------------- */ |
366 | | |
367 | | #else |
368 | | |
369 | | /* There is very little to do for a UTF-32 string. |
370 | | PCRE2_ERROR_UTF32_ERR1 Surrogate character |
371 | | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff |
372 | | */ |
373 | | |
374 | | for (p = string; length > 0; length--, p++) |
375 | | { |
376 | | c = *p; |
377 | | if ((c & 0xfffff800u) != 0xd800u) |
378 | | { |
379 | | /* Normal UTF-32 code point. Neither high nor low surrogate. */ |
380 | | if (c > 0x10ffffu) |
381 | | { |
382 | | *erroroffset = (PCRE2_SIZE)(p - string); |
383 | | return PCRE2_ERROR_UTF32_ERR2; |
384 | | } |
385 | | } |
386 | | else |
387 | | { |
388 | | /* A surrogate */ |
389 | | *erroroffset = (PCRE2_SIZE)(p - string); |
390 | | return PCRE2_ERROR_UTF32_ERR1; |
391 | | } |
392 | | } |
393 | | return 0; |
394 | | #endif /* CODE_UNIT_WIDTH */ |
395 | 1.78k | } |
396 | | #endif /* SUPPORT_UNICODE */ |
397 | | |
398 | | /* End of pcre2_valid_utf.c */ |