/src/glib/glib/pcre/pcre_valid_utf8.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Copyright (c) 1997-2012 University of Cambridge |
10 | | |
11 | | ----------------------------------------------------------------------------- |
12 | | Redistribution and use in source and binary forms, with or without |
13 | | modification, are permitted provided that the following conditions are met: |
14 | | |
15 | | * Redistributions of source code must retain the above copyright notice, |
16 | | this list of conditions and the following disclaimer. |
17 | | |
18 | | * Redistributions in binary form must reproduce the above copyright |
19 | | notice, this list of conditions and the following disclaimer in the |
20 | | documentation and/or other materials provided with the distribution. |
21 | | |
22 | | * Neither the name of the University of Cambridge nor the names of its |
23 | | contributors may be used to endorse or promote products derived from |
24 | | this software without specific prior written permission. |
25 | | |
26 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
27 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
28 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
29 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
30 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
31 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
32 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
33 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
34 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
35 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
36 | | POSSIBILITY OF SUCH DAMAGE. |
37 | | ----------------------------------------------------------------------------- |
38 | | */ |
39 | | |
40 | | |
41 | | /* This module contains an internal function for validating UTF-8 character |
42 | | strings. */ |
43 | | |
44 | | |
45 | | #include "config.h" |
46 | | |
47 | | #include "pcre_internal.h" |
48 | | |
49 | | |
50 | | /************************************************* |
51 | | * Validate a UTF-8 string * |
52 | | *************************************************/ |
53 | | |
54 | | /* This function is called (optionally) at the start of compile or match, to |
55 | | check that a supposed UTF-8 string is actually valid. The early check means |
56 | | that subsequent code can assume it is dealing with a valid string. The check |
57 | | can be turned off for maximum performance, but the consequences of supplying an |
58 | | invalid string are then undefined. |
59 | | |
60 | | Originally, this function checked according to RFC 2279, allowing for values in |
61 | | the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in |
62 | | the canonical format. Once somebody had pointed out RFC 3629 to me (it |
63 | | obsoletes 2279), additional restrictions were applied. The values are now |
64 | | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
65 | | subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
66 | | characters is still checked. |
67 | | |
68 | | From release 8.13 more information about the details of the error are passed |
69 | | back in the returned value: |
70 | | |
71 | | PCRE_UTF8_ERR0 No error |
72 | | PCRE_UTF8_ERR1 Missing 1 byte at the end of the string |
73 | | PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string |
74 | | PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string |
75 | | PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string |
76 | | PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string |
77 | | PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
78 | | PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
79 | | PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
80 | | PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
81 | | PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
82 | | PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
83 | | PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
84 | | PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
85 | | PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
86 | | PCRE_UTF8_ERR15 Overlong 2-byte sequence |
87 | | PCRE_UTF8_ERR16 Overlong 3-byte sequence |
88 | | PCRE_UTF8_ERR17 Overlong 4-byte sequence |
89 | | PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) |
90 | | PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) |
91 | | PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) |
92 | | PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
93 | | |
94 | | Arguments: |
95 | | string points to the string |
96 | | length length of string, or -1 if the string is zero-terminated |
97 | | erroroffset pointer to an error position offset variable |
98 | | |
99 | | Returns: = 0 if the string is a valid UTF-8 string |
100 | | > 0 otherwise, setting the offset of the bad character |
101 | | */ |
102 | | |
103 | | int |
104 | | PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) |
105 | 0 | {
106 | 0 | #ifdef SUPPORT_UTF
107 | 0 | PCRE_PUCHAR p;
108 | |

109 | 0 | if (length < 0) /* Negative length: the string is zero-terminated */
110 | 0 | {
111 | 0 | for (p = string; *p != 0; p++); /* Scan to the terminating zero */
112 | 0 | length = (int)(p - string);
113 | 0 | }
114 | |

115 | 0 | for (p = string; length-- > 0; p++) /* length is decremented as each byte is consumed */
116 | 0 | {
117 | 0 | int ab, c, d;
118 | |

119 | 0 | c = *p; /* First byte of the next character */
120 | 0 | if (c < 128) continue; /* ASCII character */
121 | | |
122 | 0 | if (c < 0xc0) /* Isolated 10xx xxxx byte */
123 | 0 | {
124 | 0 | *erroroffset = (int)(p - string);
125 | 0 | return PCRE_UTF8_ERR20;
126 | 0 | }
127 | | |
128 | 0 | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
129 | 0 | {
130 | 0 | *erroroffset = (int)(p - string);
131 | 0 | return PCRE_UTF8_ERR21;
132 | 0 | }
133 | | |
134 | 0 | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
135 | 0 | if (length < ab)
136 | 0 | {
137 | 0 | *erroroffset = (int)(p - string); /* Missing bytes */
138 | 0 | return ab - length; /* Codes ERR1 to ERR5 */
139 | 0 | }
140 | 0 | length -= ab; /* Length remaining */
141 | | |
142 | | /* Check top bits in the second byte */ |
143 | |

144 | 0 | if (((d = *(++p)) & 0xc0) != 0x80)
145 | 0 | {
146 | 0 | *erroroffset = (int)(p - string) - 1; /* p was advanced once; report the first byte */
147 | 0 | return PCRE_UTF8_ERR6;
148 | 0 | }
149 | | |
150 | | /* For each length, check that the remaining bytes start with the 0x80 bit |
151 | | set and not the 0x40 bit. Then check for an overlong sequence, and for the |
152 | | excluded range 0xd800 to 0xdfff. */ |
153 | | |
154 | 0 | switch (ab)
155 | 0 | {
156 | | /* 2-byte character. No further bytes to check for 0x80. Check first byte |
157 | | for xx00 000x (overlong sequence). */ |
158 | | |
159 | 0 | case 1: if ((c & 0x3e) == 0)
160 | 0 | {
161 | 0 | *erroroffset = (int)(p - string) - 1;
162 | 0 | return PCRE_UTF8_ERR15;
163 | 0 | }
164 | 0 | break;
165 | | |
166 | | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
167 | | for 1110 0000, xx0x xxxx (overlong sequence) or |
168 | | 1110 1101, 101x xxxx (0xd800 - 0xdfff) */ |
169 | | |
170 | 0 | case 2:
171 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */
172 | 0 | {
173 | 0 | *erroroffset = (int)(p - string) - 2;
174 | 0 | return PCRE_UTF8_ERR7;
175 | 0 | }
176 | 0 | if (c == 0xe0 && (d & 0x20) == 0)
177 | 0 | {
178 | 0 | *erroroffset = (int)(p - string) - 2;
179 | 0 | return PCRE_UTF8_ERR16;
180 | 0 | }
181 | 0 | if (c == 0xed && d >= 0xa0)
182 | 0 | {
183 | 0 | *erroroffset = (int)(p - string) - 2;
184 | 0 | return PCRE_UTF8_ERR14;
185 | 0 | }
186 | 0 | break;
187 | | |
188 | | /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
189 | | bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
190 | | character greater than 0x0010ffff (f4 8f bf bf) */ |
191 | | |
192 | 0 | case 3:
193 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */
194 | 0 | {
195 | 0 | *erroroffset = (int)(p - string) - 2;
196 | 0 | return PCRE_UTF8_ERR7;
197 | 0 | }
198 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
199 | 0 | {
200 | 0 | *erroroffset = (int)(p - string) - 3;
201 | 0 | return PCRE_UTF8_ERR8;
202 | 0 | }
203 | 0 | if (c == 0xf0 && (d & 0x30) == 0)
204 | 0 | {
205 | 0 | *erroroffset = (int)(p - string) - 3;
206 | 0 | return PCRE_UTF8_ERR17;
207 | 0 | }
208 | 0 | if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
209 | 0 | {
210 | 0 | *erroroffset = (int)(p - string) - 3;
211 | 0 | return PCRE_UTF8_ERR13;
212 | 0 | }
213 | 0 | break;
214 | | |
215 | | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
216 | | rejected by the length test below. However, we do the appropriate tests |
217 | | here so that overlong sequences get diagnosed, and also in case there is |
218 | | ever an option for handling these larger code points. */ |
219 | | |
220 | | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
221 | | 1111 1000, xx00 0xxx */ |
222 | | |
223 | 0 | case 4:
224 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */
225 | 0 | {
226 | 0 | *erroroffset = (int)(p - string) - 2;
227 | 0 | return PCRE_UTF8_ERR7;
228 | 0 | }
229 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
230 | 0 | {
231 | 0 | *erroroffset = (int)(p - string) - 3;
232 | 0 | return PCRE_UTF8_ERR8;
233 | 0 | }
234 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
235 | 0 | {
236 | 0 | *erroroffset = (int)(p - string) - 4;
237 | 0 | return PCRE_UTF8_ERR9;
238 | 0 | }
239 | 0 | if (c == 0xf8 && (d & 0x38) == 0)
240 | 0 | {
241 | 0 | *erroroffset = (int)(p - string) - 4;
242 | 0 | return PCRE_UTF8_ERR18;
243 | 0 | }
244 | 0 | break;
245 | | |
246 | | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
247 | | 1111 1100, xx00 00xx. */ |
248 | | |
249 | 0 | case 5:
250 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */
251 | 0 | {
252 | 0 | *erroroffset = (int)(p - string) - 2;
253 | 0 | return PCRE_UTF8_ERR7;
254 | 0 | }
255 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
256 | 0 | {
257 | 0 | *erroroffset = (int)(p - string) - 3;
258 | 0 | return PCRE_UTF8_ERR8;
259 | 0 | }
260 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
261 | 0 | {
262 | 0 | *erroroffset = (int)(p - string) - 4;
263 | 0 | return PCRE_UTF8_ERR9;
264 | 0 | }
265 | 0 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
266 | 0 | {
267 | 0 | *erroroffset = (int)(p - string) - 5;
268 | 0 | return PCRE_UTF8_ERR10;
269 | 0 | }
270 | 0 | if (c == 0xfc && (d & 0x3c) == 0)
271 | 0 | {
272 | 0 | *erroroffset = (int)(p - string) - 5;
273 | 0 | return PCRE_UTF8_ERR19;
274 | 0 | }
275 | 0 | break;
276 | 0 | }
277 | | |
278 | | /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are |
279 | | excluded by RFC 3629. The pointer p is currently at the last byte of the |
280 | | character. */ |
281 | | |
282 | 0 | if (ab > 3)
283 | 0 | {
284 | 0 | *erroroffset = (int)(p - string) - ab; /* Back up from the last byte to the first */
285 | 0 | return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
286 | 0 | }
287 | 0 | }
288 | | |
289 | | #else /* SUPPORT_UTF */ |
290 | | (void)(string); /* Keep picky compilers happy */ |
291 | | (void)(length); |
292 | | #endif |
293 | | |
294 | 0 | return PCRE_UTF8_ERR0; /* This indicates success */
295 | 0 | }
296 | | |
297 | | /* End of pcre_valid_utf8.c */ |