/src/h2o/deps/yaml/src/reader.c
Line | Count | Source |
1 | | |
2 | | #include "yaml_private.h" |
3 | | |
4 | | /* |
5 | | * Declarations. |
 | | * |
 | | * Prototypes for the reader routines defined later in this file: three |
 | | * file-local (static) helpers and the yaml_parser_update_buffer() entry |
 | | * point, which is declared through YAML_DECLARE. |
6 | | */ |
7 | | |
8 | | static int |
9 | | yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, |
10 | | size_t offset, int value); |
11 | | |
12 | | static int |
13 | | yaml_parser_update_raw_buffer(yaml_parser_t *parser); |
14 | | |
15 | | static int |
16 | | yaml_parser_determine_encoding(yaml_parser_t *parser); |
17 | | |
18 | | YAML_DECLARE(int) |
19 | | yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); |
20 | | |
21 | | /* |
22 | | * Set the reader error and return 0. |
 | | * |
 | | * Stores YAML_READER_ERROR in parser->error and copies `problem`, `offset` |
 | | * and `value` into parser->problem, parser->problem_offset and |
 | | * parser->problem_value. The `problem` pointer is stored as-is (the string |
 | | * itself is not copied). Always returns 0, so a caller can report a failure |
 | | * with a single `return yaml_parser_set_reader_error(...)` statement. |
23 | | */ |
24 | | |
25 | | static int |
26 | | yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, |
27 | | size_t offset, int value) |
28 | 0 | { |
29 | 0 | parser->error = YAML_READER_ERROR; |
30 | 0 | parser->problem = problem; |
31 | 0 | parser->problem_offset = offset; |
32 | 0 | parser->problem_value = value; |
33 | |

34 | 0 | return 0; |
35 | 0 | } |
36 | | |
37 | | /* |
38 | | * Byte order marks. |
 | | * |
 | | * Byte sequences that yaml_parser_determine_encoding() compares against the |
 | | * first raw input bytes: 3 bytes for UTF-8, 2 bytes each for UTF-16LE and |
 | | * UTF-16BE. |
39 | | */ |
40 | | |
41 | 0 | #define BOM_UTF8 "\xef\xbb\xbf" |
42 | 0 | #define BOM_UTF16LE "\xff\xfe" |
43 | 0 | #define BOM_UTF16BE "\xfe\xff" |
44 | | |
45 | | /* |
46 | | * Determine the input stream encoding by checking the BOM symbol. If no BOM is |
47 | | * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. |
 | | * |
 | | * A recognized BOM is skipped: parser->raw_buffer.pointer and parser->offset |
 | | * are both advanced past it. The only failure path is a failed |
 | | * yaml_parser_update_raw_buffer() call while collecting the first bytes. |
48 | | */ |
49 | | |
50 | | static int |
51 | | yaml_parser_determine_encoding(yaml_parser_t *parser) |
52 | 0 | { |
53 | | /* Read until at least 3 raw bytes (the longest BOM) are available or EOF is reached. */ |
54 | |

55 | 0 | while (!parser->eof |
56 | 0 | && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { |
57 | 0 | if (!yaml_parser_update_raw_buffer(parser)) { |
58 | 0 | return 0; |
59 | 0 | } |
60 | 0 | } |
61 | | |
62 | | /* Determine the encoding. The 2-byte UTF-16 BOMs are tested before the 3-byte UTF-8 BOM; each test first checks that enough bytes are present. */ |
63 | | |
64 | 0 | if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 |
65 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { |
66 | 0 | parser->encoding = YAML_UTF16LE_ENCODING; |
67 | 0 | parser->raw_buffer.pointer += 2; |
68 | 0 | parser->offset += 2; |
69 | 0 | } |
70 | 0 | else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 |
71 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { |
72 | 0 | parser->encoding = YAML_UTF16BE_ENCODING; |
73 | 0 | parser->raw_buffer.pointer += 2; |
74 | 0 | parser->offset += 2; |
75 | 0 | } |
76 | 0 | else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 |
77 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { |
78 | 0 | parser->encoding = YAML_UTF8_ENCODING; |
79 | 0 | parser->raw_buffer.pointer += 3; |
80 | 0 | parser->offset += 3; |
81 | 0 | } |
82 | 0 | else { |
83 | 0 | parser->encoding = YAML_UTF8_ENCODING; |
84 | 0 | } |
85 | |

86 | 0 | return 1; |
87 | 0 | } |
88 | | |
89 | | /* |
90 | | * Update the raw buffer. |
 | | * |
 | | * Compacts the unconsumed raw bytes to the start of the raw buffer and asks |
 | | * parser->read_handler to fill the free space after them. Sets parser->eof |
 | | * when the handler reports zero bytes read. Returns 1 on success (including |
 | | * the two no-op cases: buffer already full, or EOF already seen) and 0 after |
 | | * recording an "input error" when the handler returns zero. |
91 | | */ |
92 | | |
93 | | static int |
94 | | yaml_parser_update_raw_buffer(yaml_parser_t *parser) |
95 | 0 | { |
96 | 0 | size_t size_read = 0; |
97 | | |
98 | | /* Return if the raw buffer is full (nothing consumed yet and no free space at the end). */ |
99 | |

100 | 0 | if (parser->raw_buffer.start == parser->raw_buffer.pointer |
101 | 0 | && parser->raw_buffer.last == parser->raw_buffer.end) |
102 | 0 | return 1; |
103 | | |
104 | | /* Return on EOF: the read handler is not called again once eof is set. */ |
105 | | |
106 | 0 | if (parser->eof) return 1; |
107 | | |
108 | | /* Move the remaining bytes in the raw buffer to the beginning. The pointer adjustments after the memmove run even when no bytes had to be moved. */ |
109 | | |
110 | 0 | if (parser->raw_buffer.start < parser->raw_buffer.pointer |
111 | 0 | && parser->raw_buffer.pointer < parser->raw_buffer.last) { |
112 | 0 | memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, |
113 | 0 | parser->raw_buffer.last - parser->raw_buffer.pointer); |
114 | 0 | } |
115 | 0 | parser->raw_buffer.last -= |
116 | 0 | parser->raw_buffer.pointer - parser->raw_buffer.start; |
117 | 0 | parser->raw_buffer.pointer = parser->raw_buffer.start; |
118 | | |
119 | | /* Call the read handler to fill the free space after `last`; a zero return from the handler is reported as an input error. */ |
120 | |

121 | 0 | if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, |
122 | 0 | parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { |
123 | 0 | return yaml_parser_set_reader_error(parser, "input error", |
124 | 0 | parser->offset, -1); |
125 | 0 | } |
126 | 0 | parser->raw_buffer.last += size_read; |
127 | 0 | if (!size_read) { |
128 | 0 | parser->eof = 1; |
129 | 0 | } |
130 | |

131 | 0 | return 1; |
132 | 0 | } |
133 | | |
134 | | /* |
135 | | * Ensure that the buffer contains at least `length` characters. |
136 | | * Return 1 on success, 0 on failure. |
137 | | * |
138 | | * The length is supposed to be significantly less than the buffer size. |
 | | * |
 | | * Raw input is decoded from UTF-8 or UTF-16 (LE/BE), validated, and appended |
 | | * to parser->buffer re-encoded as UTF-8; parser->unread counts decoded |
 | | * characters, not bytes. Once EOF has been reached a NUL character is |
 | | * appended and 1 is returned even if fewer than `length` characters are |
 | | * available. On failure the reader error fields of `parser` are filled in |
 | | * by yaml_parser_set_reader_error(). |
139 | | */ |
140 | | |
141 | | YAML_DECLARE(int) |
142 | | yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) |
143 | 0 | { |
144 | 0 | int first = 1; |
145 | |

146 | 0 | assert(parser->read_handler); /* Read handler must be set. */ |
147 | | |
148 | | /* If the EOF flag is set and the raw buffer is empty, do nothing. */ |
149 | | |
150 | 0 | if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) |
151 | 0 | return 1; |
152 | | |
153 | | /* Return if the buffer contains enough characters. */ |
154 | | |
155 | 0 | if (parser->unread >= length) |
156 | 0 | return 1; |
157 | | |
158 | | /* Determine the input encoding if it is not known yet. */ |
159 | | |
160 | 0 | if (!parser->encoding) { |
161 | 0 | if (!yaml_parser_determine_encoding(parser)) |
162 | 0 | return 0; |
163 | 0 | } |
164 | | |
165 | | /* Move the unread characters to the beginning of the buffer. */ |
166 | | |
167 | 0 | if (parser->buffer.start < parser->buffer.pointer |
168 | 0 | && parser->buffer.pointer < parser->buffer.last) { |
169 | 0 | size_t size = parser->buffer.last - parser->buffer.pointer; |
170 | 0 | memmove(parser->buffer.start, parser->buffer.pointer, size); |
171 | 0 | parser->buffer.pointer = parser->buffer.start; |
172 | 0 | parser->buffer.last = parser->buffer.start + size; |
173 | 0 | } |
174 | 0 | else if (parser->buffer.pointer == parser->buffer.last) { |
175 | 0 | parser->buffer.pointer = parser->buffer.start; |
176 | 0 | parser->buffer.last = parser->buffer.start; |
177 | 0 | } |
178 | | |
179 | | /* Fill the buffer until it has enough characters. */ |
180 | |

181 | 0 | while (parser->unread < length) |
182 | 0 | { |
183 | | /* Fill the raw buffer if necessary. On the first pass, raw bytes that are already buffered are decoded before more input is read. */ |
184 | |

185 | 0 | if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { |
186 | 0 | if (!yaml_parser_update_raw_buffer(parser)) return 0; |
187 | 0 | } |
188 | 0 | first = 0; |
189 | | |
190 | | /* Decode the raw buffer. */ |
191 | |

192 | 0 | while (parser->raw_buffer.pointer != parser->raw_buffer.last) |
193 | 0 | { |
194 | 0 | unsigned int value = 0, value2 = 0; |
195 | 0 | int incomplete = 0; |
196 | 0 | unsigned char octet; |
197 | 0 | unsigned int width = 0; |
198 | 0 | int low, high; |
199 | 0 | size_t k; |
200 | 0 | size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; |
201 | | |
202 | | /* Decode the next character. */ |
203 | |

204 | 0 | switch (parser->encoding) |
205 | 0 | { |
206 | 0 | case YAML_UTF8_ENCODING: |
207 | | |
208 | | /* |
209 | | * Decode a UTF-8 character. Check RFC 3629 |
210 | | * (http://www.ietf.org/rfc/rfc3629.txt) for more details. |
211 | | * |
212 | | * The following table (taken from the RFC) is used for |
213 | | * decoding. |
214 | | * |
215 | | * Char. number range | UTF-8 octet sequence |
216 | | * (hexadecimal) | (binary) |
217 | | * --------------------+------------------------------------ |
218 | | * 0000 0000-0000 007F | 0xxxxxxx |
219 | | * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
220 | | * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
221 | | * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
222 | | * |
223 | | * Additionally, the characters in the range 0xD800-0xDFFF |
224 | | * are prohibited as they are reserved for use with UTF-16 |
225 | | * surrogate pairs. |
226 | | */ |
227 | | |
228 | | /* Determine the length of the UTF-8 sequence. */ |
229 | |

230 | 0 | octet = parser->raw_buffer.pointer[0]; |
231 | 0 | width = (octet & 0x80) == 0x00 ? 1 : |
232 | 0 | (octet & 0xE0) == 0xC0 ? 2 : |
233 | 0 | (octet & 0xF0) == 0xE0 ? 3 : |
234 | 0 | (octet & 0xF8) == 0xF0 ? 4 : 0; |
235 | | |
236 | | /* Check if the leading octet is valid. */ |
237 | |

238 | 0 | if (!width) |
239 | 0 | return yaml_parser_set_reader_error(parser, |
240 | 0 | "invalid leading UTF-8 octet", |
241 | 0 | parser->offset, octet); |
242 | | |
243 | | /* Check if the raw buffer contains an incomplete character. */ |
244 | | |
245 | 0 | if (width > raw_unread) { |
246 | 0 | if (parser->eof) { |
247 | 0 | return yaml_parser_set_reader_error(parser, |
248 | 0 | "incomplete UTF-8 octet sequence", |
249 | 0 | parser->offset, -1); |
250 | 0 | } |
251 | 0 | incomplete = 1; |
252 | 0 | break; |
253 | 0 | } |
254 | | |
255 | | /* Decode the leading octet. */ |
256 | | |
257 | 0 | value = (octet & 0x80) == 0x00 ? octet & 0x7F : |
258 | 0 | (octet & 0xE0) == 0xC0 ? octet & 0x1F : |
259 | 0 | (octet & 0xF0) == 0xE0 ? octet & 0x0F : |
260 | 0 | (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; |
261 | | |
262 | | /* Check and decode the trailing octets. */ |
263 | |

264 | 0 | for (k = 1; k < width; k ++) |
265 | 0 | { |
266 | 0 | octet = parser->raw_buffer.pointer[k]; |
267 | | |
268 | | /* Check if the octet is valid. */ |
269 | |

270 | 0 | if ((octet & 0xC0) != 0x80) |
271 | 0 | return yaml_parser_set_reader_error(parser, |
272 | 0 | "invalid trailing UTF-8 octet", |
273 | 0 | parser->offset+k, octet); |
274 | | |
275 | | /* Decode the octet. */ |
276 | | |
277 | 0 | value = (value << 6) + (octet & 0x3F); |
278 | 0 | } |
279 | | |
280 | | /* Check the length of the sequence against the value (rejects overlong encodings). */ |
281 | | |
282 | 0 | if (!((width == 1) || |
283 | 0 | (width == 2 && value >= 0x80) || |
284 | 0 | (width == 3 && value >= 0x800) || |
285 | 0 | (width == 4 && value >= 0x10000))) |
286 | 0 | return yaml_parser_set_reader_error(parser, |
287 | 0 | "invalid length of a UTF-8 sequence", |
288 | 0 | parser->offset, -1); |
289 | | |
290 | | /* Check the range of the value. */ |
291 | | |
292 | 0 | if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) |
293 | 0 | return yaml_parser_set_reader_error(parser, |
294 | 0 | "invalid Unicode character", |
295 | 0 | parser->offset, value); |
296 | | |
297 | 0 | break; |
298 | | |
299 | 0 | case YAML_UTF16LE_ENCODING: |
300 | 0 | case YAML_UTF16BE_ENCODING: |
301 | |

302 | 0 | low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); |
303 | 0 | high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); |
304 | | |
305 | | /* |
306 | | * The UTF-16 encoding is not as simple as one might |
307 | | * naively think. Check RFC 2781 |
308 | | * (http://www.ietf.org/rfc/rfc2781.txt). |
309 | | * |
310 | | * Normally, two subsequent bytes describe a Unicode |
311 | | * character. However a special technique (called a |
312 | | * surrogate pair) is used for specifying character |
313 | | * values larger than 0xFFFF. |
314 | | * |
315 | | * A surrogate pair consists of two pseudo-characters: |
316 | | * high surrogate area (0xD800-0xDBFF) |
317 | | * low surrogate area (0xDC00-0xDFFF) |
318 | | * |
319 | | * The following formulas are used for decoding |
320 | | * and encoding characters using surrogate pairs: |
321 | | * |
322 | | * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) |
323 | | * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) |
324 | | * W1 = 110110yyyyyyyyyy |
325 | | * W2 = 110111xxxxxxxxxx |
326 | | * |
327 | | * where U is the character value, W1 is the high surrogate |
328 | | * area, W2 is the low surrogate area. |
329 | | */ |
330 | | |
331 | | /* Check for incomplete UTF-16 character. */ |
332 | |

333 | 0 | if (raw_unread < 2) { |
334 | 0 | if (parser->eof) { |
335 | 0 | return yaml_parser_set_reader_error(parser, |
336 | 0 | "incomplete UTF-16 character", |
337 | 0 | parser->offset, -1); |
338 | 0 | } |
339 | 0 | incomplete = 1; |
340 | 0 | break; |
341 | 0 | } |
342 | | |
343 | | /* Get the character. */ |
344 | | |
345 | 0 | value = parser->raw_buffer.pointer[low] |
346 | 0 | + (parser->raw_buffer.pointer[high] << 8); |
347 | | |
348 | | /* Check for unexpected low surrogate area. */ |
349 | |

350 | 0 | if ((value & 0xFC00) == 0xDC00) |
351 | 0 | return yaml_parser_set_reader_error(parser, |
352 | 0 | "unexpected low surrogate area", |
353 | 0 | parser->offset, value); |
354 | | |
355 | | /* Check for a high surrogate area. */ |
356 | | |
357 | 0 | if ((value & 0xFC00) == 0xD800) { |
358 | |

359 | 0 | width = 4; |
360 | | |
361 | | /* Check for incomplete surrogate pair. */ |
362 | |

363 | 0 | if (raw_unread < 4) { |
364 | 0 | if (parser->eof) { |
365 | 0 | return yaml_parser_set_reader_error(parser, |
366 | 0 | "incomplete UTF-16 surrogate pair", |
367 | 0 | parser->offset, -1); |
368 | 0 | } |
369 | 0 | incomplete = 1; |
370 | 0 | break; |
371 | 0 | } |
372 | | |
373 | | /* Get the next character. */ |
374 | | |
375 | 0 | value2 = parser->raw_buffer.pointer[low+2] |
376 | 0 | + (parser->raw_buffer.pointer[high+2] << 8); |
377 | | |
378 | | /* Check for a low surrogate area. */ |
379 | |

380 | 0 | if ((value2 & 0xFC00) != 0xDC00) |
381 | 0 | return yaml_parser_set_reader_error(parser, |
382 | 0 | "expected low surrogate area", |
383 | 0 | parser->offset+2, value2); |
384 | | |
385 | | /* Generate the value of the surrogate pair. */ |
386 | | |
387 | 0 | value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); |
388 | 0 | } |
389 | | |
390 | 0 | else { |
391 | 0 | width = 2; |
392 | 0 | } |
393 | | |
394 | 0 | break; |
395 | | |
396 | 0 | default: |
397 | 0 | assert(0); /* Impossible: the encoding was determined before the loop. assert(0) actually fires here, unlike the no-op assert(1). */ |
398 | 0 | } |
399 | | |
400 | | /* If the raw buffer does not hold a complete character, stop decoding and go read more raw input. */ |
401 | | |
402 | 0 | if (incomplete) break; |
403 | | |
404 | | /* |
405 | | * Check if the character is in the allowed range: |
406 | | * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) |
407 | | * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) |
408 | | * | [#x10000-#x10FFFF] (32 bit) |
409 | | */ |
410 | | |
411 | 0 | if (! (value == 0x09 || value == 0x0A || value == 0x0D |
412 | 0 | || (value >= 0x20 && value <= 0x7E) |
413 | 0 | || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) |
414 | 0 | || (value >= 0xE000 && value <= 0xFFFD) |
415 | 0 | || (value >= 0x10000 && value <= 0x10FFFF))) |
416 | 0 | return yaml_parser_set_reader_error(parser, |
417 | 0 | "control characters are not allowed", |
418 | 0 | parser->offset, value); |
419 | | |
420 | | /* Move the raw pointers. */ |
421 | | |
422 | 0 | parser->raw_buffer.pointer += width; |
423 | 0 | parser->offset += width; |
424 | | |
425 | | /* Finally put the character into the buffer (always re-encoded as UTF-8). */ |
426 | | |
427 | | /* 0000 0000-0000 007F -> 0xxxxxxx */ |
428 | 0 | if (value <= 0x7F) { |
429 | 0 | *(parser->buffer.last++) = value; |
430 | 0 | } |
431 | | /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ |
432 | 0 | else if (value <= 0x7FF) { |
433 | 0 | *(parser->buffer.last++) = 0xC0 + (value >> 6); |
434 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
435 | 0 | } |
436 | | /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ |
437 | 0 | else if (value <= 0xFFFF) { |
438 | 0 | *(parser->buffer.last++) = 0xE0 + (value >> 12); |
439 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); |
440 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
441 | 0 | } |
442 | | /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
443 | 0 | else { |
444 | 0 | *(parser->buffer.last++) = 0xF0 + (value >> 18); |
445 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); |
446 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); |
447 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
448 | 0 | } |
449 | |

450 | 0 | parser->unread ++; |
451 | 0 | } |
452 | | |
453 | | /* On EOF, put NUL into the buffer and return. */ |
454 | | |
455 | 0 | if (parser->eof) { |
456 | 0 | *(parser->buffer.last++) = '\0'; |
457 | 0 | parser->unread ++; |
458 | 0 | return 1; |
459 | 0 | } |
460 | |

461 | 0 | } |
462 | | |
463 | 0 | if (parser->offset >= MAX_FILE_SIZE) { |
464 | 0 | return yaml_parser_set_reader_error(parser, "input is too long", |
465 | 0 | parser->offset, -1); |
466 | 0 | } |
467 | | |
468 | 0 | return 1; |
469 | 0 | } |