/src/moddable/xs/tools/yaml/reader.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2006-2016 Kirill Simonov |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining a copy of |
5 | | * this software and associated documentation files (the "Software"), to deal in |
6 | | * the Software without restriction, including without limitation the rights to |
7 | | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies |
8 | | * of the Software, and to permit persons to whom the Software is furnished to do |
9 | | * so, subject to the following conditions: |
10 | | * |
11 | | * The above copyright notice and this permission notice shall be included in all |
12 | | * copies or substantial portions of the Software. |
13 | | * |
14 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
17 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
18 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
19 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
20 | | * SOFTWARE. |
21 | | * |
22 | | */ |
23 | | |
24 | | #include "yaml_private.h" |
25 | | |
26 | | /* |
27 | | * Declarations. |
28 | | */ |
29 | | |
30 | | static int |
31 | | yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, |
32 | | size_t offset, int value); |
33 | | |
34 | | static int |
35 | | yaml_parser_update_raw_buffer(yaml_parser_t *parser); |
36 | | |
37 | | static int |
38 | | yaml_parser_determine_encoding(yaml_parser_t *parser); |
39 | | |
40 | | YAML_DECLARE(int) |
41 | | yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); |
42 | | |
43 | | /* |
44 | | * Set the reader error and return 0. |
45 | | */ |
46 | | |
47 | | static int |
48 | | yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, |
49 | | size_t offset, int value) |
50 | 0 | { |
51 | 0 | parser->error = YAML_READER_ERROR; |
52 | 0 | parser->problem = problem; |
53 | 0 | parser->problem_offset = offset; |
54 | 0 | parser->problem_value = value; |
55 | |
|
56 | 0 | return 0; |
57 | 0 | } |
58 | | |
59 | | /* |
60 | | * Byte order marks. |
61 | | */ |
62 | | |
63 | 0 | #define BOM_UTF8 "\xef\xbb\xbf" |
64 | 0 | #define BOM_UTF16LE "\xff\xfe" |
65 | 0 | #define BOM_UTF16BE "\xfe\xff" |
66 | | |
67 | | /* |
68 | | * Determine the input stream encoding by checking the BOM symbol. If no BOM is |
69 | | * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. |
70 | | */ |
71 | | |
72 | | static int |
73 | | yaml_parser_determine_encoding(yaml_parser_t *parser) |
74 | 0 | { |
75 | | /* Ensure that we had enough bytes in the raw buffer. */ |
76 | |
|
77 | 0 | while (!parser->eof |
78 | 0 | && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { |
79 | 0 | if (!yaml_parser_update_raw_buffer(parser)) { |
80 | 0 | return 0; |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | | /* Determine the encoding. */ |
85 | | |
86 | 0 | if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 |
87 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { |
88 | 0 | parser->encoding = YAML_UTF16LE_ENCODING; |
89 | 0 | parser->raw_buffer.pointer += 2; |
90 | 0 | parser->offset += 2; |
91 | 0 | } |
92 | 0 | else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 |
93 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { |
94 | 0 | parser->encoding = YAML_UTF16BE_ENCODING; |
95 | 0 | parser->raw_buffer.pointer += 2; |
96 | 0 | parser->offset += 2; |
97 | 0 | } |
98 | 0 | else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 |
99 | 0 | && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { |
100 | 0 | parser->encoding = YAML_UTF8_ENCODING; |
101 | 0 | parser->raw_buffer.pointer += 3; |
102 | 0 | parser->offset += 3; |
103 | 0 | } |
104 | 0 | else { |
105 | 0 | parser->encoding = YAML_UTF8_ENCODING; |
106 | 0 | } |
107 | |
|
108 | 0 | return 1; |
109 | 0 | } |
110 | | |
111 | | /* |
112 | | * Update the raw buffer. |
113 | | */ |
114 | | |
115 | | static int |
116 | | yaml_parser_update_raw_buffer(yaml_parser_t *parser) |
117 | 0 | { |
118 | 0 | size_t size_read = 0; |
119 | | |
120 | | /* Return if the raw buffer is full. */ |
121 | |
|
122 | 0 | if (parser->raw_buffer.start == parser->raw_buffer.pointer |
123 | 0 | && parser->raw_buffer.last == parser->raw_buffer.end) |
124 | 0 | return 1; |
125 | | |
126 | | /* Return on EOF. */ |
127 | | |
128 | 0 | if (parser->eof) return 1; |
129 | | |
130 | | /* Move the remaining bytes in the raw buffer to the beginning. */ |
131 | | |
132 | 0 | if (parser->raw_buffer.start < parser->raw_buffer.pointer |
133 | 0 | && parser->raw_buffer.pointer < parser->raw_buffer.last) { |
134 | 0 | memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, |
135 | 0 | parser->raw_buffer.last - parser->raw_buffer.pointer); |
136 | 0 | } |
137 | 0 | parser->raw_buffer.last -= |
138 | 0 | parser->raw_buffer.pointer - parser->raw_buffer.start; |
139 | 0 | parser->raw_buffer.pointer = parser->raw_buffer.start; |
140 | | |
141 | | /* Call the read handler to fill the buffer. */ |
142 | |
|
143 | 0 | if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, |
144 | 0 | parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { |
145 | 0 | return yaml_parser_set_reader_error(parser, "input error", |
146 | 0 | parser->offset, -1); |
147 | 0 | } |
148 | 0 | parser->raw_buffer.last += size_read; |
149 | 0 | if (!size_read) { |
150 | 0 | parser->eof = 1; |
151 | 0 | } |
152 | |
|
153 | 0 | return 1; |
154 | 0 | } |
155 | | |
156 | | /* |
157 | | * Ensure that the buffer contains at least `length` characters. |
158 | | * Return 1 on success, 0 on failure. |
159 | | * |
160 | | * The length is supposed to be significantly less that the buffer size. |
161 | | */ |
162 | | |
163 | | YAML_DECLARE(int) |
164 | | yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) |
165 | 0 | { |
166 | 0 | int first = 1; |
167 | |
|
168 | 0 | assert(parser->read_handler); /* Read handler must be set. */ |
169 | | |
170 | | /* If the EOF flag is set and the raw buffer is empty, do nothing. */ |
171 | | |
172 | 0 | if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) |
173 | 0 | return 1; |
174 | | |
175 | | /* Return if the buffer contains enough characters. */ |
176 | | |
177 | 0 | if (parser->unread >= length) |
178 | 0 | return 1; |
179 | | |
180 | | /* Determine the input encoding if it is not known yet. */ |
181 | | |
182 | 0 | if (!parser->encoding) { |
183 | 0 | if (!yaml_parser_determine_encoding(parser)) |
184 | 0 | return 0; |
185 | 0 | } |
186 | | |
187 | | /* Move the unread characters to the beginning of the buffer. */ |
188 | | |
189 | 0 | if (parser->buffer.start < parser->buffer.pointer |
190 | 0 | && parser->buffer.pointer < parser->buffer.last) { |
191 | 0 | size_t size = parser->buffer.last - parser->buffer.pointer; |
192 | 0 | memmove(parser->buffer.start, parser->buffer.pointer, size); |
193 | 0 | parser->buffer.pointer = parser->buffer.start; |
194 | 0 | parser->buffer.last = parser->buffer.start + size; |
195 | 0 | } |
196 | 0 | else if (parser->buffer.pointer == parser->buffer.last) { |
197 | 0 | parser->buffer.pointer = parser->buffer.start; |
198 | 0 | parser->buffer.last = parser->buffer.start; |
199 | 0 | } |
200 | | |
201 | | /* Fill the buffer until it has enough characters. */ |
202 | |
|
203 | 0 | while (parser->unread < length) |
204 | 0 | { |
205 | | /* Fill the raw buffer if necessary. */ |
206 | |
|
207 | 0 | if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { |
208 | 0 | if (!yaml_parser_update_raw_buffer(parser)) return 0; |
209 | 0 | } |
210 | 0 | first = 0; |
211 | | |
212 | | /* Decode the raw buffer. */ |
213 | |
|
214 | 0 | while (parser->raw_buffer.pointer != parser->raw_buffer.last) |
215 | 0 | { |
216 | 0 | unsigned int value = 0, value2 = 0; |
217 | 0 | int incomplete = 0; |
218 | 0 | unsigned char octet; |
219 | 0 | unsigned int width = 0; |
220 | 0 | int low, high; |
221 | 0 | size_t k; |
222 | 0 | size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; |
223 | | |
224 | | /* Decode the next character. */ |
225 | |
|
226 | 0 | switch (parser->encoding) |
227 | 0 | { |
228 | 0 | case YAML_UTF8_ENCODING: |
229 | | |
230 | | /* |
231 | | * Decode a UTF-8 character. Check RFC 3629 |
232 | | * (http://www.ietf.org/rfc/rfc3629.txt) for more details. |
233 | | * |
234 | | * The following table (taken from the RFC) is used for |
235 | | * decoding. |
236 | | * |
237 | | * Char. number range | UTF-8 octet sequence |
238 | | * (hexadecimal) | (binary) |
239 | | * --------------------+------------------------------------ |
240 | | * 0000 0000-0000 007F | 0xxxxxxx |
241 | | * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
242 | | * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
243 | | * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
244 | | * |
245 | | * Additionally, the characters in the range 0xD800-0xDFFF |
246 | | * are prohibited as they are reserved for use with UTF-16 |
247 | | * surrogate pairs. |
248 | | */ |
249 | | |
250 | | /* Determine the length of the UTF-8 sequence. */ |
251 | |
|
252 | 0 | octet = parser->raw_buffer.pointer[0]; |
253 | 0 | width = (octet & 0x80) == 0x00 ? 1 : |
254 | 0 | (octet & 0xE0) == 0xC0 ? 2 : |
255 | 0 | (octet & 0xF0) == 0xE0 ? 3 : |
256 | 0 | (octet & 0xF8) == 0xF0 ? 4 : 0; |
257 | | |
258 | | /* Check if the leading octet is valid. */ |
259 | |
|
260 | 0 | if (!width) |
261 | 0 | return yaml_parser_set_reader_error(parser, |
262 | 0 | "invalid leading UTF-8 octet", |
263 | 0 | parser->offset, octet); |
264 | | |
265 | | /* Check if the raw buffer contains an incomplete character. */ |
266 | | |
267 | 0 | if (width > raw_unread) { |
268 | 0 | if (parser->eof) { |
269 | 0 | return yaml_parser_set_reader_error(parser, |
270 | 0 | "incomplete UTF-8 octet sequence", |
271 | 0 | parser->offset, -1); |
272 | 0 | } |
273 | 0 | incomplete = 1; |
274 | 0 | break; |
275 | 0 | } |
276 | | |
277 | | /* Decode the leading octet. */ |
278 | | |
279 | 0 | value = (octet & 0x80) == 0x00 ? octet & 0x7F : |
280 | 0 | (octet & 0xE0) == 0xC0 ? octet & 0x1F : |
281 | 0 | (octet & 0xF0) == 0xE0 ? octet & 0x0F : |
282 | 0 | (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; |
283 | | |
284 | | /* Check and decode the trailing octets. */ |
285 | |
|
286 | 0 | for (k = 1; k < width; k ++) |
287 | 0 | { |
288 | 0 | octet = parser->raw_buffer.pointer[k]; |
289 | | |
290 | | /* Check if the octet is valid. */ |
291 | |
|
292 | 0 | if ((octet & 0xC0) != 0x80) |
293 | 0 | return yaml_parser_set_reader_error(parser, |
294 | 0 | "invalid trailing UTF-8 octet", |
295 | 0 | parser->offset+k, octet); |
296 | | |
297 | | /* Decode the octet. */ |
298 | | |
299 | 0 | value = (value << 6) + (octet & 0x3F); |
300 | 0 | } |
301 | | |
302 | | /* Check the length of the sequence against the value. */ |
303 | | |
304 | 0 | if (!((width == 1) || |
305 | 0 | (width == 2 && value >= 0x80) || |
306 | 0 | (width == 3 && value >= 0x800) || |
307 | 0 | (width == 4 && value >= 0x10000))) |
308 | 0 | return yaml_parser_set_reader_error(parser, |
309 | 0 | "invalid length of a UTF-8 sequence", |
310 | 0 | parser->offset, -1); |
311 | | |
312 | | /* Check the range of the value. */ |
313 | | |
314 | 0 | if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) |
315 | 0 | return yaml_parser_set_reader_error(parser, |
316 | 0 | "invalid Unicode character", |
317 | 0 | parser->offset, value); |
318 | | |
319 | 0 | break; |
320 | | |
321 | 0 | case YAML_UTF16LE_ENCODING: |
322 | 0 | case YAML_UTF16BE_ENCODING: |
323 | |
|
324 | 0 | low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); |
325 | 0 | high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); |
326 | | |
327 | | /* |
328 | | * The UTF-16 encoding is not as simple as one might |
329 | | * naively think. Check RFC 2781 |
330 | | * (http://www.ietf.org/rfc/rfc2781.txt). |
331 | | * |
332 | | * Normally, two subsequent bytes describe a Unicode |
333 | | * character. However a special technique (called a |
334 | | * surrogate pair) is used for specifying character |
335 | | * values larger than 0xFFFF. |
336 | | * |
337 | | * A surrogate pair consists of two pseudo-characters: |
338 | | * high surrogate area (0xD800-0xDBFF) |
339 | | * low surrogate area (0xDC00-0xDFFF) |
340 | | * |
341 | | * The following formulas are used for decoding |
342 | | * and encoding characters using surrogate pairs: |
343 | | * |
344 | | * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) |
345 | | * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) |
346 | | * W1 = 110110yyyyyyyyyy |
347 | | * W2 = 110111xxxxxxxxxx |
348 | | * |
349 | | * where U is the character value, W1 is the high surrogate |
350 | | * area, W2 is the low surrogate area. |
351 | | */ |
352 | | |
353 | | /* Check for incomplete UTF-16 character. */ |
354 | |
|
355 | 0 | if (raw_unread < 2) { |
356 | 0 | if (parser->eof) { |
357 | 0 | return yaml_parser_set_reader_error(parser, |
358 | 0 | "incomplete UTF-16 character", |
359 | 0 | parser->offset, -1); |
360 | 0 | } |
361 | 0 | incomplete = 1; |
362 | 0 | break; |
363 | 0 | } |
364 | | |
365 | | /* Get the character. */ |
366 | | |
367 | 0 | value = parser->raw_buffer.pointer[low] |
368 | 0 | + (parser->raw_buffer.pointer[high] << 8); |
369 | | |
370 | | /* Check for unexpected low surrogate area. */ |
371 | |
|
372 | 0 | if ((value & 0xFC00) == 0xDC00) |
373 | 0 | return yaml_parser_set_reader_error(parser, |
374 | 0 | "unexpected low surrogate area", |
375 | 0 | parser->offset, value); |
376 | | |
377 | | /* Check for a high surrogate area. */ |
378 | | |
379 | 0 | if ((value & 0xFC00) == 0xD800) { |
380 | |
|
381 | 0 | width = 4; |
382 | | |
383 | | /* Check for incomplete surrogate pair. */ |
384 | |
|
385 | 0 | if (raw_unread < 4) { |
386 | 0 | if (parser->eof) { |
387 | 0 | return yaml_parser_set_reader_error(parser, |
388 | 0 | "incomplete UTF-16 surrogate pair", |
389 | 0 | parser->offset, -1); |
390 | 0 | } |
391 | 0 | incomplete = 1; |
392 | 0 | break; |
393 | 0 | } |
394 | | |
395 | | /* Get the next character. */ |
396 | | |
397 | 0 | value2 = parser->raw_buffer.pointer[low+2] |
398 | 0 | + (parser->raw_buffer.pointer[high+2] << 8); |
399 | | |
400 | | /* Check for a low surrogate area. */ |
401 | |
|
402 | 0 | if ((value2 & 0xFC00) != 0xDC00) |
403 | 0 | return yaml_parser_set_reader_error(parser, |
404 | 0 | "expected low surrogate area", |
405 | 0 | parser->offset+2, value2); |
406 | | |
407 | | /* Generate the value of the surrogate pair. */ |
408 | | |
409 | 0 | value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); |
410 | 0 | } |
411 | | |
412 | 0 | else { |
413 | 0 | width = 2; |
414 | 0 | } |
415 | | |
416 | 0 | break; |
417 | | |
418 | 0 | default: |
419 | 0 | assert(1); /* Impossible. */ |
420 | 0 | } |
421 | | |
422 | | /* Check if the raw buffer contains enough bytes to form a character. */ |
423 | | |
424 | 0 | if (incomplete) break; |
425 | | |
426 | | /* |
427 | | * Check if the character is in the allowed range: |
428 | | * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) |
429 | | * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) |
430 | | * | [#x10000-#x10FFFF] (32 bit) |
431 | | */ |
432 | | |
433 | 0 | if (! (value == 0x09 || value == 0x0A || value == 0x0D |
434 | 0 | || (value >= 0x20 && value <= 0x7E) |
435 | 0 | || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) |
436 | 0 | || (value >= 0xE000 && value <= 0xFFFD) |
437 | 0 | || (value >= 0x10000 && value <= 0x10FFFF))) |
438 | 0 | return yaml_parser_set_reader_error(parser, |
439 | 0 | "control characters are not allowed", |
440 | 0 | parser->offset, value); |
441 | | |
442 | | /* Move the raw pointers. */ |
443 | | |
444 | 0 | parser->raw_buffer.pointer += width; |
445 | 0 | parser->offset += width; |
446 | | |
447 | | /* Finally put the character into the buffer. */ |
448 | | |
449 | | /* 0000 0000-0000 007F -> 0xxxxxxx */ |
450 | 0 | if (value <= 0x7F) { |
451 | 0 | *(parser->buffer.last++) = value; |
452 | 0 | } |
453 | | /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ |
454 | 0 | else if (value <= 0x7FF) { |
455 | 0 | *(parser->buffer.last++) = 0xC0 + (value >> 6); |
456 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
457 | 0 | } |
458 | | /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ |
459 | 0 | else if (value <= 0xFFFF) { |
460 | 0 | *(parser->buffer.last++) = 0xE0 + (value >> 12); |
461 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); |
462 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
463 | 0 | } |
464 | | /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
465 | 0 | else { |
466 | 0 | *(parser->buffer.last++) = 0xF0 + (value >> 18); |
467 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); |
468 | 0 | *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); |
469 | 0 | *(parser->buffer.last++) = 0x80 + (value & 0x3F); |
470 | 0 | } |
471 | |
|
472 | 0 | parser->unread ++; |
473 | 0 | } |
474 | | |
475 | | /* On EOF, put NUL into the buffer and return. */ |
476 | | |
477 | 0 | if (parser->eof) { |
478 | 0 | *(parser->buffer.last++) = '\0'; |
479 | 0 | parser->unread ++; |
480 | 0 | return 1; |
481 | 0 | } |
482 | |
|
483 | 0 | } |
484 | | |
485 | 0 | if (parser->offset >= PTRDIFF_MAX) |
486 | 0 | return yaml_parser_set_reader_error(parser, "input is too long", |
487 | 0 | PTRDIFF_MAX, -1); |
488 | | |
489 | 0 | return 1; |
490 | 0 | } |
491 | | |