/src/moddable/xs/tools/yaml/reader.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2006-2016 Kirill Simonov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "yaml_private.h"

/*
 * Declarations.
 */

/* Record a reader error (message, byte offset, offending value) on the parser. */
static int
yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
        size_t offset, int value);

/* Pull more bytes from the read handler into the raw (undecoded) buffer. */
static int
yaml_parser_update_raw_buffer(yaml_parser_t *parser);

/* Pick the input encoding from a leading BOM, defaulting to UTF-8. */
static int
yaml_parser_determine_encoding(yaml_parser_t *parser);

/* Make sure at least `length` decoded characters are available to the caller. */
YAML_DECLARE(int)
yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);

/*
 * Set the reader error and return 0.
 */

static int
yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
        size_t offset, int value)
{
    /* Describe what went wrong and where it happened in the input... */
    parser->problem_value = value;
    parser->problem_offset = offset;
    parser->problem = problem;

    /* ...and flag the parser as having failed in the reader stage. */
    parser->error = YAML_READER_ERROR;

    /*
     * The result is always 0 so that callers can report and fail in a single
     * `return yaml_parser_set_reader_error(...)` statement.
     */
    return 0;
}

/*
 * Byte order marks.
 */

/* U+FEFF encoded as UTF-8 (three bytes). */
#define BOM_UTF8    "\xef\xbb\xbf"
/* U+FEFF encoded as UTF-16, least significant byte first. */
#define BOM_UTF16LE "\xff\xfe"
/* U+FEFF encoded as UTF-16, most significant byte first. */
#define BOM_UTF16BE "\xfe\xff"

/*
 * Determine the input stream encoding by checking the BOM symbol. If no BOM is
 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
 */

static int
yaml_parser_determine_encoding(yaml_parser_t *parser)
{
    size_t bom_size = 0;    /* Number of BOM bytes to skip; 0 if no BOM. */
    int has_two, has_three;

    /*
     * Read until the raw buffer holds at least 3 bytes (the longest BOM
     * checked below) or the end of the input has been reached.
     */

    for (;;) {
        if (parser->eof)
            break;
        if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3)
            break;
        if (!yaml_parser_update_raw_buffer(parser))
            return 0;
    }

    /* Work out which BOM lengths can be compared against safely. */

    has_two = (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2);
    has_three = (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3);

    /* Match the BOM; anything unrecognized is treated as plain UTF-8. */

    if (has_two && memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2) == 0) {
        parser->encoding = YAML_UTF16LE_ENCODING;
        bom_size = 2;
    }
    else if (has_two
            && memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2) == 0) {
        parser->encoding = YAML_UTF16BE_ENCODING;
        bom_size = 2;
    }
    else {
        parser->encoding = YAML_UTF8_ENCODING;
        if (has_three
                && memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3) == 0) {
            bom_size = 3;
        }
    }

    /* Step over the BOM (if any) so that it is never decoded as content. */

    parser->raw_buffer.pointer += bom_size;
    parser->offset += bom_size;

    return 1;
}

/*
 * Update the raw buffer.
 */

static int
yaml_parser_update_raw_buffer(yaml_parser_t *parser)
{
    size_t received = 0;    /* Bytes delivered by the read handler. */

    /* A buffer that is full of unconsumed bytes cannot take any more. */

    if (parser->raw_buffer.pointer == parser->raw_buffer.start
            && parser->raw_buffer.end == parser->raw_buffer.last) {
        return 1;
    }

    /* Once the end of the input has been seen there is nothing to read. */

    if (parser->eof) {
        return 1;
    }

    /*
     * Compact the buffer: slide the unconsumed bytes down to the start so
     * that all the free space sits at the tail.
     */

    if (parser->raw_buffer.pointer > parser->raw_buffer.start) {
        if (parser->raw_buffer.last > parser->raw_buffer.pointer) {
            memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
                    parser->raw_buffer.last - parser->raw_buffer.pointer);
        }
    }
    parser->raw_buffer.last -=
        parser->raw_buffer.pointer - parser->raw_buffer.start;
    parser->raw_buffer.pointer = parser->raw_buffer.start;

    /* Ask the read handler to fill the free tail of the buffer. */

    if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
                parser->raw_buffer.end - parser->raw_buffer.last, &received)) {
        return yaml_parser_set_reader_error(parser, "input error",
                parser->offset, -1);
    }

    /* A successful read of zero bytes is how the end of input is signalled. */

    if (received == 0) {
        parser->eof = 1;
    }
    parser->raw_buffer.last += received;

    return 1;
}

/*
 * Ensure that the buffer contains at least `length` characters.
 * Return 1 on success, 0 on failure.
 *
 * The length is supposed to be significantly less than the buffer size.
 */

YAML_DECLARE(int)
yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
{
    /*
     * Decode bytes from the raw buffer (UTF-8, UTF-16LE or UTF-16BE) into the
     * working buffer, which is always written as UTF-8, until at least
     * `length` characters are unread or the end of the input is reached.
     * Every decoded character is validated before it is stored.
     *
     * parser - the parser whose buffers are updated; its read handler must
     *          already be set.
     * length - the number of unread characters the caller needs.
     *
     * Returns 1 on success.  Returns 0 on failure, after the problem has been
     * recorded with yaml_parser_set_reader_error().  At the end of the input
     * a single NUL is appended to the working buffer and counted as one
     * unread character.
     */

    /* Set only for the first pass of the fill loop below. */
    int first = 1;

    assert(parser->read_handler);   /* Read handler must be set. */

    /* If the EOF flag is set and the raw buffer is empty, do nothing. */

    if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
        return 1;

    /* Return if the buffer contains enough characters. */

    if (parser->unread >= length)
        return 1;

    /* Determine the input encoding if it is not known yet. */

    if (!parser->encoding) {
        if (!yaml_parser_determine_encoding(parser))
            return 0;
    }

    /* Move the unread characters to the beginning of the buffer. */

    if (parser->buffer.start < parser->buffer.pointer
            && parser->buffer.pointer < parser->buffer.last) {
        size_t size = parser->buffer.last - parser->buffer.pointer;
        memmove(parser->buffer.start, parser->buffer.pointer, size);
        parser->buffer.pointer = parser->buffer.start;
        parser->buffer.last = parser->buffer.start + size;
    }
    else if (parser->buffer.pointer == parser->buffer.last) {
        /* Nothing is unread: simply rewind to the start of the buffer. */
        parser->buffer.pointer = parser->buffer.start;
        parser->buffer.last = parser->buffer.start;
    }

    /* Fill the buffer until it has enough characters. */

    while (parser->unread < length)
    {
        /*
         * Fill the raw buffer if necessary.  On the first pass the bytes
         * already in the raw buffer are decoded before reading more; on later
         * passes whatever is left over could not be decoded (an incomplete
         * character), so more input is always requested.
         */

        if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
            if (!yaml_parser_update_raw_buffer(parser)) return 0;
        }
        first = 0;

        /* Decode the raw buffer. */

        while (parser->raw_buffer.pointer != parser->raw_buffer.last)
        {
            unsigned int value = 0, value2 = 0;
            int incomplete = 0;
            unsigned char octet;
            unsigned int width = 0;
            int low, high;
            size_t k;
            size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;

            /* Decode the next character. */

            switch (parser->encoding)
            {
                case YAML_UTF8_ENCODING:

                    /*
                     * Decode a UTF-8 character.  Check RFC 3629
                     * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
                     *
                     * The following table (taken from the RFC) is used for
                     * decoding.
                     *
                     *    Char. number range |        UTF-8 octet sequence
                     *      (hexadecimal)    |              (binary)
                     *   --------------------+------------------------------------
                     *   0000 0000-0000 007F | 0xxxxxxx
                     *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
                     *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
                     *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                     *
                     * Additionally, the characters in the range 0xD800-0xDFFF
                     * are prohibited as they are reserved for use with UTF-16
                     * surrogate pairs.
                     */

                    /* Determine the length of the UTF-8 sequence. */

                    octet = parser->raw_buffer.pointer[0];
                    width = (octet & 0x80) == 0x00 ? 1 :
                            (octet & 0xE0) == 0xC0 ? 2 :
                            (octet & 0xF0) == 0xE0 ? 3 :
                            (octet & 0xF8) == 0xF0 ? 4 : 0;

                    /* Check if the leading octet is valid. */

                    if (!width)
                        return yaml_parser_set_reader_error(parser,
                                "invalid leading UTF-8 octet",
                                parser->offset, octet);

                    /* Check if the raw buffer contains an incomplete character. */

                    if (width > raw_unread) {
                        if (parser->eof) {
                            return yaml_parser_set_reader_error(parser,
                                    "incomplete UTF-8 octet sequence",
                                    parser->offset, -1);
                        }
                        incomplete = 1;
                        break;
                    }

                    /* Decode the leading octet. */

                    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
                            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
                            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
                            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;

                    /* Check and decode the trailing octets. */

                    for (k = 1; k < width; k ++)
                    {
                        octet = parser->raw_buffer.pointer[k];

                        /* Check if the octet is valid. */

                        if ((octet & 0xC0) != 0x80)
                            return yaml_parser_set_reader_error(parser,
                                    "invalid trailing UTF-8 octet",
                                    parser->offset+k, octet);

                        /* Decode the octet. */

                        value = (value << 6) + (octet & 0x3F);
                    }

                    /*
                     * Check the length of the sequence against the value:
                     * a value that would fit in a shorter sequence is an
                     * overlong encoding and is rejected.
                     */

                    if (!((width == 1) ||
                            (width == 2 && value >= 0x80) ||
                            (width == 3 && value >= 0x800) ||
                            (width == 4 && value >= 0x10000)))
                        return yaml_parser_set_reader_error(parser,
                                "invalid length of a UTF-8 sequence",
                                parser->offset, -1);

                    /* Check the range of the value. */

                    if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
                        return yaml_parser_set_reader_error(parser,
                                "invalid Unicode character",
                                parser->offset, value);

                    break;

                case YAML_UTF16LE_ENCODING:
                case YAML_UTF16BE_ENCODING:

                    /* Byte positions of the low and high halves of a unit. */

                    low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
                    high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);

                    /*
                     * The UTF-16 encoding is not as simple as one might
                     * naively think.  Check RFC 2781
                     * (http://www.ietf.org/rfc/rfc2781.txt).
                     *
                     * Normally, two subsequent bytes describe a Unicode
                     * character.  However a special technique (called a
                     * surrogate pair) is used for specifying character
                     * values larger than 0xFFFF.
                     *
                     * A surrogate pair consists of two pseudo-characters:
                     *      high surrogate area (0xD800-0xDBFF)
                     *      low surrogate area (0xDC00-0xDFFF)
                     *
                     * The following formulas are used for decoding
                     * and encoding characters using surrogate pairs:
                     *
                     *  U  = U' + 0x10000   (0x01 00 00 <= U <= 0x10 FF FF)
                     *  U' = yyyyyyyyyyxxxxxxxxxx   (0 <= U' <= 0x0F FF FF)
                     *  W1 = 110110yyyyyyyyyy
                     *  W2 = 110111xxxxxxxxxx
                     *
                     * where U is the character value, W1 is the high surrogate
                     * area, W2 is the low surrogate area.
                     */

                    /* Check for incomplete UTF-16 character. */

                    if (raw_unread < 2) {
                        if (parser->eof) {
                            return yaml_parser_set_reader_error(parser,
                                    "incomplete UTF-16 character",
                                    parser->offset, -1);
                        }
                        incomplete = 1;
                        break;
                    }

                    /* Get the character. */

                    value = parser->raw_buffer.pointer[low]
                        + (parser->raw_buffer.pointer[high] << 8);

                    /* Check for unexpected low surrogate area. */

                    if ((value & 0xFC00) == 0xDC00)
                        return yaml_parser_set_reader_error(parser,
                                "unexpected low surrogate area",
                                parser->offset, value);

                    /* Check for a high surrogate area. */

                    if ((value & 0xFC00) == 0xD800) {

                        width = 4;

                        /* Check for incomplete surrogate pair. */

                        if (raw_unread < 4) {
                            if (parser->eof) {
                                return yaml_parser_set_reader_error(parser,
                                        "incomplete UTF-16 surrogate pair",
                                        parser->offset, -1);
                            }
                            incomplete = 1;
                            break;
                        }

                        /* Get the next character. */

                        value2 = parser->raw_buffer.pointer[low+2]
                            + (parser->raw_buffer.pointer[high+2] << 8);

                        /* Check for a low surrogate area. */

                        if ((value2 & 0xFC00) != 0xDC00)
                            return yaml_parser_set_reader_error(parser,
                                    "expected low surrogate area",
                                    parser->offset+2, value2);

                        /* Generate the value of the surrogate pair. */

                        value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
                    }

                    else {
                        width = 2;
                    }

                    break;

                default:

                    /*
                     * The encoding is expected to be one of the cases above
                     * by this point.  Trap in debug builds; when assertions
                     * are compiled out, fail with an explicit error instead
                     * of letting a zero `value` be misreported as a control
                     * character.
                     */

                    assert(0);      /* Impossible. */
                    return yaml_parser_set_reader_error(parser,
                            "unknown input encoding",
                            parser->offset, -1);
            }

            /*
             * Stop decoding if the raw buffer ends in the middle of a
             * character; the outer loop will read more input and retry.
             */

            if (incomplete) break;

            /*
             * Check if the character is in the allowed range:
             *      #x9 | #xA | #xD | [#x20-#x7E]               (8 bit)
             *      | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD]    (16 bit)
             *      | [#x10000-#x10FFFF]                        (32 bit)
             */

            if (! (value == 0x09 || value == 0x0A || value == 0x0D
                        || (value >= 0x20 && value <= 0x7E)
                        || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
                        || (value >= 0xE000 && value <= 0xFFFD)
                        || (value >= 0x10000 && value <= 0x10FFFF)))
                return yaml_parser_set_reader_error(parser,
                        "control characters are not allowed",
                        parser->offset, value);

            /* Move the raw pointers. */

            parser->raw_buffer.pointer += width;
            parser->offset += width;

            /* Finally put the character into the buffer, encoded as UTF-8. */

            /* 0000 0000-0000 007F -> 0xxxxxxx */
            if (value <= 0x7F) {
                *(parser->buffer.last++) = value;
            }
            /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
            else if (value <= 0x7FF) {
                *(parser->buffer.last++) = 0xC0 + (value >> 6);
                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
            }
            /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
            else if (value <= 0xFFFF) {
                *(parser->buffer.last++) = 0xE0 + (value >> 12);
                *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
            }
            /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            else {
                *(parser->buffer.last++) = 0xF0 + (value >> 18);
                *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
                *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
            }

            /* `unread` counts characters, not bytes. */

            parser->unread ++;
        }

        /* On EOF, put NUL into the buffer and return. */

        if (parser->eof) {
            *(parser->buffer.last++) = '\0';
            parser->unread ++;
            return 1;
        }

    }

    /* Refuse inputs whose byte offset has reached PTRDIFF_MAX. */

    if (parser->offset >= PTRDIFF_MAX)
        return yaml_parser_set_reader_error(parser, "input is too long",
                PTRDIFF_MAX, -1);

    return 1;
}


Coverage Report

Created: 2025-06-13 06:17

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2006-2016 Kirill Simonov
3		*
4		* Permission is hereby granted, free of charge, to any person obtaining a copy of
5		* this software and associated documentation files (the "Software"), to deal in
6		* the Software without restriction, including without limitation the rights to
7		* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8		* of the Software, and to permit persons to whom the Software is furnished to do
9		* so, subject to the following conditions:
10		*
11		* The above copyright notice and this permission notice shall be included in all
12		* copies or substantial portions of the Software.
13		*
14		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20		* SOFTWARE.
21		*
22		*/
23
24		#include "yaml_private.h"
25
26		/*
27		* Declarations.
28		*/
29
30		static int
31		yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
32		size_t offset, int value);
33
34		static int
35		yaml_parser_update_raw_buffer(yaml_parser_t *parser);
36
37		static int
38		yaml_parser_determine_encoding(yaml_parser_t *parser);
39
40		YAML_DECLARE(int)
41		yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
42
43		/*
44		* Set the reader error and return 0.
45		*/
46
47		static int
48		yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
49		size_t offset, int value)
50	0	{
51	0	parser->error = YAML_READER_ERROR;
52	0	parser->problem = problem;
53	0	parser->problem_offset = offset;
54	0	parser->problem_value = value;
55
56	0	return 0;
57	0	}
58
59		/*
60		* Byte order marks.
61		*/
62
63	0	#define BOM_UTF8 "\xef\xbb\xbf"
64	0	#define BOM_UTF16LE "\xff\xfe"
65	0	#define BOM_UTF16BE "\xfe\xff"
66
67		/*
68		* Determine the input stream encoding by checking the BOM symbol. If no BOM is
69		* found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
70		*/
71
72		static int
73		yaml_parser_determine_encoding(yaml_parser_t *parser)
74	0	{
75		/* Ensure that we had enough bytes in the raw buffer. */
76
77	0	while (!parser->eof
78	0	&& parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
79	0	if (!yaml_parser_update_raw_buffer(parser)) {
80	0	return 0;
81	0	}
82	0	}
83
84		/* Determine the encoding. */
85
86	0	if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
87	0	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
88	0	parser->encoding = YAML_UTF16LE_ENCODING;
89	0	parser->raw_buffer.pointer += 2;
90	0	parser->offset += 2;
91	0	}
92	0	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
93	0	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
94	0	parser->encoding = YAML_UTF16BE_ENCODING;
95	0	parser->raw_buffer.pointer += 2;
96	0	parser->offset += 2;
97	0	}
98	0	else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
99	0	&& !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
100	0	parser->encoding = YAML_UTF8_ENCODING;
101	0	parser->raw_buffer.pointer += 3;
102	0	parser->offset += 3;
103	0	}
104	0	else {
105	0	parser->encoding = YAML_UTF8_ENCODING;
106	0	}
107
108	0	return 1;
109	0	}
110
111		/*
112		* Update the raw buffer.
113		*/
114
115		static int
116		yaml_parser_update_raw_buffer(yaml_parser_t *parser)
117	0	{
118	0	size_t size_read = 0;
119
120		/* Return if the raw buffer is full. */
121
122	0	if (parser->raw_buffer.start == parser->raw_buffer.pointer
123	0	&& parser->raw_buffer.last == parser->raw_buffer.end)
124	0	return 1;
125
126		/* Return on EOF. */
127
128	0	if (parser->eof) return 1;
129
130		/* Move the remaining bytes in the raw buffer to the beginning. */
131
132	0	if (parser->raw_buffer.start < parser->raw_buffer.pointer
133	0	&& parser->raw_buffer.pointer < parser->raw_buffer.last) {
134	0	memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
135	0	parser->raw_buffer.last - parser->raw_buffer.pointer);
136	0	}
137	0	parser->raw_buffer.last -=
138	0	parser->raw_buffer.pointer - parser->raw_buffer.start;
139	0	parser->raw_buffer.pointer = parser->raw_buffer.start;
140
141		/* Call the read handler to fill the buffer. */
142
143	0	if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
144	0	parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
145	0	return yaml_parser_set_reader_error(parser, "input error",
146	0	parser->offset, -1);
147	0	}
148	0	parser->raw_buffer.last += size_read;
149	0	if (!size_read) {
150	0	parser->eof = 1;
151	0	}
152
153	0	return 1;
154	0	}
155
156		/*
157		* Ensure that the buffer contains at least `length` characters.
158		* Return 1 on success, 0 on failure.
159		*
160		* The length is supposed to be significantly less than the buffer size.
161		*/
162
163		YAML_DECLARE(int)
164		yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
165	0	{
166	0	int first = 1;
167
168	0	assert(parser->read_handler); /* Read handler must be set. */
169
170		/* If the EOF flag is set and the raw buffer is empty, do nothing. */
171
172	0	if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
173	0	return 1;
174
175		/* Return if the buffer contains enough characters. */
176
177	0	if (parser->unread >= length)
178	0	return 1;
179
180		/* Determine the input encoding if it is not known yet. */
181
182	0	if (!parser->encoding) {
183	0	if (!yaml_parser_determine_encoding(parser))
184	0	return 0;
185	0	}
186
187		/* Move the unread characters to the beginning of the buffer. */
188
189	0	if (parser->buffer.start < parser->buffer.pointer
190	0	&& parser->buffer.pointer < parser->buffer.last) {
191	0	size_t size = parser->buffer.last - parser->buffer.pointer;
192	0	memmove(parser->buffer.start, parser->buffer.pointer, size);
193	0	parser->buffer.pointer = parser->buffer.start;
194	0	parser->buffer.last = parser->buffer.start + size;
195	0	}
196	0	else if (parser->buffer.pointer == parser->buffer.last) {
197	0	parser->buffer.pointer = parser->buffer.start;
198	0	parser->buffer.last = parser->buffer.start;
199	0	}
200
201		/* Fill the buffer until it has enough characters. */
202
203	0	while (parser->unread < length)
204	0	{
205		/* Fill the raw buffer if necessary. */
206
207	0	if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
208	0	if (!yaml_parser_update_raw_buffer(parser)) return 0;
209	0	}
210	0	first = 0;
211
212		/* Decode the raw buffer. */
213
214	0	while (parser->raw_buffer.pointer != parser->raw_buffer.last)
215	0	{
216	0	unsigned int value = 0, value2 = 0;
217	0	int incomplete = 0;
218	0	unsigned char octet;
219	0	unsigned int width = 0;
220	0	int low, high;
221	0	size_t k;
222	0	size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
223
224		/* Decode the next character. */
225
226	0	switch (parser->encoding)
227	0	{
228	0	case YAML_UTF8_ENCODING:
229
230		/*
231		* Decode a UTF-8 character. Check RFC 3629
232		* (http://www.ietf.org/rfc/rfc3629.txt) for more details.
233		*
234		* The following table (taken from the RFC) is used for
235		* decoding.
236		*
237		* Char. number range | UTF-8 octet sequence
238		* (hexadecimal) | (binary)
239		* --------------------+------------------------------------
240		* 0000 0000-0000 007F | 0xxxxxxx
241		* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
242		* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
243		* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
244		*
245		* Additionally, the characters in the range 0xD800-0xDFFF
246		* are prohibited as they are reserved for use with UTF-16
247		* surrogate pairs.
248		*/
249
250		/* Determine the length of the UTF-8 sequence. */
251
252	0	octet = parser->raw_buffer.pointer[0];
253	0	width = (octet & 0x80) == 0x00 ? 1 :
254	0	(octet & 0xE0) == 0xC0 ? 2 :
255	0	(octet & 0xF0) == 0xE0 ? 3 :
256	0	(octet & 0xF8) == 0xF0 ? 4 : 0;
257
258		/* Check if the leading octet is valid. */
259
260	0	if (!width)
261	0	return yaml_parser_set_reader_error(parser,
262	0	"invalid leading UTF-8 octet",
263	0	parser->offset, octet);
264
265		/* Check if the raw buffer contains an incomplete character. */
266
267	0	if (width > raw_unread) {
268	0	if (parser->eof) {
269	0	return yaml_parser_set_reader_error(parser,
270	0	"incomplete UTF-8 octet sequence",
271	0	parser->offset, -1);
272	0	}
273	0	incomplete = 1;
274	0	break;
275	0	}
276
277		/* Decode the leading octet. */
278
279	0	value = (octet & 0x80) == 0x00 ? octet & 0x7F :
280	0	(octet & 0xE0) == 0xC0 ? octet & 0x1F :
281	0	(octet & 0xF0) == 0xE0 ? octet & 0x0F :
282	0	(octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
283
284		/* Check and decode the trailing octets. */
285
286	0	for (k = 1; k < width; k ++)
287	0	{
288	0	octet = parser->raw_buffer.pointer[k];
289
290		/* Check if the octet is valid. */
291
292	0	if ((octet & 0xC0) != 0x80)
293	0	return yaml_parser_set_reader_error(parser,
294	0	"invalid trailing UTF-8 octet",
295	0	parser->offset+k, octet);
296
297		/* Decode the octet. */
298
299	0	value = (value << 6) + (octet & 0x3F);
300	0	}
301
302		/* Check the length of the sequence against the value. */
303
304	0	if (!((width == 1) ||
305	0	(width == 2 && value >= 0x80) ||
306	0	(width == 3 && value >= 0x800) ||
307	0	(width == 4 && value >= 0x10000)))
308	0	return yaml_parser_set_reader_error(parser,
309	0	"invalid length of a UTF-8 sequence",
310	0	parser->offset, -1);
311
312		/* Check the range of the value. */
313
314	0	if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
315	0	return yaml_parser_set_reader_error(parser,
316	0	"invalid Unicode character",
317	0	parser->offset, value);
318
319	0	break;
320
321	0	case YAML_UTF16LE_ENCODING:
322	0	case YAML_UTF16BE_ENCODING:
323
324	0	low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
325	0	high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
326
327		/*
328		* The UTF-16 encoding is not as simple as one might
329		* naively think. Check RFC 2781
330		* (http://www.ietf.org/rfc/rfc2781.txt).
331		*
332		* Normally, two subsequent bytes describe a Unicode
333		* character. However a special technique (called a
334		* surrogate pair) is used for specifying character
335		* values larger than 0xFFFF.
336		*
337		* A surrogate pair consists of two pseudo-characters:
338		* high surrogate area (0xD800-0xDBFF)
339		* low surrogate area (0xDC00-0xDFFF)
340		*
341		* The following formulas are used for decoding
342		* and encoding characters using surrogate pairs:
343		*
344		* U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
345		* U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
346		* W1 = 110110yyyyyyyyyy
347		* W2 = 110111xxxxxxxxxx
348		*
349		* where U is the character value, W1 is the high surrogate
350		* area, W2 is the low surrogate area.
351		*/
352
353		/* Check for incomplete UTF-16 character. */
354
355	0	if (raw_unread < 2) {
356	0	if (parser->eof) {
357	0	return yaml_parser_set_reader_error(parser,
358	0	"incomplete UTF-16 character",
359	0	parser->offset, -1);
360	0	}
361	0	incomplete = 1;
362	0	break;
363	0	}
364
365		/* Get the character. */
366
367	0	value = parser->raw_buffer.pointer[low]
368	0	+ (parser->raw_buffer.pointer[high] << 8);
369
370		/* Check for unexpected low surrogate area. */
371
372	0	if ((value & 0xFC00) == 0xDC00)
373	0	return yaml_parser_set_reader_error(parser,
374	0	"unexpected low surrogate area",
375	0	parser->offset, value);
376
377		/* Check for a high surrogate area. */
378
379	0	if ((value & 0xFC00) == 0xD800) {
380
381	0	width = 4;
382
383		/* Check for incomplete surrogate pair. */
384
385	0	if (raw_unread < 4) {
386	0	if (parser->eof) {
387	0	return yaml_parser_set_reader_error(parser,
388	0	"incomplete UTF-16 surrogate pair",
389	0	parser->offset, -1);
390	0	}
391	0	incomplete = 1;
392	0	break;
393	0	}
394
395		/* Get the next character. */
396
397	0	value2 = parser->raw_buffer.pointer[low+2]
398	0	+ (parser->raw_buffer.pointer[high+2] << 8);
399
400		/* Check for a low surrogate area. */
401
402	0	if ((value2 & 0xFC00) != 0xDC00)
403	0	return yaml_parser_set_reader_error(parser,
404	0	"expected low surrogate area",
405	0	parser->offset+2, value2);
406
407		/* Generate the value of the surrogate pair. */
408
409	0	value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
410	0	}
411
412	0	else {
413	0	width = 2;
414	0	}
415
416	0	break;
417
418	0	default:
419	0	assert(1); /* Impossible. */
420	0	}
421
422		/* Check if the raw buffer contains enough bytes to form a character. */
423
424	0	if (incomplete) break;
425
426		/*
427		* Check if the character is in the allowed range:
428		* #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
429		* | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
430		* | [#x10000-#x10FFFF] (32 bit)
431		*/
432
433	0	if (! (value == 0x09 || value == 0x0A || value == 0x0D
434	0	|| (value >= 0x20 && value <= 0x7E)
435	0	|| (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
436	0	|| (value >= 0xE000 && value <= 0xFFFD)
437	0	|| (value >= 0x10000 && value <= 0x10FFFF)))
438	0	return yaml_parser_set_reader_error(parser,
439	0	"control characters are not allowed",
440	0	parser->offset, value);
441
442		/* Move the raw pointers. */
443
444	0	parser->raw_buffer.pointer += width;
445	0	parser->offset += width;
446
447		/* Finally put the character into the buffer. */
448
449		/* 0000 0000-0000 007F -> 0xxxxxxx */
450	0	if (value <= 0x7F) {
451	0	*(parser->buffer.last++) = value;
452	0	}
453		/* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
454	0	else if (value <= 0x7FF) {
455	0	*(parser->buffer.last++) = 0xC0 + (value >> 6);
456	0	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
457	0	}
458		/* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
459	0	else if (value <= 0xFFFF) {
460	0	*(parser->buffer.last++) = 0xE0 + (value >> 12);
461	0	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
462	0	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
463	0	}
464		/* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
465	0	else {
466	0	*(parser->buffer.last++) = 0xF0 + (value >> 18);
467	0	*(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
468	0	*(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
469	0	*(parser->buffer.last++) = 0x80 + (value & 0x3F);
470	0	}
471
472	0	parser->unread ++;
473	0	}
474
475		/* On EOF, put NUL into the buffer and return. */
476
477	0	if (parser->eof) {
478	0	*(parser->buffer.last++) = '\0';
479	0	parser->unread ++;
480	0	return 1;
481	0	}
482
483	0	}
484
485	0	if (parser->offset >= PTRDIFF_MAX)
486	0	return yaml_parser_set_reader_error(parser, "input is too long",
487	0	PTRDIFF_MAX, -1);
488
489	0	return 1;
490	0	}
491