/src/yajl-ruby/ext/yajl/yajl_lex.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2010, Lloyd Hilaiel. |
3 | | * |
4 | | * Redistribution and use in source and binary forms, with or without |
5 | | * modification, are permitted provided that the following conditions are |
6 | | * met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above copyright |
12 | | * notice, this list of conditions and the following disclaimer in |
13 | | * the documentation and/or other materials provided with the |
14 | | * distribution. |
15 | | * |
16 | | * 3. Neither the name of Lloyd Hilaiel nor the names of its |
17 | | * contributors may be used to endorse or promote products derived |
18 | | * from this software without specific prior written permission. |
19 | | * |
20 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
21 | | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
22 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
23 | | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, |
24 | | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
25 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
26 | | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
27 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
28 | | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING |
29 | | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
30 | | * POSSIBILITY OF SUCH DAMAGE. |
31 | | */ |
32 | | |
33 | | #include "yajl_lex.h" |
34 | | #include "yajl_buf.h" |
35 | | |
36 | | #include <stdlib.h> |
37 | | #include <stdio.h> |
38 | | #include <assert.h> |
39 | | #include <string.h> |
40 | | |
41 | 0 | const char *yajl_tok_name(yajl_tok tok) { |
42 | 0 | switch (tok) { |
43 | 0 | case yajl_tok_bool: return "bool"; |
44 | 0 | case yajl_tok_colon: return "colon"; |
45 | 0 | case yajl_tok_comma: return "comma"; |
46 | 0 | case yajl_tok_comment: return "comment"; |
47 | 0 | case yajl_tok_eof: return "eof"; |
48 | 0 | case yajl_tok_error: return "error"; |
49 | 0 | case yajl_tok_left_brace: return "open_array"; |
50 | 0 | case yajl_tok_left_bracket: return "open_object"; |
51 | 0 | case yajl_tok_null: return "null"; |
52 | 0 | case yajl_tok_integer: return "integer"; |
53 | 0 | case yajl_tok_double: return "double"; |
54 | 0 | case yajl_tok_right_brace: return "close_array"; |
55 | 0 | case yajl_tok_right_bracket: return "close_object"; |
56 | 0 | case yajl_tok_string: return "string"; |
57 | 0 | case yajl_tok_string_with_escapes: return "string_with_escapes"; |
58 | 0 | } |
59 | 0 | return "unknown"; |
60 | 0 | } |
61 | | |
/* Impact of the stream parsing feature on the lexer:
 *
 * YAJL supports stream parsing.  That is, the ability to parse the first
 * bits of a chunk of JSON before the last bits are available (still on
 * the network or disk).  This makes the lexer more complex.  The
 * responsibility of the lexer is to transparently handle the case where
 * a chunk boundary falls in the middle of a token.  This is
 * accomplished via a buffer and a character reading abstraction.
 *
 * Overview of implementation
 *
 * When we lex to the end of the input string before the end of a token
 * is hit, we copy all of the input text composing the token into our
 * lexBuf.
 *
 * Every time we read a character, we do so through the readChar macro.
 * readChar's responsibility is to handle pulling all chars from the buffer
 * before pulling chars from input text.
 */
80 | | |
81 | | struct yajl_lexer_t { |
82 | | /* the overal line and char offset into the data */ |
83 | | unsigned int lineOff; |
84 | | unsigned int charOff; |
85 | | |
86 | | /* error */ |
87 | | yajl_lex_error error; |
88 | | |
89 | | /* a input buffer to handle the case where a token is spread over |
90 | | * multiple chunks */ |
91 | | yajl_buf buf; |
92 | | |
93 | | /* in the case where we have data in the lexBuf, bufOff holds |
94 | | * the current offset into the lexBuf. */ |
95 | | unsigned int bufOff; |
96 | | |
97 | | /* are we using the lex buf? */ |
98 | | unsigned int bufInUse; |
99 | | |
100 | | /* shall we allow comments? */ |
101 | | unsigned int allowComments; |
102 | | |
103 | | /* shall we validate utf8 inside strings? */ |
104 | | unsigned int validateUTF8; |
105 | | |
106 | | yajl_alloc_funcs * alloc; |
107 | | }; |
108 | | |
/* Read the next input byte.
 *
 * While the lexer buffer is in use and still has unread bytes (bufOff is
 * short of the buffer length) the byte is taken from the buffer and bufOff
 * is advanced; otherwise the byte is taken from txt and *off is advanced.
 * No bounds check is made on txt: callers test *off against the text
 * length before invoking this macro.
 *
 * Every use of lxr is parenthesized so that any pointer expression may be
 * passed.  The arguments are evaluated more than once and therefore must
 * not have side effects. */
#define readChar(lxr, txt, off)                                         \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                    \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                       \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))

/* Undo the most recent readChar: step *off back when it is non-zero,
 * otherwise step the lexer's bufOff back. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
115 | | |
116 | | yajl_lexer |
117 | | yajl_lex_alloc(yajl_alloc_funcs * alloc, |
118 | | unsigned int allowComments, unsigned int validateUTF8) |
119 | 2.30k | { |
120 | 2.30k | yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t)); |
121 | 2.30k | if (!lxr) |
122 | 0 | return NULL; |
123 | 2.30k | memset((void *) lxr, 0, sizeof(struct yajl_lexer_t)); |
124 | 2.30k | lxr->buf = yajl_buf_alloc(alloc); |
125 | 2.30k | lxr->allowComments = allowComments; |
126 | 2.30k | lxr->validateUTF8 = validateUTF8; |
127 | 2.30k | lxr->alloc = alloc; |
128 | 2.30k | return lxr; |
129 | 2.30k | } |
130 | | |
131 | | yajl_lexer |
132 | 1.74M | yajl_lex_realloc(yajl_lexer orig) { |
133 | 1.74M | orig->lineOff = 0; |
134 | 1.74M | orig->charOff = 0; |
135 | 1.74M | orig->error = yajl_lex_e_ok; |
136 | 1.74M | yajl_buf_clear(orig->buf); |
137 | 1.74M | orig->bufOff = 0; |
138 | 1.74M | orig->bufInUse = 0; |
139 | 1.74M | return orig; |
140 | 1.74M | } |
141 | | |
142 | | void |
143 | | yajl_lex_free(yajl_lexer lxr) |
144 | 2.30k | { |
145 | 2.30k | yajl_buf_free(lxr->buf); |
146 | 2.30k | YA_FREE(lxr->alloc, lxr); |
147 | 2.30k | return; |
148 | 2.30k | } |
149 | | |
/* Per-byte classification table used while lexing strings.  Each entry is
 * a combination of the following flags:
 *   VEC - the byte is valid directly after a backslash
 *   IJC - the byte is invalid as a plain (unescaped) string byte
 *   VHC - the byte is a valid hexadecimal digit
 * The solidus '/' carries VEC but not IJC, so it may appear escaped or
 * unescaped.  The table covers all 256 byte values so that lookups never
 * need a range check.
 */
#define VEC 1
#define IJC 2
#define VHC 4
static const char charLookupTable[256] =
{
    /* 0x00 - 0x1f: control characters */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

    /* 0x20: double quote at 0x22 */
    0, 0, IJC|VEC, 0, 0, 0, 0, 0,
    /* 0x28: solidus at 0x2f */
    0, 0, 0, 0, 0, 0, 0, VEC,
    /* 0x30: digits 0-7 */
    VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
    /* 0x38: digits 8-9 */
    VHC, VHC, 0, 0, 0, 0, 0, 0,

    /* 0x40: A-F */
    0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
    /* 0x48 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58: backslash at 0x5c */
    0, 0, 0, 0, IJC|VEC, 0, 0, 0,

    /* 0x60: a-f, of which b and f are also escape letters */
    0, VHC, VHC|VEC, VHC, VHC, VHC, VHC|VEC, 0,
    /* 0x68: n */
    0, 0, 0, 0, 0, 0, VEC, 0,
    /* 0x70: r and t */
    0, 0, VEC, 0, VEC, 0, 0, 0,
    /* 0x78 */
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x80 - 0xff: present so that any byte value can be used as index */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
203 | | |
204 | | /** process a variable length utf8 encoded codepoint. |
205 | | * |
206 | | * returns: |
207 | | * yajl_tok_string - if valid utf8 char was parsed and offset was |
208 | | * advanced |
209 | | * yajl_tok_eof - if end of input was hit before validation could |
210 | | * complete |
211 | | * yajl_tok_error - if invalid utf8 was encountered |
212 | | * |
213 | | * NOTE: on error the offset will point to the first char of the |
214 | | * invalid utf8 */ |
215 | 6.93k | #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; } |
216 | | |
217 | | static yajl_tok |
218 | | yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText, |
219 | | unsigned int jsonTextLen, unsigned int * offset, |
220 | | unsigned char curChar) |
221 | 13.3M | { |
222 | 13.3M | if (curChar <= 0x7f) { |
223 | | /* single byte */ |
224 | 13.3M | return yajl_tok_string; |
225 | 13.3M | } else if ((curChar >> 5) == 0x6) { |
226 | | /* two byte */ |
227 | 1.19k | UTF8_CHECK_EOF; |
228 | 1.19k | curChar = readChar(lexer, jsonText, offset); |
229 | 1.19k | if ((curChar >> 6) == 0x2) return yajl_tok_string; |
230 | 2.09k | } else if ((curChar >> 4) == 0x0e) { |
231 | | /* three byte */ |
232 | 449 | UTF8_CHECK_EOF; |
233 | 448 | curChar = readChar(lexer, jsonText, offset); |
234 | 448 | if ((curChar >> 6) == 0x2) { |
235 | 444 | UTF8_CHECK_EOF; |
236 | 437 | curChar = readChar(lexer, jsonText, offset); |
237 | 437 | if ((curChar >> 6) == 0x2) return yajl_tok_string; |
238 | 437 | } |
239 | 1.64k | } else if ((curChar >> 3) == 0x1e) { |
240 | | /* four byte */ |
241 | 1.63k | UTF8_CHECK_EOF; |
242 | 1.62k | curChar = readChar(lexer, jsonText, offset); |
243 | 1.62k | if ((curChar >> 6) == 0x2) { |
244 | 1.62k | UTF8_CHECK_EOF; |
245 | 1.60k | curChar = readChar(lexer, jsonText, offset); |
246 | 1.60k | if ((curChar >> 6) == 0x2) { |
247 | 1.59k | UTF8_CHECK_EOF; |
248 | 1.57k | curChar = readChar(lexer, jsonText, offset); |
249 | 1.57k | if ((curChar >> 6) == 0x2) return yajl_tok_string; |
250 | 1.57k | } |
251 | 1.60k | } |
252 | 1.62k | } |
253 | | |
254 | 51 | return yajl_tok_error; |
255 | 13.3M | } |
256 | | |
257 | | /* lex a string. input is the lexer, pointer to beginning of |
258 | | * json text, and start of string (offset). |
259 | | * a token is returned which has the following meanings: |
260 | | * yajl_tok_string: lex of string was successful. offset points to |
261 | | * terminating '"'. |
262 | | * yajl_tok_eof: end of text was encountered before we could complete |
263 | | * the lex. |
264 | | * yajl_tok_error: embedded in the string were unallowable chars. offset |
265 | | * points to the offending char |
266 | | */ |
267 | 14.2M | #define STR_CHECK_EOF \ |
268 | 14.2M | if (*offset >= jsonTextLen) { \ |
269 | 142 | tok = yajl_tok_eof; \ |
270 | 142 | goto finish_string_lex; \ |
271 | 142 | } |
272 | | |
273 | | static yajl_tok |
274 | | yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, |
275 | | unsigned int jsonTextLen, unsigned int * offset) |
276 | 569k | { |
277 | 569k | yajl_tok tok = yajl_tok_error; |
278 | 569k | int hasEscapes = 0; |
279 | | |
280 | 14.0M | for (;;) { |
281 | 14.0M | unsigned char curChar; |
282 | | |
283 | 14.0M | STR_CHECK_EOF; |
284 | | |
285 | 14.0M | curChar = readChar(lexer, jsonText, offset); |
286 | | |
287 | | /* quote terminates */ |
288 | 14.0M | if (curChar == '"') { |
289 | 569k | tok = yajl_tok_string; |
290 | 569k | break; |
291 | 569k | } |
292 | | /* backslash escapes a set of control chars, */ |
293 | 13.4M | else if (curChar == '\\') { |
294 | 85.6k | hasEscapes = 1; |
295 | 85.6k | STR_CHECK_EOF; |
296 | | |
297 | | /* special case \u */ |
298 | 85.6k | curChar = readChar(lexer, jsonText, offset); |
299 | 85.6k | if (curChar == 'u') { |
300 | 35.0k | unsigned int i = 0; |
301 | | |
302 | 175k | for (i=0;i<4;i++) { |
303 | 140k | STR_CHECK_EOF; |
304 | 140k | curChar = readChar(lexer, jsonText, offset); |
305 | 140k | if (!(charLookupTable[curChar] & VHC)) { |
306 | | /* back up to offending char */ |
307 | 29 | unreadChar(lexer, offset); |
308 | 29 | lexer->error = yajl_lex_string_invalid_hex_char; |
309 | 29 | goto finish_string_lex; |
310 | 29 | } |
311 | 140k | } |
312 | 50.5k | } else if (!(charLookupTable[curChar] & VEC)) { |
313 | | /* back up to offending char */ |
314 | 36 | unreadChar(lexer, offset); |
315 | 36 | lexer->error = yajl_lex_string_invalid_escaped_char; |
316 | 36 | goto finish_string_lex; |
317 | 36 | } |
318 | 85.6k | } |
319 | | /* when not validating UTF8 it's a simple table lookup to determine |
320 | | * if the present character is invalid */ |
321 | 13.3M | else if(charLookupTable[curChar] & IJC) { |
322 | | /* back up to offending char */ |
323 | 27 | unreadChar(lexer, offset); |
324 | 27 | lexer->error = yajl_lex_string_invalid_json_char; |
325 | 27 | goto finish_string_lex; |
326 | 27 | } |
327 | | /* when in validate UTF8 mode we need to do some extra work */ |
328 | 13.3M | else if (lexer->validateUTF8) { |
329 | 13.3M | yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen, |
330 | 13.3M | offset, curChar); |
331 | | |
332 | 13.3M | if (t == yajl_tok_eof) { |
333 | 51 | tok = yajl_tok_eof; |
334 | 51 | goto finish_string_lex; |
335 | 13.3M | } else if (t == yajl_tok_error) { |
336 | 51 | lexer->error = yajl_lex_string_invalid_utf8; |
337 | 51 | goto finish_string_lex; |
338 | 51 | } |
339 | 13.3M | } |
340 | | /* accept it, and move on */ |
341 | 14.0M | } |
342 | 569k | finish_string_lex: |
343 | | /* tell our buddy, the parser, wether he needs to process this string |
344 | | * again */ |
345 | 569k | if (hasEscapes && tok == yajl_tok_string) { |
346 | 3.26k | tok = yajl_tok_string_with_escapes; |
347 | 3.26k | } |
348 | | |
349 | 569k | return tok; |
350 | 569k | } |
351 | | |
352 | 65.4M | #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof; |
353 | | |
354 | | static yajl_tok |
355 | | yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, |
356 | | unsigned int jsonTextLen, unsigned int * offset) |
357 | 1.70M | { |
358 | | /** XXX: numbers are the only entities in json that we must lex |
359 | | * _beyond_ in order to know that they are complete. There |
360 | | * is an ambiguous case for integers at EOF. */ |
361 | | |
362 | 1.70M | unsigned char c; |
363 | | |
364 | 1.70M | yajl_tok tok = yajl_tok_integer; |
365 | | |
366 | 1.70M | RETURN_IF_EOF; |
367 | 1.70M | c = readChar(lexer, jsonText, offset); |
368 | | |
369 | | /* optional leading minus */ |
370 | 1.70M | if (c == '-') { |
371 | 13.1k | RETURN_IF_EOF; |
372 | 13.1k | c = readChar(lexer, jsonText, offset); |
373 | 13.1k | } |
374 | | |
375 | | /* a single zero, or a series of integers */ |
376 | 1.70M | if (c == '0') { |
377 | 1.62M | RETURN_IF_EOF; |
378 | 1.62M | c = readChar(lexer, jsonText, offset); |
379 | 1.62M | } else if (c >= '1' && c <= '9') { |
380 | 4.05M | do { |
381 | 4.05M | RETURN_IF_EOF; |
382 | 4.05M | c = readChar(lexer, jsonText, offset); |
383 | 4.05M | } while (c >= '0' && c <= '9'); |
384 | 77.5k | } else { |
385 | 36 | unreadChar(lexer, offset); |
386 | 36 | lexer->error = yajl_lex_missing_integer_after_minus; |
387 | 36 | return yajl_tok_error; |
388 | 36 | } |
389 | | |
390 | | /* optional fraction (indicates this is floating point) */ |
391 | 1.70M | if (c == '.') { |
392 | 16.9k | int numRd = 0; |
393 | | |
394 | 16.9k | RETURN_IF_EOF; |
395 | 16.9k | c = readChar(lexer, jsonText, offset); |
396 | | |
397 | 3.78M | while (c >= '0' && c <= '9') { |
398 | 3.77M | numRd++; |
399 | 3.77M | RETURN_IF_EOF; |
400 | 3.77M | c = readChar(lexer, jsonText, offset); |
401 | 3.77M | } |
402 | | |
403 | 16.8k | if (!numRd) { |
404 | 48 | unreadChar(lexer, offset); |
405 | 48 | lexer->error = yajl_lex_missing_integer_after_decimal; |
406 | 48 | return yajl_tok_error; |
407 | 48 | } |
408 | 16.8k | tok = yajl_tok_double; |
409 | 16.8k | } |
410 | | |
411 | | /* optional exponent (indicates this is floating point) */ |
412 | 1.70M | if (c == 'e' || c == 'E') { |
413 | 17.1k | RETURN_IF_EOF; |
414 | 17.1k | c = readChar(lexer, jsonText, offset); |
415 | | |
416 | | /* optional sign */ |
417 | 17.1k | if (c == '+' || c == '-') { |
418 | 4.48k | RETURN_IF_EOF; |
419 | 4.46k | c = readChar(lexer, jsonText, offset); |
420 | 4.46k | } |
421 | | |
422 | 17.1k | if (c >= '0' && c <= '9') { |
423 | 678k | do { |
424 | 678k | RETURN_IF_EOF; |
425 | 678k | c = readChar(lexer, jsonText, offset); |
426 | 678k | } while (c >= '0' && c <= '9'); |
427 | 17.0k | } else { |
428 | 51 | unreadChar(lexer, offset); |
429 | 51 | lexer->error = yajl_lex_missing_integer_after_exponent; |
430 | 51 | return yajl_tok_error; |
431 | 51 | } |
432 | 17.0k | tok = yajl_tok_double; |
433 | 17.0k | } |
434 | | |
435 | | /* we always go "one too far" */ |
436 | 1.70M | unreadChar(lexer, offset); |
437 | | |
438 | 1.70M | return tok; |
439 | 1.70M | } |
440 | | |
441 | | static yajl_tok |
442 | | yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, |
443 | | unsigned int jsonTextLen, unsigned int * offset) |
444 | 21.3k | { |
445 | 21.3k | unsigned char c; |
446 | | |
447 | 21.3k | yajl_tok tok = yajl_tok_comment; |
448 | | |
449 | 21.3k | RETURN_IF_EOF; |
450 | 21.3k | c = readChar(lexer, jsonText, offset); |
451 | | |
452 | | /* either slash or star expected */ |
453 | 21.3k | if (c == '/') { |
454 | | /* now we throw away until end of line */ |
455 | 26.4M | do { |
456 | 26.4M | RETURN_IF_EOF; |
457 | 26.4M | c = readChar(lexer, jsonText, offset); |
458 | 26.4M | } while (c != '\n'); |
459 | 13.3k | } else if (c == '*') { |
460 | | /* now we throw away until end of comment */ |
461 | 27.0M | for (;;) { |
462 | 27.0M | RETURN_IF_EOF; |
463 | 27.0M | c = readChar(lexer, jsonText, offset); |
464 | 27.0M | if (c == '*') { |
465 | 48.0k | RETURN_IF_EOF; |
466 | 47.9k | c = readChar(lexer, jsonText, offset); |
467 | 47.9k | if (c == '/') { |
468 | 13.2k | break; |
469 | 34.7k | } else { |
470 | 34.7k | unreadChar(lexer, offset); |
471 | 34.7k | } |
472 | 47.9k | } |
473 | 27.0M | } |
474 | 13.3k | } else { |
475 | 18 | lexer->error = yajl_lex_invalid_char; |
476 | 18 | tok = yajl_tok_error; |
477 | 18 | } |
478 | | |
479 | 21.1k | return tok; |
480 | 21.3k | } |
481 | | |
482 | | yajl_tok |
483 | | yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, |
484 | | unsigned int jsonTextLen, unsigned int * offset, |
485 | | const unsigned char ** outBuf, unsigned int * outLen) |
486 | 42.6M | { |
487 | 42.6M | yajl_tok tok = yajl_tok_error; |
488 | 42.6M | unsigned char c; |
489 | 42.6M | unsigned int startOffset = *offset; |
490 | | |
491 | 42.6M | *outBuf = NULL; |
492 | 42.6M | *outLen = 0; |
493 | | |
494 | 42.7M | for (;;) { |
495 | 42.7M | assert(*offset <= jsonTextLen); |
496 | | |
497 | 42.7M | if (*offset >= jsonTextLen) { |
498 | 787 | tok = yajl_tok_eof; |
499 | 787 | goto lexed; |
500 | 787 | } |
501 | | |
502 | 42.7M | c = readChar(lexer, jsonText, offset); |
503 | | |
504 | 42.7M | switch (c) { |
505 | 539k | case '{': |
506 | 539k | tok = yajl_tok_left_bracket; |
507 | 539k | goto lexed; |
508 | 1.11k | case '}': |
509 | 1.11k | tok = yajl_tok_right_bracket; |
510 | 1.11k | goto lexed; |
511 | 33.1M | case '[': |
512 | 33.1M | tok = yajl_tok_left_brace; |
513 | 33.1M | goto lexed; |
514 | 6.06M | case ']': |
515 | 6.06M | tok = yajl_tok_right_brace; |
516 | 6.06M | goto lexed; |
517 | 23.0k | case ',': |
518 | 23.0k | tok = yajl_tok_comma; |
519 | 23.0k | goto lexed; |
520 | 538k | case ':': |
521 | 538k | tok = yajl_tok_colon; |
522 | 538k | goto lexed; |
523 | 107k | case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': |
524 | 107k | startOffset++; |
525 | 107k | break; |
526 | 1.49k | case 't': { |
527 | 1.49k | const char * want = "rue"; |
528 | 4.39k | do { |
529 | 4.39k | if (*offset >= jsonTextLen) { |
530 | 12 | tok = yajl_tok_eof; |
531 | 12 | goto lexed; |
532 | 12 | } |
533 | 4.38k | c = readChar(lexer, jsonText, offset); |
534 | 4.38k | if (c != *want) { |
535 | 42 | unreadChar(lexer, offset); |
536 | 42 | lexer->error = yajl_lex_invalid_string; |
537 | 42 | tok = yajl_tok_error; |
538 | 42 | goto lexed; |
539 | 42 | } |
540 | 4.38k | } while (*(++want)); |
541 | 1.43k | tok = yajl_tok_bool; |
542 | 1.43k | goto lexed; |
543 | 1.49k | } |
544 | 493 | case 'f': { |
545 | 493 | const char * want = "alse"; |
546 | 1.82k | do { |
547 | 1.82k | if (*offset >= jsonTextLen) { |
548 | 14 | tok = yajl_tok_eof; |
549 | 14 | goto lexed; |
550 | 14 | } |
551 | 1.81k | c = readChar(lexer, jsonText, offset); |
552 | 1.81k | if (c != *want) { |
553 | 43 | unreadChar(lexer, offset); |
554 | 43 | lexer->error = yajl_lex_invalid_string; |
555 | 43 | tok = yajl_tok_error; |
556 | 43 | goto lexed; |
557 | 43 | } |
558 | 1.81k | } while (*(++want)); |
559 | 436 | tok = yajl_tok_bool; |
560 | 436 | goto lexed; |
561 | 493 | } |
562 | 3.74k | case 'n': { |
563 | 3.74k | const char * want = "ull"; |
564 | 11.1k | do { |
565 | 11.1k | if (*offset >= jsonTextLen) { |
566 | 15 | tok = yajl_tok_eof; |
567 | 15 | goto lexed; |
568 | 15 | } |
569 | 11.1k | c = readChar(lexer, jsonText, offset); |
570 | 11.1k | if (c != *want) { |
571 | 31 | unreadChar(lexer, offset); |
572 | 31 | lexer->error = yajl_lex_invalid_string; |
573 | 31 | tok = yajl_tok_error; |
574 | 31 | goto lexed; |
575 | 31 | } |
576 | 11.1k | } while (*(++want)); |
577 | 3.69k | tok = yajl_tok_null; |
578 | 3.69k | goto lexed; |
579 | 3.74k | } |
580 | 569k | case '"': { |
581 | 569k | tok = yajl_lex_string(lexer, (const unsigned char *) jsonText, |
582 | 569k | jsonTextLen, offset); |
583 | 569k | goto lexed; |
584 | 3.74k | } |
585 | 13.1k | case '-': |
586 | 1.65M | case '0': case '1': case '2': case '3': case '4': |
587 | 1.70M | case '5': case '6': case '7': case '8': case '9': { |
588 | | /* integer parsing wants to start from the beginning */ |
589 | 1.70M | unreadChar(lexer, offset); |
590 | 1.70M | tok = yajl_lex_number(lexer, (const unsigned char *) jsonText, |
591 | 1.70M | jsonTextLen, offset); |
592 | 1.70M | goto lexed; |
593 | 1.68M | } |
594 | 21.3k | case '/': |
595 | | /* hey, look, a probable comment! If comments are disabled |
596 | | * it's an error. */ |
597 | 21.3k | if (!lexer->allowComments) { |
598 | 0 | unreadChar(lexer, offset); |
599 | 0 | lexer->error = yajl_lex_unallowed_comment; |
600 | 0 | tok = yajl_tok_error; |
601 | 0 | goto lexed; |
602 | 0 | } |
603 | | /* if comments are enabled, then we should try to lex |
604 | | * the thing. possible outcomes are |
605 | | * - successful lex (tok_comment, which means continue), |
606 | | * - malformed comment opening (slash not followed by |
607 | | * '*' or '/') (tok_error) |
608 | | * - eof hit. (tok_eof) */ |
609 | 21.3k | tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText, |
610 | 21.3k | jsonTextLen, offset); |
611 | 21.3k | if (tok == yajl_tok_comment) { |
612 | | /* "error" is silly, but that's the initial |
613 | | * state of tok. guilty until proven innocent. */ |
614 | 21.1k | tok = yajl_tok_error; |
615 | 21.1k | yajl_buf_clear(lexer->buf); |
616 | 21.1k | lexer->bufInUse = 0; |
617 | 21.1k | startOffset = *offset; |
618 | 21.1k | break; |
619 | 21.1k | } |
620 | | /* hit error or eof, bail */ |
621 | 147 | goto lexed; |
622 | 213 | default: |
623 | 213 | lexer->error = yajl_lex_invalid_char; |
624 | 213 | tok = yajl_tok_error; |
625 | 213 | goto lexed; |
626 | 42.7M | } |
627 | 42.7M | } |
628 | | |
629 | | |
630 | 42.6M | lexed: |
631 | | /* need to append to buffer if the buffer is in use or |
632 | | * if it's an EOF token */ |
633 | 42.6M | if (tok == yajl_tok_eof || lexer->bufInUse) { |
634 | 1.38k | if (!lexer->bufInUse) yajl_buf_clear(lexer->buf); |
635 | 1.38k | lexer->bufInUse = 1; |
636 | 1.38k | yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset); |
637 | 1.38k | lexer->bufOff = 0; |
638 | | |
639 | 1.38k | if (yajl_buf_err(lexer->buf)) { |
640 | 0 | lexer->error = yajl_lex_alloc_failed; |
641 | 0 | return yajl_tok_error; |
642 | 0 | } |
643 | | |
644 | 1.38k | if (tok != yajl_tok_eof) { |
645 | 0 | *outBuf = yajl_buf_data(lexer->buf); |
646 | 0 | *outLen = yajl_buf_len(lexer->buf); |
647 | 0 | lexer->bufInUse = 0; |
648 | 0 | } |
649 | 42.6M | } else if (tok != yajl_tok_error) { |
650 | 42.6M | *outBuf = jsonText + startOffset; |
651 | 42.6M | *outLen = *offset - startOffset; |
652 | 42.6M | } |
653 | | |
654 | | /* special case for strings. skip the quotes. */ |
655 | 42.6M | if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes) |
656 | 569k | { |
657 | 569k | assert(*outLen >= 2); |
658 | 569k | (*outBuf)++; |
659 | 569k | *outLen -= 2; |
660 | 569k | } |
661 | | |
662 | | |
663 | | #ifdef YAJL_LEXER_DEBUG |
664 | | if (tok == yajl_tok_error) { |
665 | | printf("lexical error: %s\n", |
666 | | yajl_lex_error_to_string(yajl_lex_get_error(lexer))); |
667 | | } else if (tok == yajl_tok_eof) { |
668 | | printf("EOF hit\n"); |
669 | | } else { |
670 | | printf("lexed %s: '", tokToStr(tok)); |
671 | | fwrite(*outBuf, 1, *outLen, stdout); |
672 | | printf("'\n"); |
673 | | } |
674 | | #endif |
675 | | |
676 | 42.6M | return tok; |
677 | 42.6M | } |
678 | | |
679 | | const char * |
680 | | yajl_lex_error_to_string(yajl_lex_error error) |
681 | 0 | { |
682 | 0 | switch (error) { |
683 | 0 | case yajl_lex_e_ok: |
684 | 0 | return "ok, no error"; |
685 | 0 | case yajl_lex_string_invalid_utf8: |
686 | 0 | return "invalid bytes in UTF8 string."; |
687 | 0 | case yajl_lex_string_invalid_escaped_char: |
688 | 0 | return "inside a string, '\\' occurs before a character " |
689 | 0 | "which it may not."; |
690 | 0 | case yajl_lex_string_invalid_json_char: |
691 | 0 | return "invalid character inside string."; |
692 | 0 | case yajl_lex_string_invalid_hex_char: |
693 | 0 | return "invalid (non-hex) character occurs after '\\u' inside " |
694 | 0 | "string."; |
695 | 0 | case yajl_lex_invalid_char: |
696 | 0 | return "invalid char in json text."; |
697 | 0 | case yajl_lex_invalid_string: |
698 | 0 | return "invalid string in json text."; |
699 | 0 | case yajl_lex_missing_integer_after_exponent: |
700 | 0 | return "malformed number, a digit is required after the exponent."; |
701 | 0 | case yajl_lex_missing_integer_after_decimal: |
702 | 0 | return "malformed number, a digit is required after the " |
703 | 0 | "decimal point."; |
704 | 0 | case yajl_lex_missing_integer_after_minus: |
705 | 0 | return "malformed number, a digit is required after the " |
706 | 0 | "minus sign."; |
707 | 0 | case yajl_lex_unallowed_comment: |
708 | 0 | return "probable comment found in input text, comments are " |
709 | 0 | "not enabled."; |
710 | 0 | case yajl_lex_alloc_failed: |
711 | 0 | return "allocation failed"; |
712 | 0 | } |
713 | 0 | return "unknown error code"; |
714 | 0 | } |
715 | | |
716 | | |
717 | | /** allows access to more specific information about the lexical |
718 | | * error when yajl_lex_lex returns yajl_tok_error. */ |
719 | | yajl_lex_error |
720 | | yajl_lex_get_error(yajl_lexer lexer) |
721 | 0 | { |
722 | 0 | if (lexer == NULL) return (yajl_lex_error) -1; |
723 | 0 | return lexer->error; |
724 | 0 | } |
725 | | |
726 | | unsigned int yajl_lex_current_line(yajl_lexer lexer) |
727 | 0 | { |
728 | 0 | return lexer->lineOff; |
729 | 0 | } |
730 | | |
731 | | unsigned int yajl_lex_current_char(yajl_lexer lexer) |
732 | 0 | { |
733 | 0 | return lexer->charOff; |
734 | 0 | } |
735 | | |
736 | | yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, |
737 | | unsigned int jsonTextLen, unsigned int offset) |
738 | 0 | { |
739 | 0 | const unsigned char * outBuf; |
740 | 0 | unsigned int outLen; |
741 | 0 | unsigned int bufLen = yajl_buf_len(lexer->buf); |
742 | 0 | unsigned int bufOff = lexer->bufOff; |
743 | 0 | unsigned int bufInUse = lexer->bufInUse; |
744 | 0 | yajl_tok tok; |
745 | | |
746 | 0 | tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset, |
747 | 0 | &outBuf, &outLen); |
748 | |
|
749 | 0 | if (tok == yajl_tok_eof) { |
750 | 0 | return tok; |
751 | 0 | } |
752 | | |
753 | 0 | lexer->bufOff = bufOff; |
754 | 0 | lexer->bufInUse = bufInUse; |
755 | 0 | yajl_buf_truncate(lexer->buf, bufLen); |
756 | | |
757 | 0 | return tok; |
758 | 0 | } |