Line | Count | Source |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | #include <string.h> |
4 | | #include <assert.h> |
5 | | #include "jv.h" |
6 | | #include "jv_dtoa.h" |
7 | | #include "jv_unicode.h" |
8 | | #include "jv_alloc.h" |
9 | | #include "jv_dtoa.h" |
10 | | |
// Result of one parser step: 0 when there is nothing to report, otherwise a
// pointer to a static message (an error text, or the OK sentinel defined below
// in this file).
typedef const char* presult;

// Nesting limit checked by parse_token() before opening an array or object.
// Can be overridden from the build command line.
#ifndef MAX_PARSING_DEPTH
#define MAX_PARSING_DEPTH (10000)
#endif

// Evaluate x once; if it produced a non-zero presult, return that presult
// from the enclosing function.
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
// pfunc is the return type used by the parser steps. With GCC-compatible
// compilers it additionally warns when a caller ignores the result.
#ifdef __GNUC__
#define pfunc __attribute__((warn_unused_result)) presult
#else
#define pfunc presult
#endif
23 | | |
// Last significant thing the streaming parser saw (kept in
// jv_parser.last_seen and consulted by stream_token() and value()).
// The structural members use the character itself as their value.
enum last_seen {
  JV_LAST_NONE = 0,
  JV_LAST_OPEN_ARRAY = '[',
  JV_LAST_OPEN_OBJECT = '{',
  JV_LAST_COLON = ':',
  JV_LAST_COMMA = ',',
  JV_LAST_VALUE = 'V',
};
32 | | |
// Complete state of one JSON parser instance. The same structure serves the
// regular (tree-building) parser and the streaming parser; the trailing
// comments say which of the two uses a field.
struct jv_parser {
  // Input window handed over by jv_parser_set_buf().
  const char* curr_buf;
  int curr_buf_length;
  int curr_buf_pos;          // index of the next byte to scan
  int curr_buf_is_partial;   // non-zero: more buffers follow, end of buffer is not EOF
  int eof;                   // set once jv_parser_next() has reached EOF
  // Number of UTF-8 BOM bytes matched so far; sizeof(UTF8_BOM) once BOM
  // handling is finished, 0xff when a partial (malformed) BOM was found.
  unsigned bom_strip_position;

  int flags;                 // JV_PARSE_* bits given at init time

  jv* stack;                 // parser
  int stackpos;              // parser
  int stacklen;              // both (optimization; it's really pathlen for streaming)
  jv path;                   // streamer
  enum last_seen last_seen;  // streamer
  jv output;                 // streamer
  jv next;                   // both

  // Bytes of the literal or string currently being collected.
  char* tokenbuf;
  int tokenpos;              // bytes used
  int tokenlen;              // bytes allocated

  int line, column;          // position reported in error messages

  struct dtoa_context dtoa;

  enum {
    JV_PARSER_NORMAL,
    JV_PARSER_STRING,
    JV_PARSER_STRING_ESCAPE,
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
  } st;
  unsigned int last_ch_was_ws:1; // set by scan() when the last character was whitespace
};
67 | | |
68 | | |
69 | 668k | static void parser_init(struct jv_parser* p, int flags) { |
70 | 668k | p->flags = flags; |
71 | 668k | if ((p->flags & JV_PARSE_STREAMING)) { |
72 | 0 | p->path = jv_array(); |
73 | 668k | } else { |
74 | 668k | p->path = jv_invalid(); |
75 | 668k | p->flags &= ~(JV_PARSE_STREAM_ERRORS); |
76 | 668k | } |
77 | 668k | p->stack = 0; |
78 | 668k | p->stacklen = p->stackpos = 0; |
79 | 668k | p->last_seen = JV_LAST_NONE; |
80 | 668k | p->output = jv_invalid(); |
81 | 668k | p->next = jv_invalid(); |
82 | 668k | p->tokenbuf = 0; |
83 | 668k | p->tokenlen = p->tokenpos = 0; |
84 | 668k | if ((p->flags & JV_PARSE_SEQ)) |
85 | 0 | p->st = JV_PARSER_WAITING_FOR_RS; |
86 | 668k | else |
87 | 668k | p->st = JV_PARSER_NORMAL; |
88 | 668k | p->eof = 0; |
89 | 668k | p->curr_buf = 0; |
90 | 668k | p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0; |
91 | 668k | p->bom_strip_position = 0; |
92 | 668k | p->last_ch_was_ws = 0; |
93 | 668k | p->line = 1; |
94 | 668k | p->column = 0; |
95 | 668k | jvp_dtoa_context_init(&p->dtoa); |
96 | 668k | } |
97 | | |
98 | 976k | static void parser_reset(struct jv_parser* p) { |
99 | 976k | if ((p->flags & JV_PARSE_STREAMING)) { |
100 | 0 | jv_free(p->path); |
101 | 0 | p->path = jv_array(); |
102 | 0 | p->stacklen = 0; |
103 | 0 | } |
104 | 976k | p->last_seen = JV_LAST_NONE; |
105 | 976k | jv_free(p->output); |
106 | 976k | p->output = jv_invalid(); |
107 | 976k | jv_free(p->next); |
108 | 976k | p->next = jv_invalid(); |
109 | 976k | for (int i=0; i<p->stackpos; i++) |
110 | 0 | jv_free(p->stack[i]); |
111 | 976k | p->stackpos = 0; |
112 | 976k | p->tokenpos = 0; |
113 | 976k | p->st = JV_PARSER_NORMAL; |
114 | 976k | } |
115 | | |
116 | 668k | static void parser_free(struct jv_parser* p) { |
117 | 668k | parser_reset(p); |
118 | 668k | jv_free(p->path); |
119 | 668k | jv_free(p->output); |
120 | 668k | jv_mem_free(p->stack); |
121 | 668k | jv_mem_free(p->tokenbuf); |
122 | 668k | jvp_dtoa_context_free(&p->dtoa); |
123 | 668k | } |
124 | | |
125 | 360k | static pfunc value(struct jv_parser* p, jv val) { |
126 | 360k | if ((p->flags & JV_PARSE_STREAMING)) { |
127 | 0 | if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) { |
128 | 0 | jv_free(val); |
129 | 0 | return "Expected separator between values"; |
130 | 0 | } |
131 | 0 | if (p->stacklen > 0) |
132 | 0 | p->last_seen = JV_LAST_VALUE; |
133 | 0 | else |
134 | 0 | p->last_seen = JV_LAST_NONE; |
135 | 360k | } else { |
136 | 360k | if (jv_is_valid(p->next)) { |
137 | 0 | jv_free(val); |
138 | 0 | return "Expected separator between values"; |
139 | 0 | } |
140 | 360k | } |
141 | 360k | jv_free(p->next); |
142 | 360k | p->next = val; |
143 | 360k | return 0; |
144 | 360k | } |
145 | | |
146 | 0 | static void push(struct jv_parser* p, jv v) { |
147 | 0 | assert(p->stackpos <= p->stacklen); |
148 | 0 | if (p->stackpos == p->stacklen) { |
149 | 0 | p->stacklen = p->stacklen * 2 + 10; |
150 | 0 | p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv)); |
151 | 0 | } |
152 | 0 | assert(p->stackpos < p->stacklen); |
153 | 0 | p->stack[p->stackpos++] = v; |
154 | 0 | } |
155 | | |
156 | 0 | static pfunc parse_token(struct jv_parser* p, char ch) { |
157 | 0 | switch (ch) { |
158 | 0 | case '[': |
159 | 0 | if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing"; |
160 | 0 | if (jv_is_valid(p->next)) return "Expected separator between values"; |
161 | 0 | push(p, jv_array()); |
162 | 0 | break; |
163 | | |
164 | 0 | case '{': |
165 | 0 | if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing"; |
166 | 0 | if (jv_is_valid(p->next)) return "Expected separator between values"; |
167 | 0 | push(p, jv_object()); |
168 | 0 | break; |
169 | | |
170 | 0 | case ':': |
171 | 0 | if (!jv_is_valid(p->next)) |
172 | 0 | return "Expected string key before ':'"; |
173 | 0 | if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT) |
174 | 0 | return "':' not as part of an object"; |
175 | 0 | if (jv_get_kind(p->next) != JV_KIND_STRING) |
176 | 0 | return "Object keys must be strings"; |
177 | 0 | push(p, p->next); |
178 | 0 | p->next = jv_invalid(); |
179 | 0 | break; |
180 | | |
181 | 0 | case ',': |
182 | 0 | if (!jv_is_valid(p->next)) |
183 | 0 | return "Expected value before ','"; |
184 | 0 | if (p->stackpos == 0) |
185 | 0 | return "',' not as part of an object or array"; |
186 | 0 | if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) { |
187 | 0 | p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next); |
188 | 0 | p->next = jv_invalid(); |
189 | 0 | } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) { |
190 | 0 | assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT); |
191 | 0 | p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], |
192 | 0 | p->stack[p->stackpos-1], p->next); |
193 | 0 | p->stackpos--; |
194 | 0 | p->next = jv_invalid(); |
195 | 0 | } else { |
196 | | // this case hits on input like {"a", "b"} |
197 | 0 | return "Objects must consist of key:value pairs"; |
198 | 0 | } |
199 | 0 | break; |
200 | | |
201 | 0 | case ']': |
202 | 0 | if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY) |
203 | 0 | return "Unmatched ']'"; |
204 | 0 | if (jv_is_valid(p->next)) { |
205 | 0 | p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next); |
206 | 0 | p->next = jv_invalid(); |
207 | 0 | } else { |
208 | 0 | if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) { |
209 | | // this case hits on input like [1,2,3,] |
210 | 0 | return "Expected another array element"; |
211 | 0 | } |
212 | 0 | } |
213 | 0 | jv_free(p->next); |
214 | 0 | p->next = p->stack[--p->stackpos]; |
215 | 0 | break; |
216 | | |
217 | 0 | case '}': |
218 | 0 | if (p->stackpos == 0) |
219 | 0 | return "Unmatched '}'"; |
220 | 0 | if (jv_is_valid(p->next)) { |
221 | 0 | if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING) |
222 | 0 | return "Objects must consist of key:value pairs"; |
223 | 0 | assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT); |
224 | 0 | p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], |
225 | 0 | p->stack[p->stackpos-1], p->next); |
226 | 0 | p->stackpos--; |
227 | 0 | p->next = jv_invalid(); |
228 | 0 | } else { |
229 | 0 | if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT) |
230 | 0 | return "Unmatched '}'"; |
231 | 0 | if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0) |
232 | 0 | return "Expected another key-value pair"; |
233 | 0 | } |
234 | 0 | jv_free(p->next); |
235 | 0 | p->next = p->stack[--p->stackpos]; |
236 | 0 | break; |
237 | 0 | } |
238 | 0 | return 0; |
239 | 0 | } |
240 | | |
/*
 * Handle one structural character in the streaming parser.
 *
 * Instead of building a tree, the streamer keeps the path to the current
 * position in p->path (p->stacklen entries). Each entry describes one open
 * container: a number is the current index of an array, null is an object
 * that is still waiting for its next key, and a string is the key of the
 * object member being read. Events are left in p->output as [path, leaf] or
 * [path, leaf, true] (last element of a container) or [path] (closing of a
 * non-empty container) for stream_check_done() to hand out.
 *
 * Returns 0 on success or an error message.
 */
static pfunc stream_token(struct jv_parser* p, char ch) {
  jv_kind k;
  jv last;

  switch (ch) {
  case '[':
    if (jv_is_valid(p->next))
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {["foo"]}
      return "Expected string key after '{', not '['";
    if (p->last_seen == JV_LAST_COMMA) {
      // After a comma only an enclosing array (numeric path entry) may take a value directly
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",["foo"]}
        return "Expected string key after ',' in object, not '['";
    }
    p->path = jv_array_append(p->path, jv_number(0)); // push
    p->last_seen = JV_LAST_OPEN_ARRAY;
    p->stacklen++;
    break;

  case '{':
    if (p->last_seen == JV_LAST_VALUE)
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {{"foo":"bar"}}
      return "Expected string key after '{', not '{'";
    if (p->last_seen == JV_LAST_COMMA) {
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",{"foo":"bar"}}
        return "Expected string key after ',' in object, not '{'";
    }
    // Push object key: null, since we don't know it yet
    p->path = jv_array_append(p->path, jv_null()); // push
    p->last_seen = JV_LAST_OPEN_OBJECT;
    p->stacklen++;
    break;

  case ':':
    last = jv_invalid();
    // `last` is only fetched when there is an enclosing container; it is freed on both outcomes
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
      jv_free(last);
      return "':' not as part of an object";
    }
    jv_free(last);
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
      return "Expected string key before ':'";
    if (jv_get_kind(p->next) != JV_KIND_STRING)
      return "Object keys must be strings";
    if (p->last_seen != JV_LAST_VALUE)
      return "':' should follow a key";
    p->last_seen = JV_LAST_COLON;
    // The pending string becomes the key in the innermost path entry
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
    p->next = jv_invalid();
    break;

  case ',':
    if (p->last_seen != JV_LAST_VALUE)
      return "Expected value before ','";
    if (p->stacklen == 0)
      return "',' not as part of an object or array";
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    if (k == JV_KIND_NUMBER) {
      int idx = jv_number_value(last);

      if (jv_is_valid(p->next)) {
        // Emit [path, leaf] for the element that just ended
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      // Advance the array index for the next element
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
      p->last_seen = JV_LAST_COMMA;
    } else if (k == JV_KIND_STRING) {
      if (jv_is_valid(p->next)) {
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
      p->last_seen = JV_LAST_COMMA;
    } else {
      assert(k == JV_KIND_NULL);
      // this case hits on input like {,}
      // make sure to handle input like {"a", "b"} and {"a":, ...}
      jv_free(last);
      return "Objects must consist of key:value pairs";
    }
    jv_free(last);
    break;

  case ']':
    if (p->stacklen == 0)
      return "Unmatched ']' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another array element";
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);

    if (k != JV_KIND_NUMBER)
      return "Unmatched ']' in the middle of an object";
    if (jv_is_valid(p->next)) {
      // Last element: the third entry makes stream_check_done() emit a closing event afterwards
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
      // Non-empty array whose last element was itself a container: closing event only
      p->output = JV_ARRAY(jv_copy(p->path));
    }

    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    //assert(!jv_is_valid(p->next));
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;

  case '}':
    if (p->stacklen == 0)
      return "Unmatched '}' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another key:value pair";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);
    if (k == JV_KIND_NUMBER)
      return "Unmatched '}' in the middle of an array";

    if (jv_is_valid(p->next)) {
      if (k != JV_KIND_STRING)
        return "Objects must consist of key:value pairs";
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else {
      // Perhaps {"a":[]}
      if (p->last_seen == JV_LAST_COLON)
        // Looks like {"a":}
        return "Missing value in key:value pair";
      // NOTE(review): JV_LAST_COMMA was already rejected near the top of this
      // case, so the next check cannot fire.
      if (p->last_seen == JV_LAST_COMMA)
        // Looks like {"a":0,}
        return "Expected another key-value pair";
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
        return "Unmatched '}' in the middle of an array";
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
        return "Unmatched '}'";
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
        p->output = JV_ARRAY(jv_copy(p->path));
    }
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty objects are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;
  }
  return 0;
}
420 | | |
421 | 23.1M | static void tokenadd(struct jv_parser* p, char c) { |
422 | 23.1M | assert(p->tokenpos <= p->tokenlen); |
423 | 23.1M | if (p->tokenpos >= (p->tokenlen - 1)) { |
424 | 669k | p->tokenlen = p->tokenlen*2 + 256; |
425 | 669k | p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen); |
426 | 669k | } |
427 | 23.1M | assert(p->tokenpos < p->tokenlen); |
428 | 23.1M | p->tokenbuf[p->tokenpos++] = c; |
429 | 23.1M | } |
430 | | |
/*
 * Decode exactly four hexadecimal digits (either case) starting at hex.
 *
 * Returns the value (0..0xFFFF), or -1 as soon as a non-hex character is met.
 * The caller must make sure four bytes are readable unless an earlier byte is
 * not a hex digit.
 */
static int unhex4(char* hex) {
  int acc = 0;
  int remaining = 4;

  while (remaining-- > 0) {
    const char d = *hex++;
    int nibble;

    if (d >= '0' && d <= '9')
      nibble = d - '0';
    else if (d >= 'a' && d <= 'f')
      nibble = 10 + (d - 'a');
    else if (d >= 'A' && d <= 'F')
      nibble = 10 + (d - 'A');
    else
      return -1;

    acc = (acc << 4) | nibble;
  }
  return acc;
}
445 | | |
446 | 291k | static pfunc found_string(struct jv_parser* p) { |
447 | 291k | char* in = p->tokenbuf; |
448 | 291k | char* out = p->tokenbuf; |
449 | 291k | char* end = p->tokenbuf + p->tokenpos; |
450 | | |
451 | 296k | while (in < end) { |
452 | 292k | char c = *in++; |
453 | 292k | if (c == '\\') { |
454 | 292k | if (in >= end) |
455 | 0 | return "Expected escape character at end of string"; |
456 | 292k | c = *in++; |
457 | 292k | switch (c) { |
458 | 2.26k | case '\\': |
459 | 3.46k | case '"': |
460 | 3.50k | case '/': *out++ = c; break; |
461 | 17 | case 'b': *out++ = '\b'; break; |
462 | 8 | case 'f': *out++ = '\f'; break; |
463 | 1 | case 't': *out++ = '\t'; break; |
464 | 6 | case 'n': *out++ = '\n'; break; |
465 | 403 | case 'r': *out++ = '\r'; break; |
466 | | |
467 | 2.69k | case 'u': |
468 | | /* ahh, the complicated case */ |
469 | 2.69k | if (in + 4 > end) |
470 | 345 | return "Invalid \\uXXXX escape"; |
471 | 2.34k | int hexvalue = unhex4(in); |
472 | 2.34k | if (hexvalue < 0) |
473 | 1.15k | return "Invalid characters in \\uXXXX escape"; |
474 | 1.19k | unsigned long codepoint = (unsigned long)hexvalue; |
475 | 1.19k | in += 4; |
476 | 1.19k | if (0xD800 <= codepoint && codepoint <= 0xDBFF) { |
477 | | /* who thought UTF-16 surrogate pairs were a good idea? */ |
478 | 761 | if (in + 6 > end || in[0] != '\\' || in[1] != 'u') |
479 | 144 | return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; |
480 | 617 | unsigned long surrogate = unhex4(in+2); |
481 | 617 | if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) |
482 | 489 | return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; |
483 | 128 | in += 6; |
484 | 128 | codepoint = 0x10000 + (((codepoint - 0xD800) << 10) |
485 | 128 | |(surrogate - 0xDC00)); |
486 | 128 | } |
487 | 563 | if (codepoint > 0x10FFFF) |
488 | 0 | codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER |
489 | 563 | out += jvp_utf8_encode(codepoint, out); |
490 | 563 | break; |
491 | | |
492 | 285k | default: |
493 | 285k | return "Invalid escape"; |
494 | 292k | } |
495 | 292k | } else { |
496 | 0 | if (!(c & ~0x1F)) |
497 | 0 | return "Invalid string: control characters from U+0000 through U+001F must be escaped"; |
498 | 0 | *out++ = c; |
499 | 0 | } |
500 | 292k | } |
501 | 3.76k | TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf))); |
502 | 3.76k | p->tokenpos = 0; |
503 | 3.76k | return 0; |
504 | 3.76k | } |
505 | | |
506 | 672k | static pfunc check_literal(struct jv_parser* p) { |
507 | 672k | if (p->tokenpos == 0) return 0; |
508 | | |
509 | 356k | const char* pattern = 0; |
510 | 356k | int plen; |
511 | 356k | jv v; |
512 | 356k | switch (p->tokenbuf[0]) { |
513 | 0 | case 't': pattern = "true"; plen = 4; v = jv_true(); break; |
514 | 0 | case 'f': pattern = "false"; plen = 5; v = jv_false(); break; |
515 | 0 | case '\'': |
516 | 0 | return "Invalid string literal; expected \", but got '"; |
517 | 0 | case 'n': |
518 | | // if it starts with 'n', it could be a literal "nan" |
519 | 0 | if (p->tokenpos > 1 && p->tokenbuf[1] == 'u') { |
520 | 0 | pattern = "null"; plen = 4; v = jv_null(); |
521 | 0 | } |
522 | 356k | } |
523 | 356k | if (pattern) { |
524 | 0 | if (p->tokenpos != plen) return "Invalid literal"; |
525 | 0 | for (int i=0; i<plen; i++) |
526 | 0 | if (p->tokenbuf[i] != pattern[i]) |
527 | 0 | return "Invalid literal"; |
528 | 0 | TRY(value(p, v)); |
529 | 356k | } else { |
530 | | // FIXME: better parser |
531 | 356k | p->tokenbuf[p->tokenpos] = 0; |
532 | 356k | #ifdef USE_DECNUM |
533 | 356k | jv number = jv_number_with_literal(p->tokenbuf); |
534 | 356k | if (jv_get_kind(number) == JV_KIND_INVALID) { |
535 | 0 | return "Invalid numeric literal"; |
536 | 0 | } |
537 | 356k | TRY(value(p, number)); |
538 | | #else |
539 | | char *end = 0; |
540 | | double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end); |
541 | | if (end == 0 || *end != 0) { |
542 | | return "Invalid numeric literal"; |
543 | | } |
544 | | TRY(value(p, jv_number(d))); |
545 | | #endif |
546 | 356k | } |
547 | 356k | p->tokenpos = 0; |
548 | 356k | return 0; |
549 | 356k | } |
550 | | |
/* Coarse character classes used by scan() while outside of a string. */
typedef enum {
  LITERAL,
  WHITESPACE,
  STRUCTURE,
  QUOTE,
  INVALID
} chclass;

/*
 * Classify one input character. Anything that is not JSON whitespace, a
 * double quote or a structural character counts as part of a literal;
 * INVALID is never returned.
 */
static chclass classify(char c) {
  if (c == '"')
    return QUOTE;

  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;

  if (c == '[' || c == ']' || c == '{' || c == '}' || c == ',' || c == ':')
    return STRUCTURE;

  return LITERAL;
}
579 | | |
580 | | |
// Sentinel presult meaning "a value was produced", as opposed to an error
// message. It is recognised by pointer comparison (msg == OK), not by content.
static const presult OK = "output produced";
582 | | |
583 | 22.8M | static int parse_check_done(struct jv_parser* p, jv* out) { |
584 | 22.8M | if (p->stackpos == 0 && jv_is_valid(p->next)) { |
585 | 3.76k | *out = p->next; |
586 | 3.76k | p->next = jv_invalid(); |
587 | 3.76k | return 1; |
588 | 22.8M | } else { |
589 | 22.8M | return 0; |
590 | 22.8M | } |
591 | 22.8M | } |
592 | | |
593 | 0 | static int stream_check_done(struct jv_parser* p, jv* out) { |
594 | 0 | if (p->stacklen == 0 && jv_is_valid(p->next)) { |
595 | 0 | *out = JV_ARRAY(jv_copy(p->path),p->next); |
596 | 0 | p->next = jv_invalid(); |
597 | 0 | return 1; |
598 | 0 | } else if (jv_is_valid(p->output)) { |
599 | 0 | if (jv_array_length(jv_copy(p->output)) > 2) { |
600 | | // At end of an array or object, necessitating one more output by |
601 | | // which to indicate this |
602 | 0 | *out = jv_array_slice(jv_copy(p->output), 0, 2); |
603 | 0 | p->output = jv_array_slice(p->output, 0, 1); // arrange one more output |
604 | 0 | } else { |
605 | | // No further processing needed |
606 | 0 | *out = p->output; |
607 | 0 | p->output = jv_invalid(); |
608 | 0 | } |
609 | 0 | return 1; |
610 | 0 | } else { |
611 | 0 | return 0; |
612 | 0 | } |
613 | 0 | } |
614 | | |
615 | 0 | static int seq_check_truncation(struct jv_parser* p) { |
616 | 0 | return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER)); |
617 | 0 | } |
618 | | |
619 | 0 | static int stream_seq_check_truncation(struct jv_parser* p) { |
620 | 0 | jv_kind k = jv_get_kind(p->next); |
621 | 0 | return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL); |
622 | 0 | } |
623 | | |
624 | 0 | static int parse_is_top_num(struct jv_parser* p) { |
625 | 0 | return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
626 | 0 | } |
627 | | |
628 | 0 | static int stream_is_top_num(struct jv_parser* p) { |
629 | 0 | return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
630 | 0 | } |
631 | | |
// Dispatch helpers: each picks the streaming or the regular implementation
// depending on whether JV_PARSE_STREAMING is set in the parser's flags.
// Note that the parser argument is evaluated twice.

// Is a value/event ready? If so it is stored through o.
#define check_done(p, o) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))

// Process one structural character.
#define token(p, ch) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))

// Does the text in front of an RS look truncated?
#define check_truncation(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))

// Is the pending value a top-level number?
#define is_top_num(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
643 | | |
/*
 * Feed one input character to the parser.
 *
 * p:   parser state (line/column are updated here).
 * ch:  the character.
 * out: receives a value when one is completed by this character.
 *
 * Returns 0 when nothing is to be reported, OK when *out was produced (or,
 * for an RS that ends nothing, with *out left invalid), or an error message.
 */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  p->column++;
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  }
  if ((p->flags & JV_PARSE_SEQ)
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
    // RS starts a new text; whatever came before must be complete
    if (check_truncation(p)) {
      // A pending literal that parses fine into a top-level number gets the
      // more specific message
      if (check_literal(p) == 0 && is_top_num(p))
        return "Potentially truncated top-level numeric value";
      return "Truncated value";
    }
    TRY(check_literal(p));
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
      return OK;
    // shouldn't happen?
    assert(!jv_is_valid(*out));
    parser_reset(p);
    jv_free(*out);
    *out = jv_invalid();
    return OK;
  }
  presult answer = 0;
  p->last_ch_was_ws = 0;
  if (p->st == JV_PARSER_NORMAL) {
    chclass cls = classify(ch);
    if (cls == WHITESPACE)
      p->last_ch_was_ws = 1;
    if (cls != LITERAL) {
      // Any non-literal character terminates a literal in progress
      TRY(check_literal(p));
      if (check_done(p, out)) answer = OK;
    }
    switch (cls) {
    case LITERAL:
      tokenadd(p, ch);
      break;
    case WHITESPACE:
      break;
    case QUOTE:
      p->st = JV_PARSER_STRING;
      break;
    case STRUCTURE:
      TRY(token(p, ch));
      break;
    case INVALID:
      return "Invalid character";
    }
    if (check_done(p, out)) answer = OK;
  } else {
    // Inside a string: an unescaped quote ends it, everything else is collected
    if (ch == '"' && p->st == JV_PARSER_STRING) {
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      if (check_done(p, out)) answer = OK;
    } else {
      tokenadd(p, ch);
      // A backslash shields exactly the next character from ending the string;
      // the escape itself is decoded later by found_string()
      if (ch == '\\' && p->st == JV_PARSER_STRING) {
        p->st = JV_PARSER_STRING_ESCAPE;
      } else {
        p->st = JV_PARSER_STRING;
      }
    }
  }
  return answer;
}
709 | | |
710 | 0 | struct jv_parser* jv_parser_new(int flags) { |
711 | 0 | struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser)); |
712 | 0 | parser_init(p, flags); |
713 | 0 | p->flags = flags; |
714 | 0 | return p; |
715 | 0 | } |
716 | | |
/*
 * Destroy a parser obtained from jv_parser_new(): release everything it owns
 * and then the structure itself.
 */
void jv_parser_free(struct jv_parser* p) {
  parser_free(p);
  jv_mem_free(p);
}
721 | | |
// The UTF-8 byte order mark; jv_parser_set_buf() strips it from the start of the input.
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
723 | | |
724 | 0 | int jv_parser_remaining(struct jv_parser* p) { |
725 | 0 | if (p->curr_buf == 0) |
726 | 0 | return 0; |
727 | 0 | return (p->curr_buf_length - p->curr_buf_pos); |
728 | 0 | } |
729 | | |
730 | 668k | void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) { |
731 | 668k | assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length) |
732 | 668k | && "previous buffer not exhausted"); |
733 | 1.33M | while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) { |
734 | 668k | if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) { |
735 | | // matched a BOM character |
736 | 0 | buf++; |
737 | 0 | length--; |
738 | 0 | p->bom_strip_position++; |
739 | 668k | } else { |
740 | 668k | if (p->bom_strip_position == 0) { |
741 | | // no BOM in this document |
742 | 668k | p->bom_strip_position = sizeof(UTF8_BOM); |
743 | 668k | } else { |
744 | | // malformed BOM (prefix present, rest missing) |
745 | 0 | p->bom_strip_position = 0xff; |
746 | 0 | } |
747 | 668k | } |
748 | 668k | } |
749 | 668k | p->curr_buf = buf; |
750 | 668k | p->curr_buf_length = length; |
751 | 668k | p->curr_buf_pos = 0; |
752 | 668k | p->curr_buf_is_partial = is_partial; |
753 | 668k | } |
754 | | |
755 | | static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3); |
756 | | |
757 | 308k | static jv make_error(struct jv_parser* p, const char *fmt, ...) { |
758 | 308k | va_list ap; |
759 | 308k | va_start(ap, fmt); |
760 | 308k | jv e = jv_string_vfmt(fmt, ap); |
761 | 308k | va_end(ap); |
762 | 308k | if ((p->flags & JV_PARSE_STREAM_ERRORS)) |
763 | 0 | return JV_ARRAY(e, jv_copy(p->path)); |
764 | 308k | return jv_invalid_with_msg(e); |
765 | 308k | } |
766 | | |
767 | 1.02M | jv jv_parser_next(struct jv_parser* p) { |
768 | 1.02M | if (p->eof) |
769 | 356k | return jv_invalid(); |
770 | 672k | if (!p->curr_buf) |
771 | 0 | return jv_invalid(); // Need a buffer |
772 | 672k | if (p->bom_strip_position == 0xff) { |
773 | 0 | if (!(p->flags & JV_PARSE_SEQ)) |
774 | 0 | return jv_invalid_with_msg(jv_string("Malformed BOM")); |
775 | 0 | p->st =JV_PARSER_WAITING_FOR_RS; |
776 | 0 | parser_reset(p); |
777 | 0 | } |
778 | 672k | jv value = jv_invalid(); |
779 | 672k | if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value)) |
780 | 0 | return value; |
781 | 672k | char ch; |
782 | 672k | presult msg = 0; |
783 | 24.4M | while (!msg && p->curr_buf_pos < p->curr_buf_length) { |
784 | 23.7M | ch = p->curr_buf[p->curr_buf_pos++]; |
785 | 23.7M | if (p->st == JV_PARSER_WAITING_FOR_RS) { |
786 | 0 | if (ch == '\n') { |
787 | 0 | p->line++; |
788 | 0 | p->column = 0; |
789 | 0 | } else { |
790 | 0 | p->column++; |
791 | 0 | } |
792 | 0 | if (ch == '\036') |
793 | 0 | p->st = JV_PARSER_NORMAL; |
794 | 0 | continue; // need to resync, wait for RS |
795 | 0 | } |
796 | 23.7M | msg = scan(p, ch, &value); |
797 | 23.7M | } |
798 | 672k | if (msg == OK) { |
799 | 3.76k | return value; |
800 | 668k | } else if (msg) { |
801 | 288k | jv_free(value); |
802 | 288k | if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) { |
803 | | // Skip to the next RS |
804 | 0 | p->st = JV_PARSER_WAITING_FOR_RS; |
805 | 0 | value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column); |
806 | 0 | parser_reset(p); |
807 | 0 | return value; |
808 | 0 | } |
809 | 288k | value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column); |
810 | 288k | parser_reset(p); |
811 | 288k | if (!(p->flags & JV_PARSE_SEQ)) { |
812 | | // We're not parsing a JSON text sequence; throw this buffer away. |
813 | | // XXX We should fail permanently here. |
814 | 288k | p->curr_buf = 0; |
815 | 288k | p->curr_buf_pos = 0; |
816 | 288k | } // Else ch must be RS; don't clear buf so we can start parsing again after this ch |
817 | 288k | return value; |
818 | 380k | } else if (p->curr_buf_is_partial) { |
819 | 0 | assert(p->curr_buf_pos == p->curr_buf_length); |
820 | | // need another buffer |
821 | 0 | return jv_invalid(); |
822 | 380k | } else { |
823 | | // at EOF |
824 | 380k | p->eof = 1; |
825 | 380k | assert(p->curr_buf_pos == p->curr_buf_length); |
826 | 380k | jv_free(value); |
827 | 380k | if (p->st == JV_PARSER_WAITING_FOR_RS) |
828 | 0 | return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column); |
829 | 380k | if (p->st != JV_PARSER_NORMAL) { |
830 | 20.1k | value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column); |
831 | 20.1k | parser_reset(p); |
832 | 20.1k | p->st = JV_PARSER_WAITING_FOR_RS; |
833 | 20.1k | return value; |
834 | 20.1k | } |
835 | 360k | if ((msg = check_literal(p))) { |
836 | 0 | value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column); |
837 | 0 | parser_reset(p); |
838 | 0 | p->st = JV_PARSER_WAITING_FOR_RS; |
839 | 0 | return value; |
840 | 0 | } |
841 | 360k | if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) || |
842 | 360k | (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) { |
843 | 0 | value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column); |
844 | 0 | parser_reset(p); |
845 | 0 | p->st = JV_PARSER_WAITING_FOR_RS; |
846 | 0 | return value; |
847 | 0 | } |
848 | | // p->next is either invalid (nothing here, but no syntax error) |
849 | | // or valid (this is the value). either way it's the thing to return |
850 | 360k | if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) { |
851 | 0 | value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value] |
852 | 360k | } else { |
853 | 360k | value = p->next; |
854 | 360k | } |
855 | 360k | p->next = jv_invalid(); |
856 | 360k | if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) { |
857 | 0 | jv_free(value); |
858 | 0 | return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column); |
859 | 0 | } |
860 | 360k | return value; |
861 | 360k | } |
862 | 672k | } |
863 | | |
864 | 668k | jv jv_parse_sized_custom_flags(const char* string, int length, int flags) { |
865 | 668k | struct jv_parser parser; |
866 | 668k | parser_init(&parser, flags); |
867 | 668k | jv_parser_set_buf(&parser, string, length, 0); |
868 | 668k | jv value = jv_parser_next(&parser); |
869 | 668k | if (jv_is_valid(value)) { |
870 | 360k | jv next = jv_parser_next(&parser); |
871 | 360k | if (jv_is_valid(next)) { |
872 | | // multiple JSON values, we only wanted one |
873 | 0 | jv_free(value); |
874 | 0 | jv_free(next); |
875 | 0 | value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); |
876 | 360k | } else if (jv_invalid_has_msg(jv_copy(next))) { |
877 | | // parser error after the first JSON value |
878 | 0 | jv_free(value); |
879 | 0 | value = next; |
880 | 360k | } else { |
881 | | // a single valid JSON value |
882 | 360k | jv_free(next); |
883 | 360k | } |
884 | 360k | } else if (jv_invalid_has_msg(jv_copy(value))) { |
885 | | // parse error, we'll return it |
886 | 308k | } else { |
887 | | // no value at all |
888 | 0 | jv_free(value); |
889 | 0 | value = jv_invalid_with_msg(jv_string("Expected JSON value")); |
890 | 0 | } |
891 | 668k | parser_free(&parser); |
892 | | |
893 | 668k | if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) { |
894 | 308k | jv msg = jv_invalid_get_msg(value); |
895 | 308k | value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')", |
896 | 308k | jv_string_value(msg), |
897 | 308k | string)); |
898 | 308k | jv_free(msg); |
899 | 308k | } |
900 | 668k | return value; |
901 | 668k | } |
902 | | |
903 | 668k | jv jv_parse_sized(const char* string, int length) { |
904 | 668k | return jv_parse_sized_custom_flags(string, length, 0); |
905 | 668k | } |
906 | | |
907 | 0 | jv jv_parse(const char* string) { |
908 | 0 | return jv_parse_sized(string, strlen(string)); |
909 | 0 | } |
910 | | |
911 | 0 | jv jv_parse_custom_flags(const char* string, int flags) { |
912 | 0 | return jv_parse_sized_custom_flags(string, strlen(string), flags); |
913 | 0 | } |