Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | #include <string.h> |
4 | | #include <assert.h> |
5 | | #include "jv.h" |
6 | | #include "jv_dtoa.h" |
7 | | #include "jv_unicode.h" |
8 | | #include "jv_alloc.h" |
9 | | #include "jv_dtoa.h" |
10 | | |
/* Result of a parser step: 0 when there is nothing to report, otherwise a
 * pointer to a static string (an error message, or the OK marker used by scan()). */
typedef const char* presult;

/* Nesting limit enforced by parse_token() when opening an array or object. */
#ifndef MAX_PARSING_DEPTH
#define MAX_PARSING_DEPTH (256)
#endif

/* Evaluate x once; if it yields a non-null presult, return it from the enclosing function. */
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
/* Return type of the parser steps; with GCC-compatible compilers an ignored result draws a warning. */
#ifdef __GNUC__
#define pfunc __attribute__((warn_unused_result)) presult
#else
#define pfunc presult
#endif
23 | | |
/* What the streaming parser (value() / stream_token()) saw most recently.
 * The structural members use the character itself as their value. */
enum last_seen {
  JV_LAST_NONE = 0,
  JV_LAST_OPEN_ARRAY = '[',
  JV_LAST_OPEN_OBJECT = '{',
  JV_LAST_COLON = ':',
  JV_LAST_COMMA = ',',
  JV_LAST_VALUE = 'V',
};
32 | | |
/* Complete state of one JSON parser instance (both the normal and the streaming flavour). */
struct jv_parser {
  const char* curr_buf;      // input installed by jv_parser_set_buf(); 0 when there is none
  int curr_buf_length;       // number of bytes in curr_buf
  int curr_buf_pos;          // index of the next byte jv_parser_next() will read
  int curr_buf_is_partial;   // non-zero: more buffers follow, end of buffer is not EOF
  int eof;                   // set once jv_parser_next() has reached the end of a non-partial buffer
  unsigned bom_strip_position; // BOM bytes matched so far; sizeof(UTF8_BOM) when done, 0xff when malformed

  int flags;                 // JV_PARSE_* flags

  jv* stack;     // parser
  int stackpos;  // parser
  int stacklen;  // both (optimization; it's really pathlen for streaming)
  jv path;       // streamer
  enum last_seen last_seen; // streamer
  jv output;     // streamer
  jv next;       // both

  char* tokenbuf;            // bytes of the literal or string currently being accumulated
  int tokenpos;              // number of bytes used in tokenbuf
  int tokenlen;              // allocated size of tokenbuf

  int line, column;          // position reported in error messages

  struct dtoa_context dtoa;

  enum {
    JV_PARSER_NORMAL,
    JV_PARSER_STRING,
    JV_PARSER_STRING_ESCAPE,
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
  } st;
  unsigned int last_ch_was_ws:1; // set by scan() when the last character it classified was whitespace
};
67 | | |
68 | | |
69 | 321k | static void parser_init(struct jv_parser* p, int flags) { |
70 | 321k | p->flags = flags; |
71 | 321k | if ((p->flags & JV_PARSE_STREAMING)) { |
72 | 0 | p->path = jv_array(); |
73 | 321k | } else { |
74 | 321k | p->path = jv_invalid(); |
75 | 321k | p->flags &= ~(JV_PARSE_STREAM_ERRORS); |
76 | 321k | } |
77 | 321k | p->stack = 0; |
78 | 321k | p->stacklen = p->stackpos = 0; |
79 | 321k | p->last_seen = JV_LAST_NONE; |
80 | 321k | p->output = jv_invalid(); |
81 | 321k | p->next = jv_invalid(); |
82 | 321k | p->tokenbuf = 0; |
83 | 321k | p->tokenlen = p->tokenpos = 0; |
84 | 321k | if ((p->flags & JV_PARSE_SEQ)) |
85 | 0 | p->st = JV_PARSER_WAITING_FOR_RS; |
86 | 321k | else |
87 | 321k | p->st = JV_PARSER_NORMAL; |
88 | 321k | p->eof = 0; |
89 | 321k | p->curr_buf = 0; |
90 | 321k | p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0; |
91 | 321k | p->bom_strip_position = 0; |
92 | 321k | p->last_ch_was_ws = 0; |
93 | 321k | p->line = 1; |
94 | 321k | p->column = 0; |
95 | 321k | jvp_dtoa_context_init(&p->dtoa); |
96 | 321k | } |
97 | | |
98 | 381k | static void parser_reset(struct jv_parser* p) { |
99 | 381k | if ((p->flags & JV_PARSE_STREAMING)) { |
100 | 0 | jv_free(p->path); |
101 | 0 | p->path = jv_array(); |
102 | 0 | p->stacklen = 0; |
103 | 0 | } |
104 | 381k | p->last_seen = JV_LAST_NONE; |
105 | 381k | jv_free(p->output); |
106 | 381k | p->output = jv_invalid(); |
107 | 381k | jv_free(p->next); |
108 | 381k | p->next = jv_invalid(); |
109 | 407k | for (int i=0; i<p->stackpos; i++) |
110 | 26.2k | jv_free(p->stack[i]); |
111 | 381k | p->stackpos = 0; |
112 | 381k | p->tokenpos = 0; |
113 | 381k | p->st = JV_PARSER_NORMAL; |
114 | 381k | } |
115 | | |
116 | 321k | static void parser_free(struct jv_parser* p) { |
117 | 321k | parser_reset(p); |
118 | 321k | jv_free(p->path); |
119 | 321k | jv_free(p->output); |
120 | 321k | jv_mem_free(p->stack); |
121 | 321k | jv_mem_free(p->tokenbuf); |
122 | 321k | jvp_dtoa_context_free(&p->dtoa); |
123 | 321k | } |
124 | | |
125 | 4.58M | static pfunc value(struct jv_parser* p, jv val) { |
126 | 4.58M | if ((p->flags & JV_PARSE_STREAMING)) { |
127 | 0 | if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) { |
128 | 0 | jv_free(val); |
129 | 0 | return "Expected separator between values"; |
130 | 0 | } |
131 | 0 | if (p->stacklen > 0) |
132 | 0 | p->last_seen = JV_LAST_VALUE; |
133 | 0 | else |
134 | 0 | p->last_seen = JV_LAST_NONE; |
135 | 4.58M | } else { |
136 | 4.58M | if (jv_is_valid(p->next)) { |
137 | 54 | jv_free(val); |
138 | 54 | return "Expected separator between values"; |
139 | 54 | } |
140 | 4.58M | } |
141 | 4.58M | jv_free(p->next); |
142 | 4.58M | p->next = val; |
143 | 4.58M | return 0; |
144 | 4.58M | } |
145 | | |
146 | 1.85M | static void push(struct jv_parser* p, jv v) { |
147 | 1.85M | assert(p->stackpos <= p->stacklen); |
148 | 1.85M | if (p->stackpos == p->stacklen) { |
149 | 2.65k | p->stacklen = p->stacklen * 2 + 10; |
150 | 2.65k | p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv)); |
151 | 2.65k | } |
152 | 1.85M | assert(p->stackpos < p->stacklen); |
153 | 1.85M | p->stack[p->stackpos++] = v; |
154 | 1.85M | } |
155 | | |
/*
 * Handle one structural character ('[', '{', ':', ',', ']' or '}') in the
 * non-streaming parser.
 *
 * The stack holds the containers being built; inside an object, the key
 * whose value is being read sits on top of its object. p->next is the
 * value completed most recently and not yet stored anywhere.
 *
 * Returns 0 on success or a static error message. Any other character
 * falls through the switch and returns 0.
 */
static pfunc parse_token(struct jv_parser* p, char ch) {
  switch (ch) {
  case '[':
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
    if (jv_is_valid(p->next)) return "Expected separator between values";
    push(p, jv_array());
    break;

  case '{':
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
    if (jv_is_valid(p->next)) return "Expected separator between values";
    push(p, jv_object());
    break;

  case ':':
    if (!jv_is_valid(p->next))
      return "Expected string key before ':'";
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
      return "':' not as part of an object";
    if (jv_get_kind(p->next) != JV_KIND_STRING)
      return "Object keys must be strings";
    // the pending string becomes the key; it stays on the stack until its value arrives
    push(p, p->next);
    p->next = jv_invalid();
    break;

  case ',':
    if (!jv_is_valid(p->next))
      return "Expected value before ','";
    if (p->stackpos == 0)
      return "',' not as part of an object or array";
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
      // array element: append the pending value
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
      p->next = jv_invalid();
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
      // key on top, its object just below: store the pair and pop the key
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
                                              p->stack[p->stackpos-1], p->next);
      p->stackpos--;
      p->next = jv_invalid();
    } else {
      // this case hits on input like {"a", "b"}
      return "Objects must consist of key:value pairs";
    }
    break;

  case ']':
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
      return "Unmatched ']'";
    if (jv_is_valid(p->next)) {
      // final element
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
      p->next = jv_invalid();
    } else {
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
        // this case hits on input like [1,2,3,]
        return "Expected another array element";
      }
    }
    // the finished array is popped and becomes the pending value
    jv_free(p->next);
    p->next = p->stack[--p->stackpos];
    break;

  case '}':
    if (p->stackpos == 0)
      return "Unmatched '}'";
    if (jv_is_valid(p->next)) {
      // a pending value needs a key on top of the stack to pair with
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
        return "Objects must consist of key:value pairs";
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
                                              p->stack[p->stackpos-1], p->next);
      p->stackpos--;
      p->next = jv_invalid();
    } else {
      // no pending value: only acceptable for an empty object
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
        return "Unmatched '}'";
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
        return "Expected another key-value pair";
    }
    // the finished object is popped and becomes the pending value
    jv_free(p->next);
    p->next = p->stack[--p->stackpos];
    break;
  }
  return 0;
}
240 | | |
/*
 * Handle one structural character in the streaming parser.
 *
 * Instead of building containers, the streamer keeps p->path: one entry per
 * open container, a number (current index) for an array and null or a string
 * (current key) for an object. p->stacklen is the number of path entries.
 * Completed events are left in p->output for stream_check_done() to emit.
 *
 * Returns 0 on success or a static error message. Any other character
 * falls through the switch and returns 0.
 */
static pfunc stream_token(struct jv_parser* p, char ch) {
  jv_kind k;
  jv last;

  switch (ch) {
  case '[':
    if (jv_is_valid(p->next))
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {["foo"]}
      return "Expected string key after '{', not '['";
    if (p->last_seen == JV_LAST_COMMA) {
      // after a comma, a new container is only legal inside an array
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",["foo"]}
        return "Expected string key after ',' in object, not '['";
    }
    p->path = jv_array_append(p->path, jv_number(0)); // push
    p->last_seen = JV_LAST_OPEN_ARRAY;
    p->stacklen++;
    break;

  case '{':
    if (p->last_seen == JV_LAST_VALUE)
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {{"foo":"bar"}}
      return "Expected string key after '{', not '{'";
    if (p->last_seen == JV_LAST_COMMA) {
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",{"foo":"bar"}}
        return "Expected string key after ',' in object, not '{'";
    }
    // Push object key: null, since we don't know it yet
    p->path = jv_array_append(p->path, jv_null()); // push
    p->last_seen = JV_LAST_OPEN_OBJECT;
    p->stacklen++;
    break;

  case ':':
    last = jv_invalid();
    // a numeric last path entry means the innermost container is an array
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
      jv_free(last);
      return "':' not as part of an object";
    }
    jv_free(last);
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
      return "Expected string key before ':'";
    if (jv_get_kind(p->next) != JV_KIND_STRING)
      return "Object keys must be strings";
    if (p->last_seen != JV_LAST_VALUE)
      return "':' should follow a key";
    p->last_seen = JV_LAST_COLON;
    // the pending string replaces the last path entry as the current key
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
    p->next = jv_invalid();
    break;

  case ',':
    if (p->last_seen != JV_LAST_VALUE)
      return "Expected value before ','";
    if (p->stacklen == 0)
      return "',' not as part of an object or array";
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    if (k == JV_KIND_NUMBER) {
      int idx = jv_number_value(last);

      if (jv_is_valid(p->next)) {
        // emit [path, leaf] for the element just finished
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      // advance the array index for the next element
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
      p->last_seen = JV_LAST_COMMA;
    } else if (k == JV_KIND_STRING) {
      if (jv_is_valid(p->next)) {
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
      p->last_seen = JV_LAST_COMMA;
    } else {
      assert(k == JV_KIND_NULL);
      // this case hits on input like {,}
      // make sure to handle input like {"a", "b"} and {"a":, ...}
      jv_free(last);
      return "Objects must consist of key:value pairs";
    }
    jv_free(last);
    break;

  case ']':
    if (p->stacklen == 0)
      return "Unmatched ']' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another array element";
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);

    if (k != JV_KIND_NUMBER)
      return "Unmatched ']' in the middle of an object";
    if (jv_is_valid(p->next)) {
      // three-element output: stream_check_done() splits it into [path, leaf] and then [path]
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
      // closing event only: the last element was itself a container
      p->output = JV_ARRAY(jv_copy(p->path));
    }

    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    //assert(!jv_is_valid(p->next));
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;

  case '}':
    if (p->stacklen == 0)
      return "Unmatched '}' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another key:value pair";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);
    if (k == JV_KIND_NUMBER)
      return "Unmatched '}' in the middle of an array";

    if (jv_is_valid(p->next)) {
      if (k != JV_KIND_STRING)
        return "Objects must consist of key:value pairs";
      // three-element output: stream_check_done() splits it into [path, leaf] and then [path]
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else {
      // Perhaps {"a":[]}
      if (p->last_seen == JV_LAST_COLON)
        // Looks like {"a":}
        return "Missing value in key:value pair";
      // NOTE(review): JV_LAST_COMMA was already rejected near the top of this case,
      // so this check looks unreachable; confirm before removing
      if (p->last_seen == JV_LAST_COMMA)
        // Looks like {"a":0,}
        return "Expected another key-value pair";
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
        return "Unmatched '}' in the middle of an array";
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
        return "Unmatched '}'";
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
        p->output = JV_ARRAY(jv_copy(p->path));
    }
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty objects are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;
  }
  return 0;
}
420 | | |
421 | 111M | static void tokenadd(struct jv_parser* p, char c) { |
422 | 111M | assert(p->tokenpos <= p->tokenlen); |
423 | 111M | if (p->tokenpos >= (p->tokenlen - 1)) { |
424 | 327k | p->tokenlen = p->tokenlen*2 + 256; |
425 | 327k | p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen); |
426 | 327k | } |
427 | 111M | assert(p->tokenpos < p->tokenlen); |
428 | 111M | p->tokenbuf[p->tokenpos++] = c; |
429 | 111M | } |
430 | | |
/*
 * Decode exactly four hexadecimal digits (either case) starting at hex.
 * Returns the value in 0..0xFFFF, or -1 if any of the four characters is
 * not a hex digit. The caller must guarantee four readable bytes.
 */
static int unhex4(char* hex) {
  int value = 0;
  for (const char* cur = hex; cur != hex + 4; cur++) {
    int digit;
    if (*cur >= '0' && *cur <= '9')
      digit = *cur - '0';
    else if (*cur >= 'a' && *cur <= 'f')
      digit = *cur - 'a' + 10;
    else if (*cur >= 'A' && *cur <= 'F')
      digit = *cur - 'A' + 10;
    else
      return -1;
    value = (value << 4) | digit;
  }
  return value;
}
445 | | |
/*
 * Called when the closing quote of a string has been seen. Decodes the raw
 * string body held in p->tokenbuf (escape sequences still present) and
 * passes the result to value().
 *
 * Decoding is done in place: `out` writes into the same buffer `in` reads from.
 * Returns 0 on success or a static error message; p->tokenpos is only
 * reset on success.
 */
static pfunc found_string(struct jv_parser* p) {
  char* in = p->tokenbuf;
  char* out = p->tokenbuf;
  char* end = p->tokenbuf + p->tokenpos;

  while (in < end) {
    char c = *in++;
    if (c == '\\') {
      if (in >= end)
        return "Expected escape character at end of string";
      c = *in++;
      switch (c) {
      case '\\':
      case '"':
      case '/': *out++ = c; break;
      case 'b': *out++ = '\b'; break;
      case 'f': *out++ = '\f'; break;
      case 't': *out++ = '\t'; break;
      case 'n': *out++ = '\n'; break;
      case 'r': *out++ = '\r'; break;

      case 'u':
        /* ahh, the complicated case */
        if (in + 4 > end)
          return "Invalid \\uXXXX escape";
        int hexvalue = unhex4(in);
        if (hexvalue < 0)
          return "Invalid characters in \\uXXXX escape";
        unsigned long codepoint = (unsigned long)hexvalue;
        in += 4;
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
          /* who thought UTF-16 surrogate pairs were a good idea? */
          // a high surrogate must be followed directly by a \uXXXX low surrogate
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
          // unhex4's -1 becomes a huge unsigned value here and fails the range check below
          unsigned long surrogate = unhex4(in+2);
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
          in += 6;
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
                                 |(surrogate - 0xDC00));
        }
        if (codepoint > 0x10FFFF)
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
        out += jvp_utf8_encode(codepoint, out);
        break;

      default:
        return "Invalid escape";
      }
    } else {
      // c >= 0 keeps bytes that are negative as a signed char (high bit set) out of this check
      if (c >= 0 && c <= 0x001f)
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
      *out++ = c;
    }
  }
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
  p->tokenpos = 0;
  return 0;
}
505 | | |
/*
 * Convert the literal accumulated in p->tokenbuf (true, false, null or a
 * number) into a jv and pass it to value(). Does nothing when the token
 * buffer is empty.
 *
 * Returns 0 on success or a static error message; p->tokenpos is only
 * reset on success.
 */
static pfunc check_literal(struct jv_parser* p) {
  if (p->tokenpos == 0) return 0;

  const char* pattern = 0;
  int plen;
  jv v;
  // the first character selects the keyword to compare against; anything else is tried as a number
  switch (p->tokenbuf[0]) {
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
  case 'n':
    // if it starts with 'n', it could be a literal "nan"
    if (p->tokenpos != 3) {
      pattern = "null"; plen = 4; v = jv_null();
    }
  }
  if (pattern) {
    if (p->tokenpos != plen) return "Invalid literal";
    for (int i=0; i<plen; i++)
      if (p->tokenbuf[i] != pattern[i])
        return "Invalid literal";
    TRY(value(p, v));
  } else {
    // FIXME: better parser
    // NUL-terminate the token for the number parsers below
    p->tokenbuf[p->tokenpos] = 0;
#ifdef USE_DECNUM
    jv number = jv_number_with_literal(p->tokenbuf);
    if (jv_get_kind(number) == JV_KIND_INVALID) {
      return "Invalid numeric literal";
    }
    TRY(value(p, number));
#else
    char *end = 0;
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
    // the whole token must have been consumed
    if (end == 0 || *end != 0) {
      return "Invalid numeric literal";
    }
    TRY(value(p, jv_number(d)));
#endif
  }
  p->tokenpos = 0;
  return 0;
}
548 | | |
/* Character classes the scanner distinguishes while in the NORMAL state. */
typedef enum {
  LITERAL,
  WHITESPACE,
  STRUCTURE,
  QUOTE,
  INVALID
} chclass;

/*
 * Classify one input character. Everything that is not whitespace, a double
 * quote or a structural character counts as LITERAL; INVALID is never returned.
 */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;
  if (c == '"')
    return QUOTE;
  if (c == '[' || c == ']' || c == '{' || c == '}' || c == ':' || c == ',')
    return STRUCTURE;
  return LITERAL;
}
577 | | |
578 | | |
/* Marker returned by scan() when a value was produced; jv_parser_next() compares against its address. */
static const presult OK = "output produced";
580 | | |
581 | 113M | static int parse_check_done(struct jv_parser* p, jv* out) { |
582 | 113M | if (p->stackpos == 0 && jv_is_valid(p->next)) { |
583 | 3.63M | *out = p->next; |
584 | 3.63M | p->next = jv_invalid(); |
585 | 3.63M | return 1; |
586 | 110M | } else { |
587 | 110M | return 0; |
588 | 110M | } |
589 | 113M | } |
590 | | |
591 | 0 | static int stream_check_done(struct jv_parser* p, jv* out) { |
592 | 0 | if (p->stacklen == 0 && jv_is_valid(p->next)) { |
593 | 0 | *out = JV_ARRAY(jv_copy(p->path),p->next); |
594 | 0 | p->next = jv_invalid(); |
595 | 0 | return 1; |
596 | 0 | } else if (jv_is_valid(p->output)) { |
597 | 0 | if (jv_array_length(jv_copy(p->output)) > 2) { |
598 | | // At end of an array or object, necessitating one more output by |
599 | | // which to indicate this |
600 | 0 | *out = jv_array_slice(jv_copy(p->output), 0, 2); |
601 | 0 | p->output = jv_array_slice(p->output, 0, 1); // arrange one more output |
602 | 0 | } else { |
603 | | // No further processing needed |
604 | 0 | *out = p->output; |
605 | 0 | p->output = jv_invalid(); |
606 | 0 | } |
607 | 0 | return 1; |
608 | 0 | } else { |
609 | 0 | return 0; |
610 | 0 | } |
611 | 0 | } |
612 | | |
613 | 0 | static int seq_check_truncation(struct jv_parser* p) { |
614 | 0 | return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER)); |
615 | 0 | } |
616 | | |
617 | 0 | static int stream_seq_check_truncation(struct jv_parser* p) { |
618 | 0 | jv_kind k = jv_get_kind(p->next); |
619 | 0 | return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL); |
620 | 0 | } |
621 | | |
622 | 0 | static int parse_is_top_num(struct jv_parser* p) { |
623 | 0 | return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
624 | 0 | } |
625 | | |
626 | 0 | static int stream_is_top_num(struct jv_parser* p) { |
627 | 0 | return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
628 | 0 | } |
629 | | |
/* Dispatch helpers: each picks the streaming or the non-streaming
 * implementation depending on JV_PARSE_STREAMING in (p)->flags.
 * Note that (p) is evaluated twice. */
#define check_done(p, o) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))

#define token(p, ch) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))

#define check_truncation(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))

#define is_top_num(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
641 | | |
/*
 * Feed one input character to the parser.
 *
 * Returns 0 when more input is needed, OK when a value (or streaming event)
 * has been stored in *out, or a static error message.
 */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  // track the position used in error messages
  p->column++;
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  }
  if ((p->flags & JV_PARSE_SEQ)
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
    // an RS while a value is still in progress means the previous text was cut short
    if (check_truncation(p)) {
      if (check_literal(p) == 0 && is_top_num(p))
        return "Potentially truncated top-level numeric value";
      return "Truncated value";
    }
    TRY(check_literal(p));
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
      return OK;
    // shouldn't happen?
    assert(!jv_is_valid(*out));
    parser_reset(p);
    jv_free(*out);
    *out = jv_invalid();
    return OK;
  }
  presult answer = 0;
  p->last_ch_was_ws = 0;
  if (p->st == JV_PARSER_NORMAL) {
    chclass cls = classify(ch);
    if (cls == WHITESPACE)
      p->last_ch_was_ws = 1;
    if (cls != LITERAL) {
      // any non-literal character ends the literal collected so far
      TRY(check_literal(p));
      if (check_done(p, out)) answer = OK;
    }
    switch (cls) {
    case LITERAL:
      tokenadd(p, ch);
      break;
    case WHITESPACE:
      break;
    case QUOTE:
      p->st = JV_PARSER_STRING;
      break;
    case STRUCTURE:
      TRY(token(p, ch));
      break;
    case INVALID:
      return "Invalid character";
    }
    if (check_done(p, out)) answer = OK;
  } else {
    // inside a string (possibly right after a backslash)
    if (ch == '"' && p->st == JV_PARSER_STRING) {
      // unescaped closing quote
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      if (check_done(p, out)) answer = OK;
    } else {
      // keep the raw byte; escapes are decoded later by found_string()
      tokenadd(p, ch);
      if (ch == '\\' && p->st == JV_PARSER_STRING) {
        p->st = JV_PARSER_STRING_ESCAPE;
      } else {
        // the character following a backslash is taken as-is and ends the escape state
        p->st = JV_PARSER_STRING;
      }
    }
  }
  return answer;
}
707 | | |
708 | 3.14k | struct jv_parser* jv_parser_new(int flags) { |
709 | 3.14k | struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser)); |
710 | 3.14k | parser_init(p, flags); |
711 | 3.14k | p->flags = flags; |
712 | 3.14k | return p; |
713 | 3.14k | } |
714 | | |
/* Release a parser obtained from jv_parser_new(): first everything it owns, then the struct itself. */
void jv_parser_free(struct jv_parser* p) {
  parser_free(p);
  jv_mem_free(p);
}
719 | | |
/* UTF-8 encoding of the byte order mark; jv_parser_set_buf() skips it at the start of the input. */
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
721 | | |
722 | 0 | int jv_parser_remaining(struct jv_parser* p) { |
723 | 0 | if (p->curr_buf == 0) |
724 | 0 | return 0; |
725 | 0 | return (p->curr_buf_length - p->curr_buf_pos); |
726 | 0 | } |
727 | | |
728 | 333k | void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) { |
729 | 333k | assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length) |
730 | 333k | && "previous buffer not exhausted"); |
731 | 656k | while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) { |
732 | 322k | if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) { |
733 | | // matched a BOM character |
734 | 72 | buf++; |
735 | 72 | length--; |
736 | 72 | p->bom_strip_position++; |
737 | 321k | } else { |
738 | 321k | if (p->bom_strip_position == 0) { |
739 | | // no BOM in this document |
740 | 321k | p->bom_strip_position = sizeof(UTF8_BOM); |
741 | 321k | } else { |
742 | | // malformed BOM (prefix present, rest missing) |
743 | 41 | p->bom_strip_position = 0xff; |
744 | 41 | } |
745 | 321k | } |
746 | 322k | } |
747 | 333k | p->curr_buf = buf; |
748 | 333k | p->curr_buf_length = length; |
749 | 333k | p->curr_buf_pos = 0; |
750 | 333k | p->curr_buf_is_partial = is_partial; |
751 | 333k | } |
752 | | |
753 | | static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3); |
754 | | |
755 | 59.6k | static jv make_error(struct jv_parser* p, const char *fmt, ...) { |
756 | 59.6k | va_list ap; |
757 | 59.6k | va_start(ap, fmt); |
758 | 59.6k | jv e = jv_string_vfmt(fmt, ap); |
759 | 59.6k | va_end(ap); |
760 | 59.6k | if ((p->flags & JV_PARSE_STREAM_ERRORS)) |
761 | 0 | return JV_ARRAY(e, jv_copy(p->path)); |
762 | 59.6k | return jv_invalid_with_msg(e); |
763 | 59.6k | } |
764 | | |
/*
 * Produce the next value (or streaming event) from the current buffer.
 *
 * Returns:
 *  - a valid jv when a value was parsed (with JV_PARSE_STREAM_ERRORS, errors
 *    are also returned as valid [message, path] arrays, see make_error());
 *  - an invalid jv with a message on a parse error;
 *  - an invalid jv without a message when more input is needed (partial
 *    buffer exhausted, or no buffer set) or when the input has ended.
 */
jv jv_parser_next(struct jv_parser* p) {
  if (p->eof)
    return jv_invalid();
  if (!p->curr_buf)
    return jv_invalid(); // Need a buffer
  if (p->bom_strip_position == 0xff) {
    if (!(p->flags & JV_PARSE_SEQ))
      return jv_invalid_with_msg(jv_string("Malformed BOM"));
    // NOTE(review): parser_reset() sets p->st back to JV_PARSER_NORMAL, so the
    // assignment below is overwritten immediately. The other error paths in this
    // function reset first and assign afterwards; confirm the intended order.
    p->st =JV_PARSER_WAITING_FOR_RS;
    parser_reset(p);
  }
  jv value = jv_invalid();
  // the streamer may still have an event queued from the previous call
  if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value))
    return value;
  char ch;
  presult msg = 0;
  // consume input until scan() reports something (OK or an error) or the buffer runs out
  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
    ch = p->curr_buf[p->curr_buf_pos++];
    if (p->st == JV_PARSER_WAITING_FOR_RS) {
      if (ch == '\n') {
        p->line++;
        p->column = 0;
      } else {
        p->column++;
      }
      if (ch == '\036')
        p->st = JV_PARSER_NORMAL;
      continue; // need to resync, wait for RS
    }
    msg = scan(p, ch, &value);
  }
  if (msg == OK) {
    return value;
  } else if (msg) {
    // parse error; msg is only ever set inside the loop, so ch has been assigned
    jv_free(value);
    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
      // Skip to the next RS
      p->st = JV_PARSER_WAITING_FOR_RS;
      value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column);
      parser_reset(p);
      return value;
    }
    value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column);
    parser_reset(p);
    if (!(p->flags & JV_PARSE_SEQ)) {
      // We're not parsing a JSON text sequence; throw this buffer away.
      // XXX We should fail permanently here.
      p->curr_buf = 0;
      p->curr_buf_pos = 0;
    } // Else ch must be RS; don't clear buf so we can start parsing again after this ch
    return value;
  } else if (p->curr_buf_is_partial) {
    assert(p->curr_buf_pos == p->curr_buf_length);
    // need another buffer
    return jv_invalid();
  } else {
    // at EOF
    p->eof = 1;
    assert(p->curr_buf_pos == p->curr_buf_length);
    jv_free(value);
    if (p->st == JV_PARSER_WAITING_FOR_RS)
      return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column);
    if (p->st != JV_PARSER_NORMAL) {
      // still inside a string
      value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column);
      parser_reset(p);
      p->st = JV_PARSER_WAITING_FOR_RS;
      return value;
    }
    // a literal may still be sitting in the token buffer; finish it now
    if ((msg = check_literal(p))) {
      value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column);
      parser_reset(p);
      p->st = JV_PARSER_WAITING_FOR_RS;
      return value;
    }
    // an array or object is still open
    if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) ||
        (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) {
      value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column);
      parser_reset(p);
      p->st = JV_PARSER_WAITING_FOR_RS;
      return value;
    }
    // p->next is either invalid (nothing here, but no syntax error)
    // or valid (this is the value). either way it's the thing to return
    if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) {
      value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value]
    } else {
      value = p->next;
    }
    p->next = jv_invalid();
    // in a text sequence, a top-level number not followed by whitespace may have been cut short
    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
      jv_free(value);
      return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column);
    }
    return value;
  }
}
861 | | |
862 | 318k | jv jv_parse_sized(const char* string, int length) { |
863 | 318k | struct jv_parser parser; |
864 | 318k | parser_init(&parser, 0); |
865 | 318k | jv_parser_set_buf(&parser, string, length, 0); |
866 | 318k | jv value = jv_parser_next(&parser); |
867 | 318k | if (jv_is_valid(value)) { |
868 | 261k | jv next = jv_parser_next(&parser); |
869 | 261k | if (jv_is_valid(next)) { |
870 | | // multiple JSON values, we only wanted one |
871 | 27 | jv_free(value); |
872 | 27 | jv_free(next); |
873 | 27 | value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); |
874 | 261k | } else if (jv_invalid_has_msg(jv_copy(next))) { |
875 | | // parser error after the first JSON value |
876 | 556 | jv_free(value); |
877 | 556 | value = next; |
878 | 260k | } else { |
879 | | // a single valid JSON value |
880 | 260k | jv_free(next); |
881 | 260k | } |
882 | 261k | } else if (jv_invalid_has_msg(jv_copy(value))) { |
883 | | // parse error, we'll return it |
884 | 57.6k | } else { |
885 | | // no value at all |
886 | 38 | jv_free(value); |
887 | 38 | value = jv_invalid_with_msg(jv_string("Expected JSON value")); |
888 | 38 | } |
889 | 318k | parser_free(&parser); |
890 | | |
891 | 318k | if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) { |
892 | 58.2k | jv msg = jv_invalid_get_msg(value); |
893 | 58.2k | value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')", |
894 | 58.2k | jv_string_value(msg), |
895 | 58.2k | string)); |
896 | 58.2k | jv_free(msg); |
897 | 58.2k | } |
898 | 318k | return value; |
899 | 318k | } |
900 | | |
/* Parse one JSON value from a NUL-terminated string; see jv_parse_sized() for the result. */
jv jv_parse(const char* string) {
  return jv_parse_sized(string, strlen(string));
}