Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | #include <string.h> |
4 | | #include <assert.h> |
5 | | #include "jv.h" |
6 | | #include "jv_dtoa.h" |
7 | | #include "jv_unicode.h" |
8 | | #include "jv_alloc.h" |
9 | | #include "jv_dtoa.h" |
10 | | |
/*
 * Result of one parser step: NULL means "no error, keep going"; any other
 * value is a message string.  Every message returned by the functions in
 * view is a string literal, so results are never freed.
 */
typedef const char* presult;

/* Nesting limit enforced by parse_token(); may be overridden at build time. */
#ifndef MAX_PARSING_DEPTH
#define MAX_PARSING_DEPTH (256)
#endif

/* Run a parser step and return its result from the caller if it is non-NULL. */
#define TRY(x)             \
  do {                     \
    presult msg__ = (x);   \
    if (msg__)             \
      return msg__;        \
  } while(0)

/*
 * Return type for parser steps.  With GCC-compatible compilers the result
 * must be consumed, so a dropped error message produces a warning.
 */
#ifdef __GNUC__
#define pfunc __attribute__((warn_unused_result)) presult
#else
#define pfunc presult
#endif
23 | | |
/*
 * The token most recently handled by the streaming parser.  The structural
 * members reuse the ASCII code of the character they stand for.
 */
enum last_seen {
  JV_LAST_NONE        = 0,    /* nothing pending (initial state / after reset) */
  JV_LAST_OPEN_ARRAY  = '[',  /* an array was just opened */
  JV_LAST_OPEN_OBJECT = '{',  /* an object was just opened */
  JV_LAST_COLON       = ':',  /* a key and its ':' were just consumed */
  JV_LAST_COMMA       = ',',  /* an element separator was just consumed */
  JV_LAST_VALUE       = 'V',  /* a value inside a container was just completed */
};
32 | | |
33 | | struct jv_parser { |
34 | | const char* curr_buf; |
35 | | int curr_buf_length; |
36 | | int curr_buf_pos; |
37 | | int curr_buf_is_partial; |
38 | | int eof; |
39 | | unsigned bom_strip_position; |
40 | | |
41 | | int flags; |
42 | | |
43 | | jv* stack; // parser |
44 | | int stackpos; // parser |
45 | | int stacklen; // both (optimization; it's really pathlen for streaming) |
46 | | jv path; // streamer |
47 | | enum last_seen last_seen; // streamer |
48 | | jv output; // streamer |
49 | | jv next; // both |
50 | | |
51 | | char* tokenbuf; |
52 | | int tokenpos; |
53 | | int tokenlen; |
54 | | |
55 | | int line, column; |
56 | | |
57 | | struct dtoa_context dtoa; |
58 | | |
59 | | enum { |
60 | | JV_PARSER_NORMAL, |
61 | | JV_PARSER_STRING, |
62 | | JV_PARSER_STRING_ESCAPE, |
63 | | JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS |
64 | | } st; |
65 | | unsigned int last_ch_was_ws:1; |
66 | | }; |
67 | | |
68 | | |
69 | 1.75M | static void parser_init(struct jv_parser* p, int flags) { |
70 | 1.75M | p->flags = flags; |
71 | 1.75M | if ((p->flags & JV_PARSE_STREAMING)) { |
72 | 3.73k | p->path = jv_array(); |
73 | 1.75M | } else { |
74 | 1.75M | p->path = jv_invalid(); |
75 | 1.75M | p->flags &= ~(JV_PARSE_STREAM_ERRORS); |
76 | 1.75M | } |
77 | 1.75M | p->stack = 0; |
78 | 1.75M | p->stacklen = p->stackpos = 0; |
79 | 1.75M | p->last_seen = JV_LAST_NONE; |
80 | 1.75M | p->output = jv_invalid(); |
81 | 1.75M | p->next = jv_invalid(); |
82 | 1.75M | p->tokenbuf = 0; |
83 | 1.75M | p->tokenlen = p->tokenpos = 0; |
84 | 1.75M | if ((p->flags & JV_PARSE_SEQ)) |
85 | 293 | p->st = JV_PARSER_WAITING_FOR_RS; |
86 | 1.75M | else |
87 | 1.75M | p->st = JV_PARSER_NORMAL; |
88 | 1.75M | p->eof = 0; |
89 | 1.75M | p->curr_buf = 0; |
90 | 1.75M | p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0; |
91 | 1.75M | p->bom_strip_position = 0; |
92 | 1.75M | p->last_ch_was_ws = 0; |
93 | 1.75M | p->line = 1; |
94 | 1.75M | p->column = 0; |
95 | 1.75M | jvp_dtoa_context_init(&p->dtoa); |
96 | 1.75M | } |
97 | | |
98 | 1.86M | static void parser_reset(struct jv_parser* p) { |
99 | 1.86M | if ((p->flags & JV_PARSE_STREAMING)) { |
100 | 5.79k | jv_free(p->path); |
101 | 5.79k | p->path = jv_array(); |
102 | 5.79k | p->stacklen = 0; |
103 | 5.79k | } |
104 | 1.86M | p->last_seen = JV_LAST_NONE; |
105 | 1.86M | jv_free(p->output); |
106 | 1.86M | p->output = jv_invalid(); |
107 | 1.86M | jv_free(p->next); |
108 | 1.86M | p->next = jv_invalid(); |
109 | 1.92M | for (int i=0; i<p->stackpos; i++) |
110 | 55.1k | jv_free(p->stack[i]); |
111 | 1.86M | p->stackpos = 0; |
112 | 1.86M | p->tokenpos = 0; |
113 | 1.86M | p->st = JV_PARSER_NORMAL; |
114 | 1.86M | } |
115 | | |
116 | 1.75M | static void parser_free(struct jv_parser* p) { |
117 | 1.75M | parser_reset(p); |
118 | 1.75M | jv_free(p->path); |
119 | 1.75M | jv_free(p->output); |
120 | 1.75M | jv_mem_free(p->stack); |
121 | 1.75M | jv_mem_free(p->tokenbuf); |
122 | 1.75M | jvp_dtoa_context_free(&p->dtoa); |
123 | 1.75M | } |
124 | | |
125 | 10.2M | static pfunc value(struct jv_parser* p, jv val) { |
126 | 10.2M | if ((p->flags & JV_PARSE_STREAMING)) { |
127 | 19.0k | if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) { |
128 | 109 | jv_free(val); |
129 | 109 | return "Expected separator between values"; |
130 | 109 | } |
131 | 18.8k | if (p->stacklen > 0) |
132 | 16.8k | p->last_seen = JV_LAST_VALUE; |
133 | 2.00k | else |
134 | 2.00k | p->last_seen = JV_LAST_NONE; |
135 | 10.2M | } else { |
136 | 10.2M | if (jv_is_valid(p->next)) { |
137 | 105 | jv_free(val); |
138 | 105 | return "Expected separator between values"; |
139 | 105 | } |
140 | 10.2M | } |
141 | 10.2M | jv_free(p->next); |
142 | 10.2M | p->next = val; |
143 | 10.2M | return 0; |
144 | 10.2M | } |
145 | | |
146 | 1.96M | static void push(struct jv_parser* p, jv v) { |
147 | 1.96M | assert(p->stackpos <= p->stacklen); |
148 | 1.96M | if (p->stackpos == p->stacklen) { |
149 | 14.3k | p->stacklen = p->stacklen * 2 + 10; |
150 | 14.3k | p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv)); |
151 | 14.3k | } |
152 | 1.96M | assert(p->stackpos < p->stacklen); |
153 | 1.96M | p->stack[p->stackpos++] = v; |
154 | 1.96M | } |
155 | | |
156 | 10.5M | static pfunc parse_token(struct jv_parser* p, char ch) { |
157 | 10.5M | switch (ch) { |
158 | 56.3k | case '[': |
159 | 56.3k | if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing"; |
160 | 56.3k | if (jv_is_valid(p->next)) return "Expected separator between values"; |
161 | 56.0k | push(p, jv_array()); |
162 | 56.0k | break; |
163 | | |
164 | 1.86M | case '{': |
165 | 1.86M | if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing"; |
166 | 1.86M | if (jv_is_valid(p->next)) return "Expected separator between values"; |
167 | 1.86M | push(p, jv_object()); |
168 | 1.86M | break; |
169 | | |
170 | 44.5k | case ':': |
171 | 44.5k | if (!jv_is_valid(p->next)) |
172 | 427 | return "Expected string key before ':'"; |
173 | 44.1k | if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT) |
174 | 1.01k | return "':' not as part of an object"; |
175 | 43.1k | if (jv_get_kind(p->next) != JV_KIND_STRING) |
176 | 31 | return "Object keys must be strings"; |
177 | 43.0k | push(p, p->next); |
178 | 43.0k | p->next = jv_invalid(); |
179 | 43.0k | break; |
180 | | |
181 | 6.70M | case ',': |
182 | 6.70M | if (!jv_is_valid(p->next)) |
183 | 8.04k | return "Expected value before ','"; |
184 | 6.69M | if (p->stackpos == 0) |
185 | 0 | return "',' not as part of an object or array"; |
186 | 6.69M | if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) { |
187 | 6.65M | p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next); |
188 | 6.65M | p->next = jv_invalid(); |
189 | 6.65M | } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) { |
190 | 35.8k | assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT); |
191 | 35.8k | p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], |
192 | 35.8k | p->stack[p->stackpos-1], p->next); |
193 | 35.8k | p->stackpos--; |
194 | 35.8k | p->next = jv_invalid(); |
195 | 35.8k | } else { |
196 | | // this case hits on input like {"a", "b"} |
197 | 53 | return "Objects must consist of key:value pairs"; |
198 | 53 | } |
199 | 6.69M | break; |
200 | | |
201 | 6.69M | case ']': |
202 | 28.8k | if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY) |
203 | 420 | return "Unmatched ']'"; |
204 | 28.3k | if (jv_is_valid(p->next)) { |
205 | 20.7k | p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next); |
206 | 20.7k | p->next = jv_invalid(); |
207 | 20.7k | } else { |
208 | 7.58k | if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) { |
209 | | // this case hits on input like [1,2,3,] |
210 | 62 | return "Expected another array element"; |
211 | 62 | } |
212 | 7.58k | } |
213 | 28.3k | jv_free(p->next); |
214 | 28.3k | p->next = p->stack[--p->stackpos]; |
215 | 28.3k | break; |
216 | | |
217 | 1.83M | case '}': |
218 | 1.83M | if (p->stackpos == 0) |
219 | 231 | return "Unmatched '}'"; |
220 | 1.83M | if (jv_is_valid(p->next)) { |
221 | 5.35k | if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING) |
222 | 29 | return "Objects must consist of key:value pairs"; |
223 | 5.32k | assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT); |
224 | 5.32k | p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], |
225 | 5.32k | p->stack[p->stackpos-1], p->next); |
226 | 5.32k | p->stackpos--; |
227 | 5.32k | p->next = jv_invalid(); |
228 | 1.83M | } else { |
229 | 1.83M | if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT) |
230 | 44 | return "Unmatched '}'"; |
231 | 1.83M | if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0) |
232 | 29 | return "Expected another key-value pair"; |
233 | 1.83M | } |
234 | 1.83M | jv_free(p->next); |
235 | 1.83M | p->next = p->stack[--p->stackpos]; |
236 | 1.83M | break; |
237 | 10.5M | } |
238 | 10.5M | return 0; |
239 | 10.5M | } |
240 | | |
241 | 30.2M | static pfunc stream_token(struct jv_parser* p, char ch) { |
242 | 30.2M | jv_kind k; |
243 | 30.2M | jv last; |
244 | | |
245 | 30.2M | switch (ch) { |
246 | 30.2M | case '[': |
247 | 30.2M | if (jv_is_valid(p->next)) |
248 | 58 | return "Expected a separator between values"; |
249 | 30.2M | if (p->last_seen == JV_LAST_OPEN_OBJECT) |
250 | | // Looks like {["foo"]} |
251 | 8 | return "Expected string key after '{', not '['"; |
252 | 30.2M | if (p->last_seen == JV_LAST_COMMA) { |
253 | 36 | last = jv_array_get(jv_copy(p->path), p->stacklen - 1); |
254 | 36 | k = jv_get_kind(last); |
255 | 36 | jv_free(last); |
256 | 36 | if (k != JV_KIND_NUMBER) |
257 | | // Looks like {"x":"y",["foo"]} |
258 | 2 | return "Expected string key after ',' in object, not '['"; |
259 | 36 | } |
260 | 30.2M | p->path = jv_array_append(p->path, jv_number(0)); // push |
261 | 30.2M | p->last_seen = JV_LAST_OPEN_ARRAY; |
262 | 30.2M | p->stacklen++; |
263 | 30.2M | break; |
264 | | |
265 | 10.5k | case '{': |
266 | 10.5k | if (p->last_seen == JV_LAST_VALUE) |
267 | 5 | return "Expected a separator between values"; |
268 | 10.5k | if (p->last_seen == JV_LAST_OPEN_OBJECT) |
269 | | // Looks like {{"foo":"bar"}} |
270 | 11 | return "Expected string key after '{', not '{'"; |
271 | 10.5k | if (p->last_seen == JV_LAST_COMMA) { |
272 | 14 | last = jv_array_get(jv_copy(p->path), p->stacklen - 1); |
273 | 14 | k = jv_get_kind(last); |
274 | 14 | jv_free(last); |
275 | 14 | if (k != JV_KIND_NUMBER) |
276 | | // Looks like {"x":"y",{"foo":"bar"}} |
277 | 2 | return "Expected string key after ',' in object, not '{'"; |
278 | 14 | } |
279 | | // Push object key: null, since we don't know it yet |
280 | 10.5k | p->path = jv_array_append(p->path, jv_null()); // push |
281 | 10.5k | p->last_seen = JV_LAST_OPEN_OBJECT; |
282 | 10.5k | p->stacklen++; |
283 | 10.5k | break; |
284 | | |
285 | 16.4k | case ':': |
286 | 16.4k | last = jv_invalid(); |
287 | 16.4k | if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) { |
288 | 38 | jv_free(last); |
289 | 38 | return "':' not as part of an object"; |
290 | 38 | } |
291 | 16.3k | jv_free(last); |
292 | 16.3k | if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE) |
293 | 26 | return "Expected string key before ':'"; |
294 | 16.3k | if (jv_get_kind(p->next) != JV_KIND_STRING) |
295 | 7 | return "Object keys must be strings"; |
296 | 16.3k | if (p->last_seen != JV_LAST_VALUE) |
297 | 0 | return "':' should follow a key"; |
298 | 16.3k | p->last_seen = JV_LAST_COLON; |
299 | 16.3k | p->path = jv_array_set(p->path, p->stacklen - 1, p->next); |
300 | 16.3k | p->next = jv_invalid(); |
301 | 16.3k | break; |
302 | | |
303 | 261 | case ',': |
304 | 261 | if (p->last_seen != JV_LAST_VALUE) |
305 | 14 | return "Expected value before ','"; |
306 | 247 | if (p->stacklen == 0) |
307 | 0 | return "',' not as part of an object or array"; |
308 | 247 | last = jv_array_get(jv_copy(p->path), p->stacklen - 1); |
309 | 247 | k = jv_get_kind(last); |
310 | 247 | if (k == JV_KIND_NUMBER) { |
311 | 180 | int idx = jv_number_value(last); |
312 | | |
313 | 180 | if (jv_is_valid(p->next)) { |
314 | 169 | p->output = JV_ARRAY(jv_copy(p->path), p->next); |
315 | 169 | p->next = jv_invalid(); |
316 | 169 | } |
317 | 180 | p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1)); |
318 | 180 | p->last_seen = JV_LAST_COMMA; |
319 | 180 | } else if (k == JV_KIND_STRING) { |
320 | 62 | if (jv_is_valid(p->next)) { |
321 | 33 | p->output = JV_ARRAY(jv_copy(p->path), p->next); |
322 | 33 | p->next = jv_invalid(); |
323 | 33 | } |
324 | 62 | p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair |
325 | 62 | p->last_seen = JV_LAST_COMMA; |
326 | 62 | } else { |
327 | 5 | assert(k == JV_KIND_NULL); |
328 | | // this case hits on input like {,} |
329 | | // make sure to handle input like {"a", "b"} and {"a":, ...} |
330 | 5 | jv_free(last); |
331 | 5 | return "Objects must consist of key:value pairs"; |
332 | 5 | } |
333 | 242 | jv_free(last); |
334 | 242 | break; |
335 | | |
336 | 287 | case ']': |
337 | 287 | if (p->stacklen == 0) |
338 | 9 | return "Unmatched ']' at the top-level"; |
339 | 278 | if (p->last_seen == JV_LAST_COMMA) |
340 | 2 | return "Expected another array element"; |
341 | 276 | if (p->last_seen == JV_LAST_OPEN_ARRAY) |
342 | 246 | assert(!jv_is_valid(p->next)); |
343 | | |
344 | 276 | last = jv_array_get(jv_copy(p->path), p->stacklen - 1); |
345 | 276 | k = jv_get_kind(last); |
346 | 276 | jv_free(last); |
347 | | |
348 | 276 | if (k != JV_KIND_NUMBER) |
349 | 8 | return "Unmatched ']' in the middle of an object"; |
350 | 268 | if (jv_is_valid(p->next)) { |
351 | 18 | p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true()); |
352 | 18 | p->next = jv_invalid(); |
353 | 250 | } else if (p->last_seen != JV_LAST_OPEN_ARRAY) { |
354 | 4 | p->output = JV_ARRAY(jv_copy(p->path)); |
355 | 4 | } |
356 | | |
357 | 268 | p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop |
358 | | //assert(!jv_is_valid(p->next)); |
359 | 268 | jv_free(p->next); |
360 | 268 | p->next = jv_invalid(); |
361 | | |
362 | 268 | if (p->last_seen == JV_LAST_OPEN_ARRAY) |
363 | 246 | p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves |
364 | | |
365 | 268 | if (p->stacklen == 0) |
366 | 26 | p->last_seen = JV_LAST_NONE; |
367 | 242 | else |
368 | 242 | p->last_seen = JV_LAST_VALUE; |
369 | 268 | break; |
370 | | |
371 | 196 | case '}': |
372 | 196 | if (p->stacklen == 0) |
373 | 9 | return "Unmatched '}' at the top-level"; |
374 | 187 | if (p->last_seen == JV_LAST_COMMA) |
375 | 2 | return "Expected another key:value pair"; |
376 | 185 | if (p->last_seen == JV_LAST_OPEN_OBJECT) |
377 | 142 | assert(!jv_is_valid(p->next)); |
378 | | |
379 | 185 | last = jv_array_get(jv_copy(p->path), p->stacklen - 1); |
380 | 185 | k = jv_get_kind(last); |
381 | 185 | jv_free(last); |
382 | 185 | if (k == JV_KIND_NUMBER) |
383 | 23 | return "Unmatched '}' in the middle of an array"; |
384 | | |
385 | 162 | if (jv_is_valid(p->next)) { |
386 | 12 | if (k != JV_KIND_STRING) |
387 | 6 | return "Objects must consist of key:value pairs"; |
388 | 6 | p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true()); |
389 | 6 | p->next = jv_invalid(); |
390 | 150 | } else { |
391 | | // Perhaps {"a":[]} |
392 | 150 | if (p->last_seen == JV_LAST_COLON) |
393 | | // Looks like {"a":} |
394 | 6 | return "Missing value in key:value pair"; |
395 | 144 | if (p->last_seen == JV_LAST_COMMA) |
396 | | // Looks like {"a":0,} |
397 | 0 | return "Expected another key-value pair"; |
398 | 144 | if (p->last_seen == JV_LAST_OPEN_ARRAY) |
399 | 0 | return "Unmatched '}' in the middle of an array"; |
400 | 144 | if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT) |
401 | 0 | return "Unmatched '}'"; |
402 | 144 | if (p->last_seen != JV_LAST_OPEN_OBJECT) |
403 | 2 | p->output = JV_ARRAY(jv_copy(p->path)); |
404 | 144 | } |
405 | 150 | p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop |
406 | 150 | jv_free(p->next); |
407 | 150 | p->next = jv_invalid(); |
408 | | |
409 | 150 | if (p->last_seen == JV_LAST_OPEN_OBJECT) |
410 | 142 | p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty arrays are leaves |
411 | | |
412 | 150 | if (p->stacklen == 0) |
413 | 39 | p->last_seen = JV_LAST_NONE; |
414 | 111 | else |
415 | 111 | p->last_seen = JV_LAST_VALUE; |
416 | 150 | break; |
417 | 30.2M | } |
418 | 30.2M | return 0; |
419 | 30.2M | } |
420 | | |
421 | 350M | static void tokenadd(struct jv_parser* p, char c) { |
422 | 350M | assert(p->tokenpos <= p->tokenlen); |
423 | 350M | if (p->tokenpos >= (p->tokenlen - 1)) { |
424 | 1.76M | p->tokenlen = p->tokenlen*2 + 256; |
425 | 1.76M | p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen); |
426 | 1.76M | } |
427 | 350M | assert(p->tokenpos < p->tokenlen); |
428 | 350M | p->tokenbuf[p->tokenpos++] = c; |
429 | 350M | } |
430 | | |
/*
 * Decode exactly four hexadecimal digits (either case) starting at hex.
 *
 * Returns the value 0..0xFFFF, or -1 at the first non-hex character.
 * Characters after an offending one are not read.
 */
static int unhex4(char* hex) {
  int value = 0;
  for (const char* cur = hex; cur != hex + 4; cur++) {
    const char c = *cur;
    int digit;
    if (c >= '0' && c <= '9')
      digit = c - '0';
    else if (c >= 'a' && c <= 'f')
      digit = 10 + (c - 'a');
    else if (c >= 'A' && c <= 'F')
      digit = 10 + (c - 'A');
    else
      return -1;
    value = (value << 4) | digit;
  }
  return value;
}
445 | | |
446 | 166k | static pfunc found_string(struct jv_parser* p) { |
447 | 166k | char* in = p->tokenbuf; |
448 | 166k | char* out = p->tokenbuf; |
449 | 166k | char* end = p->tokenbuf + p->tokenpos; |
450 | | |
451 | 77.9M | while (in < end) { |
452 | 77.8M | char c = *in++; |
453 | 77.8M | if (c == '\\') { |
454 | 81.7k | if (in >= end) |
455 | 0 | return "Expected escape character at end of string"; |
456 | 81.7k | c = *in++; |
457 | 81.7k | switch (c) { |
458 | 10.3k | case '\\': |
459 | 11.6k | case '"': |
460 | 12.9k | case '/': *out++ = c; break; |
461 | 1.67k | case 'b': *out++ = '\b'; break; |
462 | 1.55k | case 'f': *out++ = '\f'; break; |
463 | 1.22k | case 't': *out++ = '\t'; break; |
464 | 2.10k | case 'n': *out++ = '\n'; break; |
465 | 1.28k | case 'r': *out++ = '\r'; break; |
466 | | |
467 | 20.3k | case 'u': |
468 | | /* ahh, the complicated case */ |
469 | 20.3k | if (in + 4 > end) |
470 | 1.01k | return "Invalid \\uXXXX escape"; |
471 | 19.3k | int hexvalue = unhex4(in); |
472 | 19.3k | if (hexvalue < 0) |
473 | 1.35k | return "Invalid characters in \\uXXXX escape"; |
474 | 17.9k | unsigned long codepoint = (unsigned long)hexvalue; |
475 | 17.9k | in += 4; |
476 | 17.9k | if (0xD800 <= codepoint && codepoint <= 0xDBFF) { |
477 | | /* who thought UTF-16 surrogate pairs were a good idea? */ |
478 | 11.4k | if (in + 6 > end || in[0] != '\\' || in[1] != 'u') |
479 | 4.97k | return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; |
480 | 6.44k | unsigned long surrogate = unhex4(in+2); |
481 | 6.44k | if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) |
482 | 3.29k | return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; |
483 | 3.15k | in += 6; |
484 | 3.15k | codepoint = 0x10000 + (((codepoint - 0xD800) << 10) |
485 | 3.15k | |(surrogate - 0xDC00)); |
486 | 3.15k | } |
487 | 9.68k | if (codepoint > 0x10FFFF) |
488 | 0 | codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER |
489 | 9.68k | out += jvp_utf8_encode(codepoint, out); |
490 | 9.68k | break; |
491 | | |
492 | 40.6k | default: |
493 | 40.6k | return "Invalid escape"; |
494 | 81.7k | } |
495 | 77.7M | } else { |
496 | 77.7M | if (c >= 0 && c <= 0x001f) |
497 | 214 | return "Invalid string: control characters from U+0000 through U+001F must be escaped"; |
498 | 77.7M | *out++ = c; |
499 | 77.7M | } |
500 | 77.8M | } |
501 | 115k | TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf))); |
502 | 115k | p->tokenpos = 0; |
503 | 115k | return 0; |
504 | 115k | } |
505 | | |
506 | 42.7M | static pfunc check_literal(struct jv_parser* p) { |
507 | 42.7M | if (p->tokenpos == 0) return 0; |
508 | | |
509 | 10.1M | const char* pattern = 0; |
510 | 10.1M | int plen; |
511 | 10.1M | jv v; |
512 | 10.1M | switch (p->tokenbuf[0]) { |
513 | 1.75k | case 't': pattern = "true"; plen = 4; v = jv_true(); break; |
514 | 6.74k | case 'f': pattern = "false"; plen = 5; v = jv_false(); break; |
515 | 56.5k | case 'n': |
516 | | // if it starts with 'n', it could be a literal "nan" |
517 | 56.5k | if (p->tokenbuf[1] == 'u') { |
518 | 54.6k | pattern = "null"; plen = 4; v = jv_null(); |
519 | 54.6k | } |
520 | 10.1M | } |
521 | 10.1M | if (pattern) { |
522 | 63.1k | if (p->tokenpos != plen) return "Invalid literal"; |
523 | 318k | for (int i=0; i<plen; i++) |
524 | 256k | if (p->tokenbuf[i] != pattern[i]) |
525 | 227 | return "Invalid literal"; |
526 | 62.3k | TRY(value(p, v)); |
527 | 10.0M | } else { |
528 | | // FIXME: better parser |
529 | 10.0M | p->tokenbuf[p->tokenpos] = 0; |
530 | 10.0M | #ifdef USE_DECNUM |
531 | 10.0M | jv number = jv_number_with_literal(p->tokenbuf); |
532 | 10.0M | if (jv_get_kind(number) == JV_KIND_INVALID) { |
533 | 12.5k | return "Invalid numeric literal"; |
534 | 12.5k | } |
535 | 10.0M | TRY(value(p, number)); |
536 | | #else |
537 | | char *end = 0; |
538 | | double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end); |
539 | | if (end == 0 || *end != 0) { |
540 | | return "Invalid numeric literal"; |
541 | | } |
542 | | TRY(value(p, jv_number(d))); |
543 | | #endif |
544 | 10.0M | } |
545 | 10.1M | p->tokenpos = 0; |
546 | 10.1M | return 0; |
547 | 10.1M | } |
548 | | |
/* Lexical class of a single input byte outside of a string. */
typedef enum {
  LITERAL,     /* part of a bare token (number, true, false, null, ...) */
  WHITESPACE,  /* space, tab, CR or LF */
  STRUCTURE,   /* one of [ ] { } : , */
  QUOTE,       /* a double quote */
  INVALID      /* never returned by classify() */
} chclass;

/*
 * Classify one input byte.  Anything that is not whitespace, a double
 * quote or a structural character counts as part of a literal.
 */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;
  if (c == '"')
    return QUOTE;
  if (c == '[' || c == ']' || c == '{' || c == '}' || c == ':' || c == ',')
    return STRUCTURE;
  return LITERAL;
}
577 | | |
578 | | |
579 | | static const presult OK = "output produced"; |
580 | | |
581 | 230M | static int parse_check_done(struct jv_parser* p, jv* out) { |
582 | 230M | if (p->stackpos == 0 && jv_is_valid(p->next)) { |
583 | 3.67M | *out = p->next; |
584 | 3.67M | p->next = jv_invalid(); |
585 | 3.67M | return 1; |
586 | 226M | } else { |
587 | 226M | return 0; |
588 | 226M | } |
589 | 230M | } |
590 | | |
591 | 112M | static int stream_check_done(struct jv_parser* p, jv* out) { |
592 | 112M | if (p->stacklen == 0 && jv_is_valid(p->next)) { |
593 | 794 | *out = JV_ARRAY(jv_copy(p->path),p->next); |
594 | 794 | p->next = jv_invalid(); |
595 | 794 | return 1; |
596 | 112M | } else if (jv_is_valid(p->output)) { |
597 | 628 | if (jv_array_length(jv_copy(p->output)) > 2) { |
598 | | // At end of an array or object, necessitating one more output by |
599 | | // which to indicate this |
600 | 24 | *out = jv_array_slice(jv_copy(p->output), 0, 2); |
601 | 24 | p->output = jv_array_slice(p->output, 0, 1); // arrange one more output |
602 | 604 | } else { |
603 | | // No further processing needed |
604 | 604 | *out = p->output; |
605 | 604 | p->output = jv_invalid(); |
606 | 604 | } |
607 | 628 | return 1; |
608 | 112M | } else { |
609 | 112M | return 0; |
610 | 112M | } |
611 | 112M | } |
612 | | |
613 | 37 | static int seq_check_truncation(struct jv_parser* p) { |
614 | 37 | return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER)); |
615 | 37 | } |
616 | | |
617 | 43 | static int stream_seq_check_truncation(struct jv_parser* p) { |
618 | 43 | jv_kind k = jv_get_kind(p->next); |
619 | 43 | return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL); |
620 | 43 | } |
621 | | |
622 | 11 | static int parse_is_top_num(struct jv_parser* p) { |
623 | 11 | return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
624 | 11 | } |
625 | | |
626 | 19 | static int stream_is_top_num(struct jv_parser* p) { |
627 | 19 | return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
628 | 19 | } |
629 | | |
/*
 * Mode dispatchers: each picks the streaming or the non-streaming
 * implementation according to JV_PARSE_STREAMING in the parser's flags.
 */

/* Fetch a finished value or event into *(o); non-zero if one was produced. */
#define check_done(p, o)                      \
  (((p)->flags & JV_PARSE_STREAMING)          \
     ? stream_check_done((p), (o))            \
     : parse_check_done((p), (o)))

/* Process the structural character ch. */
#define token(p, ch)                          \
  (((p)->flags & JV_PARSE_STREAMING)          \
     ? stream_token((p), (ch))                \
     : parse_token((p), (ch)))

/* Non-zero if the current text looks truncated. */
#define check_truncation(p)                   \
  (((p)->flags & JV_PARSE_STREAMING)          \
     ? stream_seq_check_truncation((p))       \
     : seq_check_truncation((p)))

/* Non-zero if the pending value is a top-level number. */
#define is_top_num(p)                         \
  (((p)->flags & JV_PARSE_STREAMING)          \
     ? stream_is_top_num((p))                 \
     : parse_is_top_num((p)))
641 | | |
642 | 392M | static pfunc scan(struct jv_parser* p, char ch, jv* out) { |
643 | 392M | p->column++; |
644 | 392M | if (ch == '\n') { |
645 | 193k | p->line++; |
646 | 193k | p->column = 0; |
647 | 193k | } |
648 | 392M | if ((p->flags & JV_PARSE_SEQ) |
649 | 392M | && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) { |
650 | 80 | if (check_truncation(p)) { |
651 | 51 | if (check_literal(p) == 0 && is_top_num(p)) |
652 | 1 | return "Potentially truncated top-level numeric value"; |
653 | 50 | return "Truncated value"; |
654 | 51 | } |
655 | 29 | TRY(check_literal(p)); |
656 | 16 | if (p->st == JV_PARSER_NORMAL && check_done(p, out)) |
657 | 4 | return OK; |
658 | | // shouldn't happen? |
659 | 12 | assert(!jv_is_valid(*out)); |
660 | 12 | parser_reset(p); |
661 | 12 | jv_free(*out); |
662 | 12 | *out = jv_invalid(); |
663 | 12 | return OK; |
664 | 12 | } |
665 | 392M | presult answer = 0; |
666 | 392M | p->last_ch_was_ws = 0; |
667 | 392M | if (p->st == JV_PARSER_NORMAL) { |
668 | 300M | chclass cls = classify(ch); |
669 | 300M | if (cls == WHITESPACE) |
670 | 51.3k | p->last_ch_was_ws = 1; |
671 | 300M | if (cls != LITERAL) { |
672 | 41.0M | TRY(check_literal(p)); |
673 | 41.0M | if (check_done(p, out)) answer = OK; |
674 | 41.0M | } |
675 | 300M | switch (cls) { |
676 | 259M | case LITERAL: |
677 | 259M | tokenadd(p, ch); |
678 | 259M | break; |
679 | 51.2k | case WHITESPACE: |
680 | 51.2k | break; |
681 | 193k | case QUOTE: |
682 | 193k | p->st = JV_PARSER_STRING; |
683 | 193k | break; |
684 | 40.8M | case STRUCTURE: |
685 | 40.8M | TRY(token(p, ch)); |
686 | 40.8M | break; |
687 | 40.8M | case INVALID: |
688 | 0 | return "Invalid character"; |
689 | 300M | } |
690 | 300M | if (check_done(p, out)) answer = OK; |
691 | 300M | } else { |
692 | 91.2M | if (ch == '"' && p->st == JV_PARSER_STRING) { |
693 | 166k | TRY(found_string(p)); |
694 | 115k | p->st = JV_PARSER_NORMAL; |
695 | 115k | if (check_done(p, out)) answer = OK; |
696 | 91.0M | } else { |
697 | 91.0M | tokenadd(p, ch); |
698 | 91.0M | if (ch == '\\' && p->st == JV_PARSER_STRING) { |
699 | 1.03M | p->st = JV_PARSER_STRING_ESCAPE; |
700 | 90.0M | } else { |
701 | 90.0M | p->st = JV_PARSER_STRING; |
702 | 90.0M | } |
703 | 91.0M | } |
704 | 91.2M | } |
705 | 392M | return answer; |
706 | 392M | } |
707 | | |
708 | 3.13k | struct jv_parser* jv_parser_new(int flags) { |
709 | 3.13k | struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser)); |
710 | 3.13k | parser_init(p, flags); |
711 | 3.13k | p->flags = flags; |
712 | 3.13k | return p; |
713 | 3.13k | } |
714 | | |
/*
 * Destroy a parser created by jv_parser_new(): release everything it owns,
 * then the parser object itself.  p must not be used afterwards.
 */
void jv_parser_free(struct jv_parser* p) {
  parser_free(p);   /* owned values and buffers */
  jv_mem_free(p);   /* the struct */
}
719 | | |
/* Byte sequence of a UTF-8 byte order mark; jv_parser_set_buf() strips it from the start of the input. */
static const unsigned char UTF8_BOM[] = {
  0xEF, 0xBB, 0xBF
};
721 | | |
722 | 0 | int jv_parser_remaining(struct jv_parser* p) { |
723 | 0 | if (p->curr_buf == 0) |
724 | 0 | return 0; |
725 | 0 | return (p->curr_buf_length - p->curr_buf_pos); |
726 | 0 | } |
727 | | |
728 | 1.77M | void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) { |
729 | 1.77M | assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length) |
730 | 1.77M | && "previous buffer not exhausted"); |
731 | 3.52M | while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) { |
732 | 1.75M | if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) { |
733 | | // matched a BOM character |
734 | 1.25k | buf++; |
735 | 1.25k | length--; |
736 | 1.25k | p->bom_strip_position++; |
737 | 1.75M | } else { |
738 | 1.75M | if (p->bom_strip_position == 0) { |
739 | | // no BOM in this document |
740 | 1.75M | p->bom_strip_position = sizeof(UTF8_BOM); |
741 | 1.75M | } else { |
742 | | // malformed BOM (prefix present, rest missing) |
743 | 1.18k | p->bom_strip_position = 0xff; |
744 | 1.18k | } |
745 | 1.75M | } |
746 | 1.75M | } |
747 | 1.77M | p->curr_buf = buf; |
748 | 1.77M | p->curr_buf_length = length; |
749 | 1.77M | p->curr_buf_pos = 0; |
750 | 1.77M | p->curr_buf_is_partial = is_partial; |
751 | 1.77M | } |
752 | | |
753 | | static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3); |
754 | | |
755 | 107k | static jv make_error(struct jv_parser* p, const char *fmt, ...) { |
756 | 107k | va_list ap; |
757 | 107k | va_start(ap, fmt); |
758 | 107k | jv e = jv_string_vfmt(fmt, ap); |
759 | 107k | va_end(ap); |
760 | 107k | if ((p->flags & JV_PARSE_STREAM_ERRORS)) |
761 | 516 | return JV_ARRAY(e, jv_copy(p->path)); |
762 | 107k | return jv_invalid_with_msg(e); |
763 | 107k | } |
764 | | |
765 | 7.07M | jv jv_parser_next(struct jv_parser* p) { |
766 | 7.07M | if (p->eof) |
767 | 1.63M | return jv_invalid(); |
768 | 5.44M | if (!p->curr_buf) |
769 | 57 | return jv_invalid(); // Need a buffer |
770 | 5.44M | if (p->bom_strip_position == 0xff) { |
771 | 1.22k | if (!(p->flags & JV_PARSE_SEQ)) |
772 | 1.10k | return jv_invalid_with_msg(jv_string("Malformed BOM")); |
773 | 116 | p->st =JV_PARSER_WAITING_FOR_RS; |
774 | 116 | parser_reset(p); |
775 | 116 | } |
776 | 5.44M | jv value = jv_invalid(); |
777 | 5.44M | if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value)) |
778 | 8 | return value; |
779 | 5.44M | char ch; |
780 | 5.44M | presult msg = 0; |
781 | 400M | while (!msg && p->curr_buf_pos < p->curr_buf_length) { |
782 | 395M | ch = p->curr_buf[p->curr_buf_pos++]; |
783 | 395M | if (p->st == JV_PARSER_WAITING_FOR_RS) { |
784 | 3.17M | if (ch == '\n') { |
785 | 4.22k | p->line++; |
786 | 4.22k | p->column = 0; |
787 | 3.17M | } else { |
788 | 3.17M | p->column++; |
789 | 3.17M | } |
790 | 3.17M | if (ch == '\036') |
791 | 106 | p->st = JV_PARSER_NORMAL; |
792 | 3.17M | continue; // need to resync, wait for RS |
793 | 3.17M | } |
794 | 392M | msg = scan(p, ch, &value); |
795 | 392M | } |
796 | 5.44M | if (msg == OK) { |
797 | 3.67M | return value; |
798 | 3.67M | } else if (msg) { |
799 | 69.8k | jv_free(value); |
800 | 69.8k | if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) { |
801 | | // Skip to the next RS |
802 | 126 | p->st = JV_PARSER_WAITING_FOR_RS; |
803 | 126 | value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column); |
804 | 126 | parser_reset(p); |
805 | 126 | return value; |
806 | 126 | } |
807 | 69.7k | value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column); |
808 | 69.7k | parser_reset(p); |
809 | 69.7k | if (!(p->flags & JV_PARSE_SEQ)) { |
810 | | // We're not parsing a JSON text sequence; throw this buffer away. |
811 | | // XXX We should fail permanently here. |
812 | 69.6k | p->curr_buf = 0; |
813 | 69.6k | p->curr_buf_pos = 0; |
814 | 69.6k | } // Else ch must be RS; don't clear buf so we can start parsing again after this ch |
815 | 69.7k | return value; |
816 | 1.70M | } else if (p->curr_buf_is_partial) { |
817 | 11.9k | assert(p->curr_buf_pos == p->curr_buf_length); |
818 | | // need another buffer |
819 | 11.9k | return jv_invalid(); |
820 | 1.68M | } else { |
821 | | // at EOF |
822 | 1.68M | p->eof = 1; |
823 | 1.68M | assert(p->curr_buf_pos == p->curr_buf_length); |
824 | 1.68M | jv_free(value); |
825 | 1.68M | if (p->st == JV_PARSER_WAITING_FOR_RS) |
826 | 107 | return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column); |
827 | 1.68M | if (p->st != JV_PARSER_NORMAL) { |
828 | 26.8k | value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column); |
829 | 26.8k | parser_reset(p); |
830 | 26.8k | p->st = JV_PARSER_WAITING_FOR_RS; |
831 | 26.8k | return value; |
832 | 26.8k | } |
833 | 1.66M | if ((msg = check_literal(p))) { |
834 | 6.11k | value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column); |
835 | 6.11k | parser_reset(p); |
836 | 6.11k | p->st = JV_PARSER_WAITING_FOR_RS; |
837 | 6.11k | return value; |
838 | 6.11k | } |
839 | 1.65M | if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) || |
840 | 1.65M | (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) { |
841 | 5.06k | value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column); |
842 | 5.06k | parser_reset(p); |
843 | 5.06k | p->st = JV_PARSER_WAITING_FOR_RS; |
844 | 5.06k | return value; |
845 | 5.06k | } |
846 | | // p->next is either invalid (nothing here, but no syntax error) |
847 | | // or valid (this is the value). either way it's the thing to return |
848 | 1.65M | if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) { |
849 | 1.20k | value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value] |
850 | 1.64M | } else { |
851 | 1.64M | value = p->next; |
852 | 1.64M | } |
853 | 1.65M | p->next = jv_invalid(); |
854 | 1.65M | if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) { |
855 | 4 | jv_free(value); |
856 | 4 | return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column); |
857 | 4 | } |
858 | 1.65M | return value; |
859 | 1.65M | } |
860 | 5.44M | } |
861 | | |
862 | 1.75M | jv jv_parse_sized_custom_flags(const char* string, int length, int flags) { |
863 | 1.75M | struct jv_parser parser; |
864 | 1.75M | parser_init(&parser, flags); |
865 | 1.75M | jv_parser_set_buf(&parser, string, length, 0); |
866 | 1.75M | jv value = jv_parser_next(&parser); |
867 | 1.75M | if (jv_is_valid(value)) { |
868 | 1.65M | jv next = jv_parser_next(&parser); |
869 | 1.65M | if (jv_is_valid(next)) { |
870 | | // multiple JSON values, we only wanted one |
871 | 794 | jv_free(value); |
872 | 794 | jv_free(next); |
873 | 794 | value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); |
874 | 1.65M | } else if (jv_invalid_has_msg(jv_copy(next))) { |
875 | | // parser error after the first JSON value |
876 | 6.11k | jv_free(value); |
877 | 6.11k | value = next; |
878 | 1.64M | } else { |
879 | | // a single valid JSON value |
880 | 1.64M | jv_free(next); |
881 | 1.64M | } |
882 | 1.65M | } else if (jv_invalid_has_msg(jv_copy(value))) { |
883 | | // parse error, we'll return it |
884 | 100k | } else { |
885 | | // no value at all |
886 | 3.11k | jv_free(value); |
887 | 3.11k | value = jv_invalid_with_msg(jv_string("Expected JSON value")); |
888 | 3.11k | } |
889 | 1.75M | parser_free(&parser); |
890 | | |
891 | 1.75M | if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) { |
892 | 111k | jv msg = jv_invalid_get_msg(value); |
893 | 111k | value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')", |
894 | 111k | jv_string_value(msg), |
895 | 111k | string)); |
896 | 111k | jv_free(msg); |
897 | 111k | } |
898 | 1.75M | return value; |
899 | 1.75M | } |
900 | | |
901 | 1.75M | jv jv_parse_sized(const char* string, int length) { |
902 | 1.75M | return jv_parse_sized_custom_flags(string, length, 0); |
903 | 1.75M | } |
904 | | |
905 | 48.5k | jv jv_parse(const char* string) { |
906 | 48.5k | return jv_parse_sized(string, strlen(string)); |
907 | 48.5k | } |
908 | | |
909 | 6.59k | jv jv_parse_custom_flags(const char* string, int flags) { |
910 | 6.59k | return jv_parse_sized_custom_flags(string, strlen(string), flags); |
911 | 6.59k | } |