Line | Count | Source |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | #include <string.h> |
4 | | #include <assert.h> |
5 | | #include "jv.h" |
6 | | #include "jv_dtoa.h" |
7 | | #include "jv_unicode.h" |
8 | | #include "jv_alloc.h" |
9 | | #include "jv_dtoa.h" |
10 | | |
/*
 * Result type of the internal parsing helpers: a null pointer means
 * "nothing to report, keep going"; any non-null pointer is a message string
 * that is handed back up the call chain.
 */
typedef const char* presult;

/* Nesting depth at which parse_token() rejects a further '[' or '{'.
   May be overridden at build time. */
#ifndef MAX_PARSING_DEPTH
#define MAX_PARSING_DEPTH (10000)
#endif

/* Evaluate x once; if it yields a non-null presult, return that value from
   the enclosing function. */
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
/* Return-type shorthand for helpers returning presult; with GCC-compatible
   compilers it also warns when the result is ignored. */
#ifdef __GNUC__
#define pfunc __attribute__((warn_unused_result)) presult
#else
#define pfunc presult
#endif
23 | | |
/*
 * What the streaming tokenizer handled most recently. The structural values
 * reuse the character itself as the enumerator value; JV_LAST_VALUE stands
 * for a complete scalar or a just-closed array/object inside a container.
 * JV_LAST_NONE is used at the top level (see value() and stream_token()).
 */
enum last_seen {
  JV_LAST_NONE = 0,
  JV_LAST_OPEN_ARRAY = '[',
  JV_LAST_OPEN_OBJECT = '{',
  JV_LAST_COLON = ':',
  JV_LAST_COMMA = ',',
  JV_LAST_VALUE = 'V',
};
32 | | |
/*
 * Complete state of one incremental JSON parser. The same struct serves the
 * regular (tree-building) parser and the streaming parser; the trailing
 * comments say which of the two uses a field.
 */
struct jv_parser {
  const char* curr_buf;       // input buffer installed by jv_parser_set_buf(); not freed by parser_free()
  int curr_buf_length;        // number of bytes in curr_buf (after any BOM bytes were skipped)
  int curr_buf_pos;           // index of the next unread byte in curr_buf
  int curr_buf_is_partial;    // as passed to jv_parser_set_buf(): nonzero means more buffers follow
  int eof;                    // set by jv_parser_next() once a non-partial buffer is exhausted
  unsigned bom_strip_position; // BOM bytes matched so far; sizeof(UTF8_BOM) = done/no BOM, 0xff = malformed BOM

  int flags;                  // JV_PARSE_* bit flags

  jv* stack;                   // parser
  int stackpos;                // parser
  int stacklen;                // both (optimization; it's really pathlen for streaming)
  jv path;                     // streamer
  enum last_seen last_seen;    // streamer
  jv output;                   // streamer
  jv next;                     // both

  char* tokenbuf;             // bytes of the literal or string currently being accumulated
  int tokenpos;               // number of bytes used in tokenbuf
  int tokenlen;               // allocated size of tokenbuf

  int line, column;           // position reported in error messages

  struct dtoa_context dtoa;

  enum {
    JV_PARSER_NORMAL,          // between tokens / inside a bare literal
    JV_PARSER_STRING,          // inside a double-quoted string
    JV_PARSER_STRING_ESCAPE,   // inside a string, right after a backslash
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
  } st;
  unsigned int last_ch_was_ws:1; // set by scan() when the latest character was whitespace
};
67 | | |
68 | | |
69 | 3.16M | static void parser_init(struct jv_parser* p, int flags) { |
70 | 3.16M | p->flags = flags; |
71 | 3.16M | if ((p->flags & JV_PARSE_STREAMING)) { |
72 | 3.88k | p->path = jv_array(); |
73 | 3.16M | } else { |
74 | 3.16M | p->path = jv_invalid(); |
75 | 3.16M | p->flags &= ~(JV_PARSE_STREAM_ERRORS); |
76 | 3.16M | } |
77 | 3.16M | p->stack = 0; |
78 | 3.16M | p->stacklen = p->stackpos = 0; |
79 | 3.16M | p->last_seen = JV_LAST_NONE; |
80 | 3.16M | p->output = jv_invalid(); |
81 | 3.16M | p->next = jv_invalid(); |
82 | 3.16M | p->tokenbuf = 0; |
83 | 3.16M | p->tokenlen = p->tokenpos = 0; |
84 | 3.16M | if ((p->flags & JV_PARSE_SEQ)) |
85 | 285 | p->st = JV_PARSER_WAITING_FOR_RS; |
86 | 3.16M | else |
87 | 3.16M | p->st = JV_PARSER_NORMAL; |
88 | 3.16M | p->eof = 0; |
89 | 3.16M | p->curr_buf = 0; |
90 | 3.16M | p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0; |
91 | 3.16M | p->bom_strip_position = 0; |
92 | 3.16M | p->last_ch_was_ws = 0; |
93 | 3.16M | p->line = 1; |
94 | 3.16M | p->column = 0; |
95 | 3.16M | jvp_dtoa_context_init(&p->dtoa); |
96 | 3.16M | } |
97 | | |
98 | 3.40M | static void parser_reset(struct jv_parser* p) { |
99 | 3.40M | if ((p->flags & JV_PARSE_STREAMING)) { |
100 | 6.07k | jv_free(p->path); |
101 | 6.07k | p->path = jv_array(); |
102 | 6.07k | p->stacklen = 0; |
103 | 6.07k | } |
104 | 3.40M | p->last_seen = JV_LAST_NONE; |
105 | 3.40M | jv_free(p->output); |
106 | 3.40M | p->output = jv_invalid(); |
107 | 3.40M | jv_free(p->next); |
108 | 3.40M | p->next = jv_invalid(); |
109 | 4.73M | for (int i=0; i<p->stackpos; i++) |
110 | 1.32M | jv_free(p->stack[i]); |
111 | 3.40M | p->stackpos = 0; |
112 | 3.40M | p->tokenpos = 0; |
113 | 3.40M | p->st = JV_PARSER_NORMAL; |
114 | 3.40M | } |
115 | | |
116 | 3.16M | static void parser_free(struct jv_parser* p) { |
117 | 3.16M | parser_reset(p); |
118 | 3.16M | jv_free(p->path); |
119 | 3.16M | jv_free(p->output); |
120 | 3.16M | jv_mem_free(p->stack); |
121 | 3.16M | jv_mem_free(p->tokenbuf); |
122 | 3.16M | jvp_dtoa_context_free(&p->dtoa); |
123 | 3.16M | } |
124 | | |
125 | 22.2M | static pfunc value(struct jv_parser* p, jv val) { |
126 | 22.2M | if ((p->flags & JV_PARSE_STREAMING)) { |
127 | 10.5k | if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) { |
128 | 105 | jv_free(val); |
129 | 105 | return "Expected separator between values"; |
130 | 105 | } |
131 | 10.4k | if (p->stacklen > 0) |
132 | 8.39k | p->last_seen = JV_LAST_VALUE; |
133 | 2.06k | else |
134 | 2.06k | p->last_seen = JV_LAST_NONE; |
135 | 22.2M | } else { |
136 | 22.2M | if (jv_is_valid(p->next)) { |
137 | 212 | jv_free(val); |
138 | 212 | return "Expected separator between values"; |
139 | 212 | } |
140 | 22.2M | } |
141 | 22.2M | jv_free(p->next); |
142 | 22.2M | p->next = val; |
143 | 22.2M | return 0; |
144 | 22.2M | } |
145 | | |
146 | 3.67M | static void push(struct jv_parser* p, jv v) { |
147 | 3.67M | assert(p->stackpos <= p->stacklen); |
148 | 3.67M | if (p->stackpos == p->stacklen) { |
149 | 18.8k | p->stacklen = p->stacklen * 2 + 10; |
150 | 18.8k | p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv)); |
151 | 18.8k | } |
152 | 3.67M | assert(p->stackpos < p->stacklen); |
153 | 3.67M | p->stack[p->stackpos++] = v; |
154 | 3.67M | } |
155 | | |
/*
 * Handle one structural character in the non-streaming parser.
 *
 * State used: p->stack holds the containers currently open; while an object
 * member's value is being read, the member's key string sits on top of its
 * object. p->next is the most recently completed value that has not yet been
 * attached to a container.
 *
 * ch is one of '[', '{', ':', ',', ']' or '}'; any other character falls
 * through the switch and yields 0.
 *
 * Returns 0 on success or an error message. On error the stack and p->next
 * are left as they were; jv_parser_next() calls parser_reset() afterwards.
 */
static pfunc parse_token(struct jv_parser* p, char ch) {
  switch (ch) {
  case '[':
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
    if (jv_is_valid(p->next)) return "Expected separator between values";
    push(p, jv_array());
    break;

  case '{':
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
    if (jv_is_valid(p->next)) return "Expected separator between values";
    push(p, jv_object());
    break;

  case ':':
    if (!jv_is_valid(p->next))
      return "Expected string key before ':'";
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
      return "':' not as part of an object";
    if (jv_get_kind(p->next) != JV_KIND_STRING)
      return "Object keys must be strings";
    // The key moves from p->next onto the stack, above its object.
    push(p, p->next);
    p->next = jv_invalid();
    break;

  case ',':
    if (!jv_is_valid(p->next))
      return "Expected value before ','";
    if (p->stackpos == 0)
      return "',' not as part of an object or array";
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
      // Array element: append the pending value to the array on top.
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
      p->next = jv_invalid();
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
      // Object member: the top of stack is the key, the object is just below it.
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
                                              p->stack[p->stackpos-1], p->next);
      p->stackpos--;
      p->next = jv_invalid();
    } else {
      // this case hits on input like {"a", "b"}
      return "Objects must consist of key:value pairs";
    }
    break;

  case ']':
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
      return "Unmatched ']'";
    if (jv_is_valid(p->next)) {
      // Attach the final element before closing.
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
      p->next = jv_invalid();
    } else {
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
        // this case hits on input like [1,2,3,]
        return "Expected another array element";
      }
    }
    // Pop the finished array; it becomes the pending value.
    jv_free(p->next);
    p->next = p->stack[--p->stackpos];
    break;

  case '}':
    if (p->stackpos == 0)
      return "Unmatched '}'";
    if (jv_is_valid(p->next)) {
      // A pending value needs its key on top of the stack.
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
        return "Objects must consist of key:value pairs";
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
                                              p->stack[p->stackpos-1], p->next);
      p->stackpos--;
      p->next = jv_invalid();
    } else {
      // No pending value: only an empty object may be closed here.
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
        return "Unmatched '}'";
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
        return "Expected another key-value pair";
    }
    // Pop the finished object; it becomes the pending value.
    jv_free(p->next);
    p->next = p->stack[--p->stackpos];
    break;
  }
  return 0;
}
240 | | |
/*
 * Handle one structural character in the streaming parser.
 *
 * Instead of building a tree, the streamer keeps p->path, an array with one
 * entry per open container: a number (the current index) for an array, or
 * for an object either null (key not known yet) or the current key string.
 * p->stacklen mirrors the length of p->path. Completed events are stored in
 * p->output as [path, leaf] or, when a container closes, [path, leaf, true]
 * or [path]; stream_check_done() hands them to the caller.
 *
 * ch is one of '[', '{', ':', ',', ']' or '}'; any other character falls
 * through the switch and yields 0.
 *
 * Returns 0 on success or an error message.
 *
 * NOTE(review): unlike parse_token(), the '[' and '{' cases apply no
 * MAX_PARSING_DEPTH check -- confirm whether that is intentional.
 * NOTE(review): '[' rejects a pending value via jv_is_valid(p->next) while
 * '{' tests p->last_seen == JV_LAST_VALUE -- confirm the asymmetry is
 * intended.
 */
static pfunc stream_token(struct jv_parser* p, char ch) {
  jv_kind k;
  jv last;

  switch (ch) {
  case '[':
    if (jv_is_valid(p->next))
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {["foo"]}
      return "Expected string key after '{', not '['";
    if (p->last_seen == JV_LAST_COMMA) {
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",["foo"]}
        return "Expected string key after ',' in object, not '['";
    }
    p->path = jv_array_append(p->path, jv_number(0)); // push
    p->last_seen = JV_LAST_OPEN_ARRAY;
    p->stacklen++;
    break;

  case '{':
    if (p->last_seen == JV_LAST_VALUE)
      return "Expected a separator between values";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      // Looks like {{"foo":"bar"}}
      return "Expected string key after '{', not '{'";
    if (p->last_seen == JV_LAST_COMMA) {
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
      k = jv_get_kind(last);
      jv_free(last);
      if (k != JV_KIND_NUMBER)
        // Looks like {"x":"y",{"foo":"bar"}}
        return "Expected string key after ',' in object, not '{'";
    }
    // Push object key: null, since we don't know it yet
    p->path = jv_array_append(p->path, jv_null()); // push
    p->last_seen = JV_LAST_OPEN_OBJECT;
    p->stacklen++;
    break;

  case ':':
    last = jv_invalid();
    // A number on top of the path means the innermost container is an array.
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
      jv_free(last);
      return "':' not as part of an object";
    }
    jv_free(last);
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
      return "Expected string key before ':'";
    if (jv_get_kind(p->next) != JV_KIND_STRING)
      return "Object keys must be strings";
    if (p->last_seen != JV_LAST_VALUE)
      return "':' should follow a key";
    p->last_seen = JV_LAST_COLON;
    // The key replaces the top path entry; ownership of p->next moves into the path.
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
    p->next = jv_invalid();
    break;

  case ',':
    if (p->last_seen != JV_LAST_VALUE)
      return "Expected value before ','";
    if (p->stacklen == 0)
      return "',' not as part of an object or array";
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    if (k == JV_KIND_NUMBER) {
      int idx = jv_number_value(last);

      // Emit [path, leaf] for a pending scalar, then advance the array index.
      if (jv_is_valid(p->next)) {
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
      p->last_seen = JV_LAST_COMMA;
    } else if (k == JV_KIND_STRING) {
      if (jv_is_valid(p->next)) {
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
        p->next = jv_invalid();
      }
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
      p->last_seen = JV_LAST_COMMA;
    } else {
      assert(k == JV_KIND_NULL);
      // this case hits on input like {,}
      // make sure to handle input like {"a", "b"} and {"a":, ...}
      jv_free(last);
      return "Objects must consist of key:value pairs";
    }
    jv_free(last);
    break;

  case ']':
    if (p->stacklen == 0)
      return "Unmatched ']' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another array element";
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);

    if (k != JV_KIND_NUMBER)
      return "Unmatched ']' in the middle of an object";
    if (jv_is_valid(p->next)) {
      // Last element is a scalar: emit it with a third entry so that
      // stream_check_done() follows up with the closing [path] event.
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
      // Last element was itself a container: emit only the closing event.
      p->output = JV_ARRAY(jv_copy(p->path));
    }

    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    //assert(!jv_is_valid(p->next));
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_ARRAY)
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;

  case '}':
    if (p->stacklen == 0)
      return "Unmatched '}' at the top-level";
    if (p->last_seen == JV_LAST_COMMA)
      return "Expected another key:value pair";
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      assert(!jv_is_valid(p->next));

    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
    k = jv_get_kind(last);
    jv_free(last);
    if (k == JV_KIND_NUMBER)
      return "Unmatched '}' in the middle of an array";

    if (jv_is_valid(p->next)) {
      if (k != JV_KIND_STRING)
        return "Objects must consist of key:value pairs";
      // Last member value is a scalar: emit it with a third entry so that
      // stream_check_done() follows up with the closing [path] event.
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
      p->next = jv_invalid();
    } else {
      // Perhaps {"a":[]}
      if (p->last_seen == JV_LAST_COLON)
        // Looks like {"a":}
        return "Missing value in key:value pair";
      if (p->last_seen == JV_LAST_COMMA)
        // Looks like {"a":0,}
        return "Expected another key-value pair";
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
        return "Unmatched '}' in the middle of an array";
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
        return "Unmatched '}'";
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
        p->output = JV_ARRAY(jv_copy(p->path));
    }
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
    jv_free(p->next);
    p->next = jv_invalid();

    if (p->last_seen == JV_LAST_OPEN_OBJECT)
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty objects are leaves

    if (p->stacklen == 0)
      p->last_seen = JV_LAST_NONE;
    else
      p->last_seen = JV_LAST_VALUE;
    break;
  }
  return 0;
}
420 | | |
421 | 413M | static void tokenadd(struct jv_parser* p, char c) { |
422 | 413M | assert(p->tokenpos <= p->tokenlen); |
423 | 413M | if (p->tokenpos >= (p->tokenlen - 1)) { |
424 | 3.18M | p->tokenlen = p->tokenlen*2 + 256; |
425 | 3.18M | p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen); |
426 | 3.18M | } |
427 | 413M | assert(p->tokenpos < p->tokenlen); |
428 | 413M | p->tokenbuf[p->tokenpos++] = c; |
429 | 413M | } |
430 | | |
/*
 * Decode exactly four hexadecimal digits (either case) starting at hex.
 *
 * Returns the value 0..0xFFFF, or -1 as soon as a non-hex character is met.
 * The caller must make sure the four bytes are readable, unless an earlier
 * byte is known to be a non-hex character (e.g. a terminating NUL).
 */
static int unhex4(char* hex) {
  int acc = 0;
  for (const char* digit = hex; digit != hex + 4; digit++) {
    const char c = *digit;
    int nibble;
    if (c >= '0' && c <= '9')
      nibble = c - '0';
    else if (c >= 'a' && c <= 'f')
      nibble = 10 + (c - 'a');
    else if (c >= 'A' && c <= 'F')
      nibble = 10 + (c - 'A');
    else
      return -1;
    acc = (acc << 4) | nibble;
  }
  return acc;
}
445 | | |
/*
 * Turn the raw bytes accumulated between two double quotes into a string
 * value.
 *
 * The token buffer is unescaped in place: `in` reads the raw bytes and `out`
 * writes the decoded bytes into the same buffer. Handles the single-character
 * escapes, \uXXXX, and \uXXXX\uXXXX surrogate pairs (combined into a single
 * code point); unescaped bytes 0x00..0x1F are rejected.
 *
 * On success the string is handed to value() and the token buffer is
 * emptied. Returns 0 on success or an error message; on error tokenpos is
 * left as it was.
 */
static pfunc found_string(struct jv_parser* p) {
  char* in = p->tokenbuf;
  char* out = p->tokenbuf;
  char* end = p->tokenbuf + p->tokenpos;

  while (in < end) {
    char c = *in++;
    if (c == '\\') {
      if (in >= end)
        return "Expected escape character at end of string";
      c = *in++;
      switch (c) {
      case '\\':
      case '"':
      case '/': *out++ = c; break;
      case 'b': *out++ = '\b'; break;
      case 'f': *out++ = '\f'; break;
      case 't': *out++ = '\t'; break;
      case 'n': *out++ = '\n'; break;
      case 'r': *out++ = '\r'; break;

      case 'u':
        /* ahh, the complicated case */
        if (in + 4 > end)
          return "Invalid \\uXXXX escape";
        int hexvalue = unhex4(in);
        if (hexvalue < 0)
          return "Invalid characters in \\uXXXX escape";
        unsigned long codepoint = (unsigned long)hexvalue;
        in += 4;
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
          /* who thought UTF-16 surrogate pairs were a good idea? */
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
          // unhex4's -1 converts to a huge unsigned value, so it fails the
          // range check below as well.
          unsigned long surrogate = unhex4(in+2);
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
          in += 6;
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
                                 |(surrogate - 0xDC00));
        }
        if (codepoint > 0x10FFFF)
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
        out += jvp_utf8_encode(codepoint, out);
        break;

      default:
        return "Invalid escape";
      }
    } else {
      // True exactly when no bit above the low five is set, i.e. c is 0x00..0x1F.
      if (!(c & ~0x1F))
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
      *out++ = c;
    }
  }
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
  p->tokenpos = 0;
  return 0;
}
505 | | |
/*
 * Convert the bare token accumulated in the token buffer, if any, into a
 * value.
 *
 * An empty buffer is not an error (returns 0). A token starting with 't',
 * 'f' or "nu" must be exactly "true", "false" or "null"; a token starting
 * with a single quote is rejected with a dedicated message; anything else is
 * parsed as a number. On success the value is handed to value() and the
 * token buffer is emptied.
 *
 * Returns 0 on success or an error message; on error tokenpos is left as it
 * was.
 */
static pfunc check_literal(struct jv_parser* p) {
  if (p->tokenpos == 0) return 0;

  const char* pattern = 0;
  int plen;
  jv v;
  switch (p->tokenbuf[0]) {
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
  case '\'':
    return "Invalid string literal; expected \", but got '";
  case 'n':
    // if it starts with 'n', it could be a literal "nan"
    if (p->tokenpos > 1 && p->tokenbuf[1] == 'u') {
      pattern = "null"; plen = 4; v = jv_null();
    }
  }
  if (pattern) {
    if (p->tokenpos != plen) return "Invalid literal";
    for (int i=0; i<plen; i++)
      if (p->tokenbuf[i] != pattern[i])
        return "Invalid literal";
    TRY(value(p, v));
  } else {
    // FIXME: better parser
    // NUL-terminate for the number parser; tokenadd() keeps this slot in bounds.
    p->tokenbuf[p->tokenpos] = 0;
#ifdef USE_DECNUM
    jv number = jv_number_with_literal(p->tokenbuf);
    if (jv_get_kind(number) == JV_KIND_INVALID) {
      // NOTE(review): `number` is not freed on this path -- confirm that
      // jv_number_with_literal() never returns an invalid value that owns memory.
      return "Invalid numeric literal";
    }
    TRY(value(p, number));
#else
    char *end = 0;
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
    // The whole token must have been consumed.
    if (end == 0 || *end != 0) {
      return "Invalid numeric literal";
    }
    TRY(value(p, jv_number(d)));
#endif
  }
  p->tokenpos = 0;
  return 0;
}
550 | | |
/* Character classes distinguished by the scanner outside of strings. */
typedef enum {
  LITERAL,
  WHITESPACE,
  STRUCTURE,
  QUOTE,
  INVALID
} chclass;

/*
 * Classify one input character: space, tab, CR and LF are WHITESPACE, the
 * double quote is QUOTE, the six JSON punctuation characters are STRUCTURE,
 * and every other byte is LITERAL. INVALID is never returned.
 */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;

  if (c == '"')
    return QUOTE;

  const int structural =
    c == '[' || c == ',' || c == ']' ||
    c == '{' || c == ':' || c == '}';

  return structural ? STRUCTURE : LITERAL;
}
579 | | |
580 | | |
/* Sentinel presult meaning "a value was produced", not an error. Callers
   recognise it by comparing the pointer itself against OK. */
static const presult OK = "output produced";
582 | | |
583 | 245M | static int parse_check_done(struct jv_parser* p, jv* out) { |
584 | 245M | if (p->stackpos == 0 && jv_is_valid(p->next)) { |
585 | 2.85M | *out = p->next; |
586 | 2.85M | p->next = jv_invalid(); |
587 | 2.85M | return 1; |
588 | 242M | } else { |
589 | 242M | return 0; |
590 | 242M | } |
591 | 245M | } |
592 | | |
593 | 141M | static int stream_check_done(struct jv_parser* p, jv* out) { |
594 | 141M | if (p->stacklen == 0 && jv_is_valid(p->next)) { |
595 | 734 | *out = JV_ARRAY(jv_copy(p->path),p->next); |
596 | 734 | p->next = jv_invalid(); |
597 | 734 | return 1; |
598 | 141M | } else if (jv_is_valid(p->output)) { |
599 | 604 | if (jv_array_length(jv_copy(p->output)) > 2) { |
600 | | // At end of an array or object, necessitating one more output by |
601 | | // which to indicate this |
602 | 21 | *out = jv_array_slice(jv_copy(p->output), 0, 2); |
603 | 21 | p->output = jv_array_slice(p->output, 0, 1); // arrange one more output |
604 | 583 | } else { |
605 | | // No further processing needed |
606 | 583 | *out = p->output; |
607 | 583 | p->output = jv_invalid(); |
608 | 583 | } |
609 | 604 | return 1; |
610 | 141M | } else { |
611 | 141M | return 0; |
612 | 141M | } |
613 | 141M | } |
614 | | |
615 | 40 | static int seq_check_truncation(struct jv_parser* p) { |
616 | 40 | return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER)); |
617 | 40 | } |
618 | | |
619 | 48 | static int stream_seq_check_truncation(struct jv_parser* p) { |
620 | 48 | jv_kind k = jv_get_kind(p->next); |
621 | 48 | return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL); |
622 | 48 | } |
623 | | |
624 | 14 | static int parse_is_top_num(struct jv_parser* p) { |
625 | 14 | return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
626 | 14 | } |
627 | | |
628 | 18 | static int stream_is_top_num(struct jv_parser* p) { |
629 | 18 | return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER); |
630 | 18 | } |
631 | | |
/*
 * Dispatch helpers: each picks the streaming or the non-streaming
 * implementation according to JV_PARSE_STREAMING in (p)->flags.
 * Note that (p) is evaluated twice, so it must have no side effects.
 */

/* Has a complete output become available? Stores it through (o) if so. */
#define check_done(p, o) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))

/* Process one structural character. */
#define token(p, ch) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))

/* Used when an RS arrives: does the text before it look cut off? */
#define check_truncation(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))

/* Is the pending value a top-level number? */
#define is_top_num(p) \
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
643 | | |
/*
 * Feed one input character to the parser.
 *
 * Updates line/column, then:
 *  - in JV_PARSE_SEQ mode an RS character (octal 036) ends the current text:
 *    a truncated text is reported as an error, a finished value is emitted,
 *    and otherwise the parser is reset and OK is returned with *out invalid;
 *  - in JV_PARSER_NORMAL state the character is classified; any non-literal
 *    character first completes a pending bare literal, then whitespace is
 *    skipped, a quote enters string state, and structural characters go to
 *    token();
 *  - in the string states the character is buffered, except for an unescaped
 *    closing quote, which completes the string.
 *
 * Returns 0 when more input is needed, OK when *out holds a result, or an
 * error message.
 */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  p->column++;
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  }
  if ((p->flags & JV_PARSE_SEQ)
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
    if (check_truncation(p)) {
      // Complete a pending literal first so a trailing top-level number can
      // be told apart from other truncated input.
      if (check_literal(p) == 0 && is_top_num(p))
        return "Potentially truncated top-level numeric value";
      return "Truncated value";
    }
    TRY(check_literal(p));
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
      return OK;
    // shouldn't happen?
    assert(!jv_is_valid(*out));
    parser_reset(p);
    jv_free(*out);
    *out = jv_invalid();
    return OK;
  }
  presult answer = 0;
  p->last_ch_was_ws = 0;
  if (p->st == JV_PARSER_NORMAL) {
    chclass cls = classify(ch);
    if (cls == WHITESPACE)
      p->last_ch_was_ws = 1;
    if (cls != LITERAL) {
      // Any non-literal character terminates a pending bare literal.
      TRY(check_literal(p));
      if (check_done(p, out)) answer = OK;
    }
    switch (cls) {
    case LITERAL:
      tokenadd(p, ch);
      break;
    case WHITESPACE:
      break;
    case QUOTE:
      p->st = JV_PARSER_STRING;
      break;
    case STRUCTURE:
      TRY(token(p, ch));
      break;
    case INVALID:
      return "Invalid character";
    }
    if (check_done(p, out)) answer = OK;
  } else {
    if (ch == '"' && p->st == JV_PARSER_STRING) {
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      if (check_done(p, out)) answer = OK;
    } else {
      // Buffer the raw byte (escapes are decoded later by found_string()).
      // A backslash in string state arms the escape state; the character
      // after it, whatever it is, returns to plain string state.
      tokenadd(p, ch);
      if (ch == '\\' && p->st == JV_PARSER_STRING) {
        p->st = JV_PARSER_STRING_ESCAPE;
      } else {
        p->st = JV_PARSER_STRING;
      }
    }
  }
  return answer;
}
709 | | |
710 | 3.32k | struct jv_parser* jv_parser_new(int flags) { |
711 | 3.32k | struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser)); |
712 | 3.32k | parser_init(p, flags); |
713 | 3.32k | p->flags = flags; |
714 | 3.32k | return p; |
715 | 3.32k | } |
716 | | |
/*
 * Destroy a parser created by jv_parser_new(): first everything it owns,
 * then the struct itself.
 */
void jv_parser_free(struct jv_parser* p) {
  parser_free(p);   /* owned values and buffers */
  jv_mem_free(p);   /* the parser object */
}
721 | | |
/* The three-byte UTF-8 byte order mark that jv_parser_set_buf() skips at the
   start of the input. */
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
723 | | |
724 | 0 | int jv_parser_remaining(struct jv_parser* p) { |
725 | 0 | if (p->curr_buf == 0) |
726 | 0 | return 0; |
727 | 0 | return (p->curr_buf_length - p->curr_buf_pos); |
728 | 0 | } |
729 | | |
730 | 3.18M | void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) { |
731 | 3.18M | assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length) |
732 | 3.18M | && "previous buffer not exhausted"); |
733 | 6.35M | while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) { |
734 | 3.16M | if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) { |
735 | | // matched a BOM character |
736 | 564 | buf++; |
737 | 564 | length--; |
738 | 564 | p->bom_strip_position++; |
739 | 3.16M | } else { |
740 | 3.16M | if (p->bom_strip_position == 0) { |
741 | | // no BOM in this document |
742 | 3.16M | p->bom_strip_position = sizeof(UTF8_BOM); |
743 | 3.16M | } else { |
744 | | // malformed BOM (prefix present, rest missing) |
745 | 488 | p->bom_strip_position = 0xff; |
746 | 488 | } |
747 | 3.16M | } |
748 | 3.16M | } |
749 | 3.18M | p->curr_buf = buf; |
750 | 3.18M | p->curr_buf_length = length; |
751 | 3.18M | p->curr_buf_pos = 0; |
752 | 3.18M | p->curr_buf_is_partial = is_partial; |
753 | 3.18M | } |
754 | | |
755 | | static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3); |
756 | | |
757 | 239k | static jv make_error(struct jv_parser* p, const char *fmt, ...) { |
758 | 239k | va_list ap; |
759 | 239k | va_start(ap, fmt); |
760 | 239k | jv e = jv_string_vfmt(fmt, ap); |
761 | 239k | va_end(ap); |
762 | 239k | if ((p->flags & JV_PARSE_STREAM_ERRORS)) |
763 | 519 | return JV_ARRAY(e, jv_copy(p->path)); |
764 | 239k | return jv_invalid_with_msg(e); |
765 | 239k | } |
766 | | |
767 | 8.93M | jv jv_parser_next(struct jv_parser* p) { |
768 | 8.93M | if (p->eof) |
769 | 2.89M | return jv_invalid(); |
770 | 6.03M | if (!p->curr_buf) |
771 | 46 | return jv_invalid(); // Need a buffer |
772 | 6.03M | if (p->bom_strip_position == 0xff) { |
773 | 522 | if (!(p->flags & JV_PARSE_SEQ)) |
774 | 406 | return jv_invalid_with_msg(jv_string("Malformed BOM")); |
775 | 116 | p->st =JV_PARSER_WAITING_FOR_RS; |
776 | 116 | parser_reset(p); |
777 | 116 | } |
778 | 6.03M | jv value = jv_invalid(); |
779 | 6.03M | if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value)) |
780 | 8 | return value; |
781 | 6.03M | char ch; |
782 | 6.03M | presult msg = 0; |
783 | 479M | while (!msg && p->curr_buf_pos < p->curr_buf_length) { |
784 | 473M | ch = p->curr_buf[p->curr_buf_pos++]; |
785 | 473M | if (p->st == JV_PARSER_WAITING_FOR_RS) { |
786 | 3.14M | if (ch == '\n') { |
787 | 228 | p->line++; |
788 | 228 | p->column = 0; |
789 | 3.14M | } else { |
790 | 3.14M | p->column++; |
791 | 3.14M | } |
792 | 3.14M | if (ch == '\036') |
793 | 117 | p->st = JV_PARSER_NORMAL; |
794 | 3.14M | continue; // need to resync, wait for RS |
795 | 3.14M | } |
796 | 470M | msg = scan(p, ch, &value); |
797 | 470M | } |
798 | 6.03M | if (msg == OK) { |
799 | 2.85M | return value; |
800 | 3.18M | } else if (msg) { |
801 | 204k | jv_free(value); |
802 | 204k | if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) { |
803 | | // Skip to the next RS |
804 | 128 | p->st = JV_PARSER_WAITING_FOR_RS; |
805 | 128 | value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column); |
806 | 128 | parser_reset(p); |
807 | 128 | return value; |
808 | 128 | } |
809 | 204k | value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column); |
810 | 204k | parser_reset(p); |
811 | 204k | if (!(p->flags & JV_PARSE_SEQ)) { |
812 | | // We're not parsing a JSON text sequence; throw this buffer away. |
813 | | // XXX We should fail permanently here. |
814 | 204k | p->curr_buf = 0; |
815 | 204k | p->curr_buf_pos = 0; |
816 | 204k | } // Else ch must be RS; don't clear buf so we can start parsing again after this ch |
817 | 204k | return value; |
818 | 2.97M | } else if (p->curr_buf_is_partial) { |
819 | 15.1k | assert(p->curr_buf_pos == p->curr_buf_length); |
820 | | // need another buffer |
821 | 15.1k | return jv_invalid(); |
822 | 2.96M | } else { |
823 | | // at EOF |
824 | 2.96M | p->eof = 1; |
825 | 2.96M | assert(p->curr_buf_pos == p->curr_buf_length); |
826 | 2.96M | jv_free(value); |
827 | 2.96M | if (p->st == JV_PARSER_WAITING_FOR_RS) |
828 | 86 | return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column); |
829 | 2.96M | if (p->st != JV_PARSER_NORMAL) { |
830 | 27.7k | value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column); |
831 | 27.7k | parser_reset(p); |
832 | 27.7k | p->st = JV_PARSER_WAITING_FOR_RS; |
833 | 27.7k | return value; |
834 | 27.7k | } |
835 | 2.93M | if ((msg = check_literal(p))) { |
836 | 4.22k | value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column); |
837 | 4.22k | parser_reset(p); |
838 | 4.22k | p->st = JV_PARSER_WAITING_FOR_RS; |
839 | 4.22k | return value; |
840 | 4.22k | } |
841 | 2.93M | if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) || |
842 | 2.93M | (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) { |
843 | 3.17k | value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column); |
844 | 3.17k | parser_reset(p); |
845 | 3.17k | p->st = JV_PARSER_WAITING_FOR_RS; |
846 | 3.17k | return value; |
847 | 3.17k | } |
848 | | // p->next is either invalid (nothing here, but no syntax error) |
849 | | // or valid (this is the value). either way it's the thing to return |
850 | 2.92M | if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) { |
851 | 1.33k | value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value] |
852 | 2.92M | } else { |
853 | 2.92M | value = p->next; |
854 | 2.92M | } |
855 | 2.92M | p->next = jv_invalid(); |
856 | 2.92M | if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) { |
857 | 3 | jv_free(value); |
858 | 3 | return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column); |
859 | 3 | } |
860 | 2.92M | return value; |
861 | 2.92M | } |
862 | 6.03M | } |
863 | | |
864 | 3.16M | jv jv_parse_sized_custom_flags(const char* string, int length, int flags) { |
865 | 3.16M | struct jv_parser parser; |
866 | 3.16M | parser_init(&parser, flags); |
867 | 3.16M | jv_parser_set_buf(&parser, string, length, 0); |
868 | 3.16M | jv value = jv_parser_next(&parser); |
869 | 3.16M | if (jv_is_valid(value)) { |
870 | 2.92M | jv next = jv_parser_next(&parser); |
871 | 2.92M | if (jv_is_valid(next)) { |
872 | | // multiple JSON values, we only wanted one |
873 | 871 | jv_free(value); |
874 | 871 | jv_free(next); |
875 | 871 | value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); |
876 | 2.92M | } else if (jv_invalid_has_msg(jv_copy(next))) { |
877 | | // parser error after the first JSON value |
878 | 2.13k | jv_free(value); |
879 | 2.13k | value = next; |
880 | 2.92M | } else { |
881 | | // a single valid JSON value |
882 | 2.92M | jv_free(next); |
883 | 2.92M | } |
884 | 2.92M | } else if (jv_invalid_has_msg(jv_copy(value))) { |
885 | | // parse error, we'll return it |
886 | 235k | } else { |
887 | | // no value at all |
888 | 2.84k | jv_free(value); |
889 | 2.84k | value = jv_invalid_with_msg(jv_string("Expected JSON value")); |
890 | 2.84k | } |
891 | 3.16M | parser_free(&parser); |
892 | | |
893 | 3.16M | if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) { |
894 | 241k | jv msg = jv_invalid_get_msg(value); |
895 | 241k | value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')", |
896 | 241k | jv_string_value(msg), |
897 | 241k | string)); |
898 | 241k | jv_free(msg); |
899 | 241k | } |
900 | 3.16M | return value; |
901 | 3.16M | } |
902 | | |
903 | 3.15M | jv jv_parse_sized(const char* string, int length) { |
904 | 3.15M | return jv_parse_sized_custom_flags(string, length, 0); |
905 | 3.15M | } |
906 | | |
907 | 47.7k | jv jv_parse(const char* string) { |
908 | 47.7k | return jv_parse_sized(string, strlen(string)); |
909 | 47.7k | } |
910 | | |
911 | 6.87k | jv jv_parse_custom_flags(const char* string, int flags) { |
912 | 6.87k | return jv_parse_sized_custom_flags(string, strlen(string), flags); |
913 | 6.87k | } |