Coverage Report

Created: 2025-11-09 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jq/src/jv_parse.c
Line
Count
Source
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <string.h>
4
#include <assert.h>
5
#include "jv.h"
6
#include "jv_dtoa.h"
7
#include "jv_unicode.h"
8
#include "jv_alloc.h"
9
#include "jv_dtoa.h"
10
11
/*
 * Result of one parsing step: 0 when there is nothing to report, otherwise a
 * pointer to a static message string describing what went wrong.
 */
typedef const char* presult;
12
13
/* Nesting limit checked by parse_token() before pushing a new '[' or '{'. */
#ifndef MAX_PARSING_DEPTH
14
3.60M
#define MAX_PARSING_DEPTH (10000)
15
#endif
16
17
135M
/*
 * Evaluate x exactly once; if it produced a non-null message, return that
 * message from the enclosing function immediately.
 */
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
18
/*
 * Return type for parsing helpers.  Under GCC-compatible compilers it also
 * asks for a warning when a caller ignores the returned message.
 */
#ifdef __GNUC__
19
#define pfunc __attribute__((warn_unused_result)) presult
20
#else
21
#define pfunc presult
22
#endif
23
24
/*
 * The most recent structural event recorded by the streaming code paths
 * (value() and stream_token()).  Most enumerators reuse the character of the
 * token they stand for.
 */
enum last_seen {
25
  JV_LAST_NONE = 0,
26
  JV_LAST_OPEN_ARRAY = '[',
27
  JV_LAST_OPEN_OBJECT = '{',
28
  JV_LAST_COLON = ':',
29
  JV_LAST_COMMA = ',',
30
  JV_LAST_VALUE = 'V',
31
};
32
33
/*
 * Complete state of one JSON parser instance: the input buffer being
 * consumed, the mode flags, the partially built value(s), the token
 * accumulator, the position used in error messages and the lexer state.
 */
struct jv_parser {
34
  // Input buffer installed by jv_parser_set_buf() and the read offset in it.
  const char* curr_buf;
35
  int curr_buf_length;
36
  int curr_buf_pos;
37
  // Non-zero when more buffers may follow; zero means end of buffer is EOF.
  int curr_buf_is_partial;
38
  // Set by jv_parser_next() once the end of the final buffer was handled.
  int eof;
39
  // Number of UTF-8 BOM bytes matched so far; 0xff marks a malformed BOM.
  unsigned bom_strip_position;
40
41
  // JV_PARSE_* mode bits.
  int flags;
42
43
  jv* stack;                   // parser
44
  int stackpos;                // parser
45
  int stacklen;                // both (optimization; it's really pathlen for streaming)
46
  jv path;                     // streamer
47
  enum last_seen last_seen;    // streamer
48
  jv output;                   // streamer
49
  jv next;                     // both
50
51
  // Characters of the token (literal or string body) being accumulated.
  char* tokenbuf;
52
  int tokenpos;
53
  int tokenlen;
54
55
  // Current position, reported in error messages.
  int line, column;
56
57
  struct dtoa_context dtoa;
58
59
  // Lexer state.
  enum {
60
    JV_PARSER_NORMAL,
61
    JV_PARSER_STRING,
62
    JV_PARSER_STRING_ESCAPE,
63
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
64
  } st;
65
  // Whether the last scanned character was whitespace.
  unsigned int last_ch_was_ws:1;
66
};
67
68
69
3.16M
static void parser_init(struct jv_parser* p, int flags) {
70
3.16M
  p->flags = flags;
71
3.16M
  if ((p->flags & JV_PARSE_STREAMING)) {
72
3.88k
    p->path = jv_array();
73
3.16M
  } else {
74
3.16M
    p->path = jv_invalid();
75
3.16M
    p->flags &= ~(JV_PARSE_STREAM_ERRORS);
76
3.16M
  }
77
3.16M
  p->stack = 0;
78
3.16M
  p->stacklen = p->stackpos = 0;
79
3.16M
  p->last_seen = JV_LAST_NONE;
80
3.16M
  p->output = jv_invalid();
81
3.16M
  p->next = jv_invalid();
82
3.16M
  p->tokenbuf = 0;
83
3.16M
  p->tokenlen = p->tokenpos = 0;
84
3.16M
  if ((p->flags & JV_PARSE_SEQ))
85
285
    p->st = JV_PARSER_WAITING_FOR_RS;
86
3.16M
  else
87
3.16M
    p->st = JV_PARSER_NORMAL;
88
3.16M
  p->eof = 0;
89
3.16M
  p->curr_buf = 0;
90
3.16M
  p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
91
3.16M
  p->bom_strip_position = 0;
92
3.16M
  p->last_ch_was_ws = 0;
93
3.16M
  p->line = 1;
94
3.16M
  p->column = 0;
95
3.16M
  jvp_dtoa_context_init(&p->dtoa);
96
3.16M
}
97
98
3.40M
static void parser_reset(struct jv_parser* p) {
99
3.40M
  if ((p->flags & JV_PARSE_STREAMING)) {
100
6.07k
    jv_free(p->path);
101
6.07k
    p->path = jv_array();
102
6.07k
    p->stacklen = 0;
103
6.07k
  }
104
3.40M
  p->last_seen = JV_LAST_NONE;
105
3.40M
  jv_free(p->output);
106
3.40M
  p->output = jv_invalid();
107
3.40M
  jv_free(p->next);
108
3.40M
  p->next = jv_invalid();
109
4.73M
  for (int i=0; i<p->stackpos; i++)
110
1.32M
    jv_free(p->stack[i]);
111
3.40M
  p->stackpos = 0;
112
3.40M
  p->tokenpos = 0;
113
3.40M
  p->st = JV_PARSER_NORMAL;
114
3.40M
}
115
116
3.16M
static void parser_free(struct jv_parser* p) {
117
3.16M
  parser_reset(p);
118
3.16M
  jv_free(p->path);
119
3.16M
  jv_free(p->output);
120
3.16M
  jv_mem_free(p->stack);
121
3.16M
  jv_mem_free(p->tokenbuf);
122
3.16M
  jvp_dtoa_context_free(&p->dtoa);
123
3.16M
}
124
125
22.2M
/*
 * Record a freshly completed scalar (string, number or literal) as the
 * pending value in p->next.
 *
 * Takes ownership of val.  Returns 0 on success; if a value is already
 * pending (or, when streaming, a value was the last thing seen) val is freed
 * and an error message is returned.
 */
static pfunc value(struct jv_parser* p, jv val) {
  const int streaming = (p->flags & JV_PARSE_STREAMING) != 0;

  int missing_separator = jv_is_valid(p->next);
  if (streaming && p->last_seen == JV_LAST_VALUE)
    missing_separator = 1;

  if (missing_separator) {
    jv_free(val);
    return "Expected separator between values";
  }

  /* Inside a container remember that a value was seen; at top level don't. */
  if (streaming)
    p->last_seen = (p->stacklen > 0) ? JV_LAST_VALUE : JV_LAST_NONE;

  jv_free(p->next);
  p->next = val;
  return 0;
}
145
146
3.67M
/*
 * Push v onto the value stack, enlarging the stack array when it is full.
 * Ownership of v moves to the stack.
 */
static void push(struct jv_parser* p, jv v) {
  assert(p->stackpos <= p->stacklen);

  if (p->stackpos == p->stacklen) {
    /* Full: roughly double the capacity. */
    const int capacity = p->stacklen * 2 + 10;
    p->stack = jv_mem_realloc(p->stack, capacity * sizeof(jv));
    p->stacklen = capacity;
  }

  assert(p->stackpos < p->stacklen);
  p->stack[p->stackpos] = v;
  p->stackpos++;
}
155
156
23.7M
static pfunc parse_token(struct jv_parser* p, char ch) {
157
23.7M
  switch (ch) {
158
1.67M
  case '[':
159
1.67M
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
160
1.67M
    if (jv_is_valid(p->next)) return "Expected separator between values";
161
1.67M
    push(p, jv_array());
162
1.67M
    break;
163
164
1.92M
  case '{':
165
1.92M
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
166
1.92M
    if (jv_is_valid(p->next)) return "Expected separator between values";
167
1.92M
    push(p, jv_object());
168
1.92M
    break;
169
170
71.7k
  case ':':
171
71.7k
    if (!jv_is_valid(p->next))
172
146
      return "Expected string key before ':'";
173
71.5k
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
174
124
      return "':' not as part of an object";
175
71.4k
    if (jv_get_kind(p->next) != JV_KIND_STRING)
176
46
      return "Object keys must be strings";
177
71.4k
    push(p, p->next);
178
71.4k
    p->next = jv_invalid();
179
71.4k
    break;
180
181
17.7M
  case ',':
182
17.7M
    if (!jv_is_valid(p->next))
183
273
      return "Expected value before ','";
184
17.7M
    if (p->stackpos == 0)
185
0
      return "',' not as part of an object or array";
186
17.7M
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
187
17.7M
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
188
17.7M
      p->next = jv_invalid();
189
17.7M
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
190
47.2k
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
191
47.2k
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
192
47.2k
                                              p->stack[p->stackpos-1], p->next);
193
47.2k
      p->stackpos--;
194
47.2k
      p->next = jv_invalid();
195
47.2k
    } else {
196
      // this case hits on input like {"a", "b"}
197
174
      return "Objects must consist of key:value pairs";
198
174
    }
199
17.7M
    break;
200
201
17.7M
  case ']':
202
918k
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
203
600
      return "Unmatched ']'";
204
918k
    if (jv_is_valid(p->next)) {
205
909k
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
206
909k
      p->next = jv_invalid();
207
909k
    } else {
208
8.48k
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
209
        // this case hits on input like [1,2,3,]
210
80
        return "Expected another array element";
211
80
      }
212
8.48k
    }
213
918k
    jv_free(p->next);
214
918k
    p->next = p->stack[--p->stackpos];
215
918k
    break;
216
217
1.37M
  case '}':
218
1.37M
    if (p->stackpos == 0)
219
344
      return "Unmatched '}'";
220
1.37M
    if (jv_is_valid(p->next)) {
221
9.69k
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
222
45
        return "Objects must consist of key:value pairs";
223
9.69k
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
224
9.65k
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
225
9.65k
                                              p->stack[p->stackpos-1], p->next);
226
9.65k
      p->stackpos--;
227
9.65k
      p->next = jv_invalid();
228
1.36M
    } else {
229
1.36M
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
230
348
        return "Unmatched '}'";
231
1.36M
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
232
29
        return "Expected another key-value pair";
233
1.36M
    }
234
1.37M
    jv_free(p->next);
235
1.37M
    p->next = p->stack[--p->stackpos];
236
1.37M
    break;
237
23.7M
  }
238
23.7M
  return 0;
239
23.7M
}
240
241
32.5M
static pfunc stream_token(struct jv_parser* p, char ch) {
242
32.5M
  jv_kind k;
243
32.5M
  jv last;
244
245
32.5M
  switch (ch) {
246
32.5M
  case '[':
247
32.5M
    if (jv_is_valid(p->next))
248
17
      return "Expected a separator between values";
249
32.5M
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
250
      // Looks like {["foo"]}
251
5
      return "Expected string key after '{', not '['";
252
32.5M
    if (p->last_seen == JV_LAST_COMMA) {
253
31
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
254
31
      k = jv_get_kind(last);
255
31
      jv_free(last);
256
31
      if (k != JV_KIND_NUMBER)
257
        // Looks like {"x":"y",["foo"]}
258
2
        return "Expected string key after ',' in object, not '['";
259
31
    }
260
32.5M
    p->path = jv_array_append(p->path, jv_number(0)); // push
261
32.5M
    p->last_seen = JV_LAST_OPEN_ARRAY;
262
32.5M
    p->stacklen++;
263
32.5M
    break;
264
265
1.82k
  case '{':
266
1.82k
    if (p->last_seen == JV_LAST_VALUE)
267
5
      return "Expected a separator between values";
268
1.82k
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
269
      // Looks like {{"foo":"bar"}}
270
11
      return "Expected string key after '{', not '{'";
271
1.80k
    if (p->last_seen == JV_LAST_COMMA) {
272
13
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
273
13
      k = jv_get_kind(last);
274
13
      jv_free(last);
275
13
      if (k != JV_KIND_NUMBER)
276
        // Looks like {"x":"y",{"foo":"bar"}}
277
2
        return "Expected string key after ',' in object, not '{'";
278
13
    }
279
    // Push object key: null, since we don't know it yet
280
1.80k
    p->path = jv_array_append(p->path, jv_null()); // push
281
1.80k
    p->last_seen = JV_LAST_OPEN_OBJECT;
282
1.80k
    p->stacklen++;
283
1.80k
    break;
284
285
7.98k
  case ':':
286
7.98k
    last = jv_invalid();
287
7.98k
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
288
25
      jv_free(last);
289
25
      return "':' not as part of an object";
290
25
    }
291
7.96k
    jv_free(last);
292
7.96k
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
293
34
      return "Expected string key before ':'";
294
7.92k
    if (jv_get_kind(p->next) != JV_KIND_STRING)
295
10
      return "Object keys must be strings";
296
7.91k
    if (p->last_seen != JV_LAST_VALUE)
297
0
      return "':' should follow a key";
298
7.91k
    p->last_seen = JV_LAST_COLON;
299
7.91k
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
300
7.91k
    p->next = jv_invalid();
301
7.91k
    break;
302
303
227
  case ',':
304
227
    if (p->last_seen != JV_LAST_VALUE)
305
18
      return "Expected value before ','";
306
209
    if (p->stacklen == 0)
307
0
      return "',' not as part of an object or array";
308
209
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
309
209
    k = jv_get_kind(last);
310
209
    if (k == JV_KIND_NUMBER) {
311
160
      int idx = jv_number_value(last);
312
313
160
      if (jv_is_valid(p->next)) {
314
128
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
315
128
        p->next = jv_invalid();
316
128
      }
317
160
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
318
160
      p->last_seen = JV_LAST_COMMA;
319
160
    } else if (k == JV_KIND_STRING) {
320
45
      if (jv_is_valid(p->next)) {
321
31
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
322
31
        p->next = jv_invalid();
323
31
      }
324
45
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
325
45
      p->last_seen = JV_LAST_COMMA;
326
45
    } else {
327
4
      assert(k == JV_KIND_NULL);
328
      // this case hits on input like {,}
329
      // make sure to handle input like {"a", "b"} and {"a":, ...}
330
4
      jv_free(last);
331
4
      return "Objects must consist of key:value pairs";
332
4
    }
333
205
    jv_free(last);
334
205
    break;
335
336
279
  case ']':
337
279
    if (p->stacklen == 0)
338
8
      return "Unmatched ']' at the top-level";
339
271
    if (p->last_seen == JV_LAST_COMMA)
340
2
      return "Expected another array element";
341
269
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
342
269
      assert(!jv_is_valid(p->next));
343
344
269
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
345
269
    k = jv_get_kind(last);
346
269
    jv_free(last);
347
348
269
    if (k != JV_KIND_NUMBER)
349
8
      return "Unmatched ']' in the middle of an object";
350
261
    if (jv_is_valid(p->next)) {
351
17
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
352
17
      p->next = jv_invalid();
353
244
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
354
4
      p->output = JV_ARRAY(jv_copy(p->path));
355
4
    }
356
357
261
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
358
    //assert(!jv_is_valid(p->next));
359
261
    jv_free(p->next);
360
261
    p->next = jv_invalid();
361
362
261
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
363
240
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves
364
365
261
    if (p->stacklen == 0)
366
26
      p->last_seen = JV_LAST_NONE;
367
235
    else
368
235
      p->last_seen = JV_LAST_VALUE;
369
261
    break;
370
371
217
  case '}':
372
217
    if (p->stacklen == 0)
373
11
      return "Unmatched '}' at the top-level";
374
206
    if (p->last_seen == JV_LAST_COMMA)
375
2
      return "Expected another key:value pair";
376
204
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
377
204
      assert(!jv_is_valid(p->next));
378
379
204
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
380
204
    k = jv_get_kind(last);
381
204
    jv_free(last);
382
204
    if (k == JV_KIND_NUMBER)
383
16
      return "Unmatched '}' in the middle of an array";
384
385
188
    if (jv_is_valid(p->next)) {
386
10
      if (k != JV_KIND_STRING)
387
6
        return "Objects must consist of key:value pairs";
388
4
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
389
4
      p->next = jv_invalid();
390
178
    } else {
391
      // Perhaps {"a":[]}
392
178
      if (p->last_seen == JV_LAST_COLON)
393
        // Looks like {"a":}
394
6
        return "Missing value in key:value pair";
395
172
      if (p->last_seen == JV_LAST_COMMA)
396
        // Looks like {"a":0,}
397
0
        return "Expected another key-value pair";
398
172
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
399
0
        return "Unmatched '}' in the middle of an array";
400
172
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
401
0
        return "Unmatched '}'";
402
172
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
403
2
        p->output = JV_ARRAY(jv_copy(p->path));
404
172
    }
405
176
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
406
176
    jv_free(p->next);
407
176
    p->next = jv_invalid();
408
409
176
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
410
170
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty arrays are leaves
411
412
176
    if (p->stacklen == 0)
413
50
      p->last_seen = JV_LAST_NONE;
414
126
    else
415
126
      p->last_seen = JV_LAST_VALUE;
416
176
    break;
417
32.5M
  }
418
32.5M
  return 0;
419
32.5M
}
420
421
413M
static void tokenadd(struct jv_parser* p, char c) {
422
413M
  assert(p->tokenpos <= p->tokenlen);
423
413M
  if (p->tokenpos >= (p->tokenlen - 1)) {
424
3.18M
    p->tokenlen = p->tokenlen*2 + 256;
425
3.18M
    p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen);
426
3.18M
  }
427
413M
  assert(p->tokenpos < p->tokenlen);
428
413M
  p->tokenbuf[p->tokenpos++] = c;
429
413M
}
430
431
4.61M
/*
 * Decode exactly four hexadecimal digits starting at hex.
 *
 * Upper- and lower-case digits are accepted.  Returns the value (0..0xFFFF),
 * or -1 as soon as a character that is not a hex digit is met.  The input is
 * only read, never modified, and need not be NUL-terminated as long as four
 * characters are readable.
 */
static int unhex4(const char* hex) {
  int result = 0;
  for (int i = 0; i < 4; i++) {
    const char c = hex[i];
    int digit;
    if ('0' <= c && c <= '9')
      digit = c - '0';
    else if ('a' <= c && c <= 'f')
      digit = c - 'a' + 10;
    else if ('A' <= c && c <= 'F')
      digit = c - 'A' + 10;
    else
      return -1;
    result = (result << 4) | digit;
  }
  return result;
}
445
446
417k
static pfunc found_string(struct jv_parser* p) {
447
417k
  char* in = p->tokenbuf;
448
417k
  char* out = p->tokenbuf;
449
417k
  char* end = p->tokenbuf + p->tokenpos;
450
451
104M
  while (in < end) {
452
104M
    char c = *in++;
453
104M
    if (c == '\\') {
454
4.88M
      if (in >= end)
455
0
        return "Expected escape character at end of string";
456
4.88M
      c = *in++;
457
4.88M
      switch (c) {
458
68.0k
      case '\\':
459
70.5k
      case '"':
460
72.2k
      case '/': *out++ = c;    break;
461
3.81k
      case 'b': *out++ = '\b'; break;
462
2.11k
      case 'f': *out++ = '\f'; break;
463
4.11k
      case 't': *out++ = '\t'; break;
464
3.37k
      case 'n': *out++ = '\n'; break;
465
2.16k
      case 'r': *out++ = '\r'; break;
466
467
4.60M
      case 'u':
468
        /* ahh, the complicated case */
469
4.60M
        if (in + 4 > end)
470
1.00k
          return "Invalid \\uXXXX escape";
471
4.60M
        int hexvalue = unhex4(in);
472
4.60M
        if (hexvalue < 0)
473
1.67k
          return "Invalid characters in \\uXXXX escape";
474
4.60M
        unsigned long codepoint = (unsigned long)hexvalue;
475
4.60M
        in += 4;
476
4.60M
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
477
          /* who thought UTF-16 surrogate pairs were a good idea? */
478
6.84k
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
479
1.24k
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
480
5.60k
          unsigned long surrogate = unhex4(in+2);
481
5.60k
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
482
2.14k
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
483
3.45k
          in += 6;
484
3.45k
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
485
3.45k
                                 |(surrogate - 0xDC00));
486
3.45k
        }
487
4.60M
        if (codepoint > 0x10FFFF)
488
0
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
489
4.60M
        out += jvp_utf8_encode(codepoint, out);
490
4.60M
        break;
491
492
190k
      default:
493
190k
        return "Invalid escape";
494
4.88M
      }
495
99.8M
    } else {
496
99.8M
      if (!(c & ~0x1F))
497
208
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
498
99.8M
      *out++ = c;
499
99.8M
    }
500
104M
  }
501
221k
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
502
221k
  p->tokenpos = 0;
503
221k
  return 0;
504
221k
}
505
506
59.8M
static pfunc check_literal(struct jv_parser* p) {
507
59.8M
  if (p->tokenpos == 0) return 0;
508
509
22.0M
  const char* pattern = 0;
510
22.0M
  int plen;
511
22.0M
  jv v;
512
22.0M
  switch (p->tokenbuf[0]) {
513
1.87k
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
514
89.5k
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
515
172
  case '\'':
516
172
    return "Invalid string literal; expected \", but got '";
517
8.67k
  case 'n':
518
    // if it starts with 'n', it could be a literal "nan"
519
8.67k
    if (p->tokenpos > 1 && p->tokenbuf[1] == 'u') {
520
3.19k
      pattern = "null"; plen = 4; v = jv_null();
521
3.19k
    }
522
22.0M
  }
523
22.0M
  if (pattern) {
524
94.6k
    if (p->tokenpos != plen) return "Invalid literal";
525
557k
    for (int i=0; i<plen; i++)
526
464k
      if (p->tokenbuf[i] != pattern[i])
527
338
        return "Invalid literal";
528
93.5k
    TRY(value(p, v));
529
21.9M
  } else {
530
    // FIXME: better parser
531
21.9M
    p->tokenbuf[p->tokenpos] = 0;
532
21.9M
#ifdef USE_DECNUM
533
21.9M
    jv number = jv_number_with_literal(p->tokenbuf);
534
21.9M
    if (jv_get_kind(number) == JV_KIND_INVALID) {
535
8.29k
      return "Invalid numeric literal";
536
8.29k
    }
537
21.9M
    TRY(value(p, number));
538
#else
539
    char *end = 0;
540
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
541
    if (end == 0 || *end != 0) {
542
      return "Invalid numeric literal";
543
    }
544
    TRY(value(p, jv_number(d)));
545
#endif
546
21.9M
  }
547
22.0M
  p->tokenpos = 0;
548
22.0M
  return 0;
549
22.0M
}
550
551
/* Character classes distinguished by the scanner outside of strings. */
typedef enum {
  LITERAL,
  WHITESPACE,
  STRUCTURE,
  QUOTE,
  INVALID
} chclass;

/*
 * Classify one input character.  Anything that is not whitespace, a double
 * quote or one of the six structural characters counts as part of a literal.
 */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;

  if (c == '"')
    return QUOTE;

  if (c == '[' || c == ',' || c == ']' ||
      c == '{' || c == ':' || c == '}')
    return STRUCTURE;

  return LITERAL;
}
579
580
581
/*
 * Sentinel returned by scan() when a complete output is available.  It is a
 * non-null presult, so TRY-style checks treat it like a message; callers tell
 * it apart from real errors by comparing the pointer against OK.
 */
static const presult OK = "output produced";
582
583
245M
/*
 * Non-streaming mode: if a complete top-level value is pending, move it into
 * *out (ownership passes to the caller) and return 1; otherwise return 0 and
 * leave *out alone.
 */
static int parse_check_done(struct jv_parser* p, jv* out) {
  if (p->stackpos != 0 || !jv_is_valid(p->next))
    return 0;

  *out = p->next;
  p->next = jv_invalid();
  return 1;
}
592
593
141M
/*
 * Streaming mode: hand the next event, if one is ready, to the caller.
 *
 * A pending top-level scalar is emitted as [path, value].  Otherwise a queued
 * p->output is emitted; when it has more than two elements only its first two
 * are emitted now and a one-element remainder is kept for the next call.
 * Returns 1 when *out was set, 0 otherwise.
 */
static int stream_check_done(struct jv_parser* p, jv* out) {
  /* Top-level scalar. */
  if (p->stacklen == 0 && jv_is_valid(p->next)) {
    *out = JV_ARRAY(jv_copy(p->path),p->next);
    p->next = jv_invalid();
    return 1;
  }

  if (!jv_is_valid(p->output))
    return 0;

  if (jv_array_length(jv_copy(p->output)) > 2) {
    // At end of an array or object, necessitating one more output by
    // which to indicate this
    *out = jv_array_slice(jv_copy(p->output), 0, 2);
    p->output = jv_array_slice(p->output, 0, 1);      // arrange one more output
  } else {
    // No further processing needed
    *out = p->output;
    p->output = jv_invalid();
  }
  return 1;
}
614
615
40
static int seq_check_truncation(struct jv_parser* p) {
616
40
  return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER));
617
40
}
618
619
48
static int stream_seq_check_truncation(struct jv_parser* p) {
620
48
  jv_kind k = jv_get_kind(p->next);
621
48
  return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL);
622
48
}
623
624
14
static int parse_is_top_num(struct jv_parser* p) {
625
14
  return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
626
14
}
627
628
18
static int stream_is_top_num(struct jv_parser* p) {
629
18
  return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
630
18
}
631
632
/*
 * The four macros below pick the streaming or the non-streaming variant of a
 * helper according to JV_PARSE_STREAMING in (p)->flags.  Each evaluates p
 * more than once, so p must be free of side effects.
 */
/* Fetch a finished output into *(o); non-zero when one was produced. */
#define check_done(p, o) \
633
386M
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))
634
635
/* Process the structural character ch; yields a presult. */
#define token(p, ch) \
636
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))
637
638
/* Would an RS at this point truncate the value being read? */
#define check_truncation(p) \
639
88
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))
640
641
/* Is the pending value a top-level number? */
#define is_top_num(p) \
642
32
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
643
644
470M
/*
 * Feed one input character to the parser.
 *
 * Returns 0 when nothing is ready yet, OK when an output was stored in *out,
 * and any other message on a parse error.  Line/column tracking is updated
 * for every character.
 */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  p->column++;
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  }

  /* Record separator in sequence mode: the current text ends here. */
  if ((p->flags & JV_PARSE_SEQ)
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
    if (check_truncation(p)) {
      if (check_literal(p) == 0 && is_top_num(p))
        return "Potentially truncated top-level numeric value";
      return "Truncated value";
    }
    TRY(check_literal(p));
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
      return OK;
    // shouldn't happen?
    assert(!jv_is_valid(*out));
    parser_reset(p);
    jv_free(*out);
    *out = jv_invalid();
    return OK;
  }

  presult answer = 0;
  p->last_ch_was_ws = 0;

  if (p->st != JV_PARSER_NORMAL) {
    /* Inside a string (possibly right after a backslash). */
    if (ch == '"' && p->st == JV_PARSER_STRING) {
      /* Unescaped closing quote: the string is complete. */
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      if (check_done(p, out)) answer = OK;
      return answer;
    }

    tokenadd(p, ch);
    /* A backslash escapes exactly the one character that follows it. */
    if (ch == '\\' && p->st == JV_PARSER_STRING)
      p->st = JV_PARSER_STRING_ESCAPE;
    else
      p->st = JV_PARSER_STRING;
    return answer;
  }

  /* Outside of strings. */
  chclass cls = classify(ch);
  if (cls == WHITESPACE)
    p->last_ch_was_ws = 1;

  if (cls != LITERAL) {
    /* Any non-literal character terminates a pending literal token. */
    TRY(check_literal(p));
    if (check_done(p, out)) answer = OK;
  }

  switch (cls) {
  case LITERAL:
    tokenadd(p, ch);
    break;
  case WHITESPACE:
    break;
  case QUOTE:
    p->st = JV_PARSER_STRING;
    break;
  case STRUCTURE:
    TRY(token(p, ch));
    break;
  case INVALID:
    return "Invalid character";
  }

  if (check_done(p, out)) answer = OK;
  return answer;
}
709
710
3.32k
struct jv_parser* jv_parser_new(int flags) {
711
3.32k
  struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
712
3.32k
  parser_init(p, flags);
713
3.32k
  p->flags = flags;
714
3.32k
  return p;
715
3.32k
}
716
717
3.32k
/*
 * Destroy a parser obtained from jv_parser_new(): release everything it owns
 * and then the structure itself.  Passing a null pointer is a no-op.
 */
void jv_parser_free(struct jv_parser* p) {
  if (!p)
    return;
  parser_free(p);
  jv_mem_free(p);
}
721
722
/* The UTF-8 byte order mark that jv_parser_set_buf() strips from the input. */
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
723
724
0
int jv_parser_remaining(struct jv_parser* p) {
725
0
  if (p->curr_buf == 0)
726
0
    return 0;
727
0
  return (p->curr_buf_length - p->curr_buf_pos);
728
0
}
729
730
3.18M
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
731
3.18M
  assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
732
3.18M
         && "previous buffer not exhausted");
733
6.35M
  while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) {
734
3.16M
    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
735
      // matched a BOM character
736
564
      buf++;
737
564
      length--;
738
564
      p->bom_strip_position++;
739
3.16M
    } else {
740
3.16M
      if (p->bom_strip_position == 0) {
741
        // no BOM in this document
742
3.16M
        p->bom_strip_position = sizeof(UTF8_BOM);
743
3.16M
      } else {
744
        // malformed BOM (prefix present, rest missing)
745
488
        p->bom_strip_position = 0xff;
746
488
      }
747
3.16M
    }
748
3.16M
  }
749
3.18M
  p->curr_buf = buf;
750
3.18M
  p->curr_buf_length = length;
751
3.18M
  p->curr_buf_pos = 0;
752
3.18M
  p->curr_buf_is_partial = is_partial;
753
3.18M
}
754
755
static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3);
756
757
239k
static jv make_error(struct jv_parser* p, const char *fmt, ...) {
758
239k
  va_list ap;
759
239k
  va_start(ap, fmt);
760
239k
  jv e = jv_string_vfmt(fmt, ap);
761
239k
  va_end(ap);
762
239k
  if ((p->flags & JV_PARSE_STREAM_ERRORS))
763
519
    return JV_ARRAY(e, jv_copy(p->path));
764
239k
  return jv_invalid_with_msg(e);
765
239k
}
766
767
8.93M
jv jv_parser_next(struct jv_parser* p) {
768
8.93M
  if (p->eof)
769
2.89M
    return jv_invalid();
770
6.03M
  if (!p->curr_buf)
771
46
    return jv_invalid(); // Need a buffer
772
6.03M
  if (p->bom_strip_position == 0xff) {
773
522
    if (!(p->flags & JV_PARSE_SEQ))
774
406
      return jv_invalid_with_msg(jv_string("Malformed BOM"));
775
116
    p->st =JV_PARSER_WAITING_FOR_RS;
776
116
    parser_reset(p);
777
116
  }
778
6.03M
  jv value = jv_invalid();
779
6.03M
  if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value))
780
8
    return value;
781
6.03M
  char ch;
782
6.03M
  presult msg = 0;
783
479M
  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
784
473M
    ch = p->curr_buf[p->curr_buf_pos++];
785
473M
    if (p->st == JV_PARSER_WAITING_FOR_RS) {
786
3.14M
      if (ch == '\n') {
787
228
        p->line++;
788
228
        p->column = 0;
789
3.14M
      } else {
790
3.14M
        p->column++;
791
3.14M
      }
792
3.14M
      if (ch == '\036')
793
117
        p->st = JV_PARSER_NORMAL;
794
3.14M
      continue; // need to resync, wait for RS
795
3.14M
    }
796
470M
    msg = scan(p, ch, &value);
797
470M
  }
798
6.03M
  if (msg == OK) {
799
2.85M
    return value;
800
3.18M
  } else if (msg) {
801
204k
    jv_free(value);
802
204k
    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
803
      // Skip to the next RS
804
128
      p->st = JV_PARSER_WAITING_FOR_RS;
805
128
      value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column);
806
128
      parser_reset(p);
807
128
      return value;
808
128
    }
809
204k
    value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column);
810
204k
    parser_reset(p);
811
204k
    if (!(p->flags & JV_PARSE_SEQ)) {
812
      // We're not parsing a JSON text sequence; throw this buffer away.
813
      // XXX We should fail permanently here.
814
204k
      p->curr_buf = 0;
815
204k
      p->curr_buf_pos = 0;
816
204k
    } // Else ch must be RS; don't clear buf so we can start parsing again after this ch
817
204k
    return value;
818
2.97M
  } else if (p->curr_buf_is_partial) {
819
15.1k
    assert(p->curr_buf_pos == p->curr_buf_length);
820
    // need another buffer
821
15.1k
    return jv_invalid();
822
2.96M
  } else {
823
    // at EOF
824
2.96M
    p->eof = 1;
825
2.96M
    assert(p->curr_buf_pos == p->curr_buf_length);
826
2.96M
    jv_free(value);
827
2.96M
    if (p->st == JV_PARSER_WAITING_FOR_RS)
828
86
      return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column);
829
2.96M
    if (p->st != JV_PARSER_NORMAL) {
830
27.7k
      value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column);
831
27.7k
      parser_reset(p);
832
27.7k
      p->st = JV_PARSER_WAITING_FOR_RS;
833
27.7k
      return value;
834
27.7k
    }
835
2.93M
    if ((msg = check_literal(p))) {
836
4.22k
      value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column);
837
4.22k
      parser_reset(p);
838
4.22k
      p->st = JV_PARSER_WAITING_FOR_RS;
839
4.22k
      return value;
840
4.22k
    }
841
2.93M
    if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) ||
842
2.93M
        (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) {
843
3.17k
      value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column);
844
3.17k
      parser_reset(p);
845
3.17k
      p->st = JV_PARSER_WAITING_FOR_RS;
846
3.17k
      return value;
847
3.17k
    }
848
    // p->next is either invalid (nothing here, but no syntax error)
849
    // or valid (this is the value). either way it's the thing to return
850
2.92M
    if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) {
851
1.33k
      value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value]
852
2.92M
    } else {
853
2.92M
      value = p->next;
854
2.92M
    }
855
2.92M
    p->next = jv_invalid();
856
2.92M
    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
857
3
      jv_free(value);
858
3
      return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column);
859
3
    }
860
2.92M
    return value;
861
2.92M
  }
862
6.03M
}
863
864
3.16M
jv jv_parse_sized_custom_flags(const char* string, int length, int flags) {
865
3.16M
  struct jv_parser parser;
866
3.16M
  parser_init(&parser, flags);
867
3.16M
  jv_parser_set_buf(&parser, string, length, 0);
868
3.16M
  jv value = jv_parser_next(&parser);
869
3.16M
  if (jv_is_valid(value)) {
870
2.92M
    jv next = jv_parser_next(&parser);
871
2.92M
    if (jv_is_valid(next)) {
872
      // multiple JSON values, we only wanted one
873
871
      jv_free(value);
874
871
      jv_free(next);
875
871
      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
876
2.92M
    } else if (jv_invalid_has_msg(jv_copy(next))) {
877
      // parser error after the first JSON value
878
2.13k
      jv_free(value);
879
2.13k
      value = next;
880
2.92M
    } else {
881
      // a single valid JSON value
882
2.92M
      jv_free(next);
883
2.92M
    }
884
2.92M
  } else if (jv_invalid_has_msg(jv_copy(value))) {
885
    // parse error, we'll return it
886
235k
  } else {
887
    // no value at all
888
2.84k
    jv_free(value);
889
2.84k
    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
890
2.84k
  }
891
3.16M
  parser_free(&parser);
892
893
3.16M
  if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
894
241k
    jv msg = jv_invalid_get_msg(value);
895
241k
    value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
896
241k
                                              jv_string_value(msg),
897
241k
                                              string));
898
241k
    jv_free(msg);
899
241k
  }
900
3.16M
  return value;
901
3.16M
}
902
903
3.15M
/* Parse one JSON value from the first length bytes of string, default flags. */
jv jv_parse_sized(const char* string, int length) {
  const int default_flags = 0;
  return jv_parse_sized_custom_flags(string, length, default_flags);
}
906
907
47.7k
/* Parse one JSON value from a NUL-terminated string, default flags. */
jv jv_parse(const char* string) {
  int length = strlen(string);
  return jv_parse_sized(string, length);
}
910
911
6.87k
/* Parse one JSON value from a NUL-terminated string with the given flags. */
jv jv_parse_custom_flags(const char* string, int flags) {
  int length = strlen(string);
  return jv_parse_sized_custom_flags(string, length, flags);
}