Coverage Report

Created: 2026-03-14 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jq/src/jv_parse.c
Line
Count
Source
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <string.h>
4
#include <assert.h>
5
#include "jv.h"
6
#include "jv_dtoa.h"
7
#include "jv_unicode.h"
8
#include "jv_alloc.h"
9
#include "jv_dtoa.h"
10
11
typedef const char* presult;
12
13
#ifndef MAX_PARSING_DEPTH
14
0
#define MAX_PARSING_DEPTH (10000)
15
#endif
16
17
241k
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
18
#ifdef __GNUC__
19
#define pfunc __attribute__((warn_unused_result)) presult
20
#else
21
#define pfunc presult
22
#endif
23
24
enum last_seen {
25
  JV_LAST_NONE = 0,
26
  JV_LAST_OPEN_ARRAY = '[',
27
  JV_LAST_OPEN_OBJECT = '{',
28
  JV_LAST_COLON = ':',
29
  JV_LAST_COMMA = ',',
30
  JV_LAST_VALUE = 'V',
31
};
32
33
struct jv_parser {
34
  const char* curr_buf;
35
  int curr_buf_length;
36
  int curr_buf_pos;
37
  int curr_buf_is_partial;
38
  int eof;
39
  unsigned bom_strip_position;
40
41
  int flags;
42
43
  jv* stack;                   // parser
44
  int stackpos;                // parser
45
  int stacklen;                // both (optimization; it's really pathlen for streaming)
46
  jv path;                     // streamer
47
  enum last_seen last_seen;    // streamer
48
  jv output;                   // streamer
49
  jv next;                     // both
50
51
  char* tokenbuf;
52
  int tokenpos;
53
  int tokenlen;
54
55
  int line, column;
56
57
  struct dtoa_context dtoa;
58
59
  enum {
60
    JV_PARSER_NORMAL,
61
    JV_PARSER_STRING,
62
    JV_PARSER_STRING_ESCAPE,
63
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
64
  } st;
65
  unsigned int last_ch_was_ws:1;
66
};
67
68
69
214k
static void parser_init(struct jv_parser* p, int flags) {
70
214k
  p->flags = flags;
71
214k
  if ((p->flags & JV_PARSE_STREAMING)) {
72
0
    p->path = jv_array();
73
214k
  } else {
74
214k
    p->path = jv_invalid();
75
214k
    p->flags &= ~(JV_PARSE_STREAM_ERRORS);
76
214k
  }
77
214k
  p->stack = 0;
78
214k
  p->stacklen = p->stackpos = 0;
79
214k
  p->last_seen = JV_LAST_NONE;
80
214k
  p->output = jv_invalid();
81
214k
  p->next = jv_invalid();
82
214k
  p->tokenlen = 256;
83
214k
  p->tokenbuf = jv_mem_alloc(p->tokenlen);
84
214k
  p->tokenpos = 0;
85
214k
  if ((p->flags & JV_PARSE_SEQ))
86
0
    p->st = JV_PARSER_WAITING_FOR_RS;
87
214k
  else
88
214k
    p->st = JV_PARSER_NORMAL;
89
214k
  p->eof = 0;
90
214k
  p->curr_buf = 0;
91
214k
  p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
92
214k
  p->bom_strip_position = 0;
93
214k
  p->last_ch_was_ws = 0;
94
214k
  p->line = 1;
95
214k
  p->column = 0;
96
214k
  jvp_dtoa_context_init(&p->dtoa);
97
214k
}
98
99
240k
static void parser_reset(struct jv_parser* p) {
100
240k
  if ((p->flags & JV_PARSE_STREAMING)) {
101
0
    jv_free(p->path);
102
0
    p->path = jv_array();
103
0
    p->stacklen = 0;
104
0
  }
105
240k
  p->last_seen = JV_LAST_NONE;
106
240k
  jv_free(p->output);
107
240k
  p->output = jv_invalid();
108
240k
  jv_free(p->next);
109
240k
  p->next = jv_invalid();
110
240k
  for (int i=0; i<p->stackpos; i++)
111
0
    jv_free(p->stack[i]);
112
240k
  p->stackpos = 0;
113
240k
  p->tokenpos = 0;
114
240k
  p->st = JV_PARSER_NORMAL;
115
240k
}
116
117
214k
static void parser_free(struct jv_parser* p) {
118
214k
  parser_reset(p);
119
214k
  jv_free(p->path);
120
214k
  jv_free(p->output);
121
214k
  jv_mem_free(p->stack);
122
214k
  jv_mem_free(p->tokenbuf);
123
214k
  jvp_dtoa_context_free(&p->dtoa);
124
214k
}
125
126
188k
static pfunc value(struct jv_parser* p, jv val) {
127
188k
  if ((p->flags & JV_PARSE_STREAMING)) {
128
0
    if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) {
129
0
      jv_free(val);
130
0
      return "Expected separator between values";
131
0
    }
132
0
    if (p->stacklen > 0)
133
0
      p->last_seen = JV_LAST_VALUE;
134
0
    else
135
0
      p->last_seen = JV_LAST_NONE;
136
188k
  } else {
137
188k
    if (jv_is_valid(p->next)) {
138
0
      jv_free(val);
139
0
      return "Expected separator between values";
140
0
    }
141
188k
  }
142
188k
  jv_free(p->next);
143
188k
  p->next = val;
144
188k
  return 0;
145
188k
}
146
147
0
static void push(struct jv_parser* p, jv v) {
148
0
  assert(p->stackpos <= p->stacklen);
149
0
  if (p->stackpos == p->stacklen) {
150
0
    p->stacklen = p->stacklen * 2 + 10;
151
0
    p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv));
152
0
  }
153
0
  assert(p->stackpos < p->stacklen);
154
0
  p->stack[p->stackpos++] = v;
155
0
}
156
157
0
static pfunc parse_token(struct jv_parser* p, char ch) {
158
0
  switch (ch) {
159
0
  case '[':
160
0
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
161
0
    if (jv_is_valid(p->next)) return "Expected separator between values";
162
0
    push(p, jv_array());
163
0
    break;
164
165
0
  case '{':
166
0
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
167
0
    if (jv_is_valid(p->next)) return "Expected separator between values";
168
0
    push(p, jv_object());
169
0
    break;
170
171
0
  case ':':
172
0
    if (!jv_is_valid(p->next))
173
0
      return "Expected string key before ':'";
174
0
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
175
0
      return "':' not as part of an object";
176
0
    if (jv_get_kind(p->next) != JV_KIND_STRING)
177
0
      return "Object keys must be strings";
178
0
    push(p, p->next);
179
0
    p->next = jv_invalid();
180
0
    break;
181
182
0
  case ',':
183
0
    if (!jv_is_valid(p->next))
184
0
      return "Expected value before ','";
185
0
    if (p->stackpos == 0)
186
0
      return "',' not as part of an object or array";
187
0
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
188
0
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
189
0
      p->next = jv_invalid();
190
0
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
191
0
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
192
0
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
193
0
                                              p->stack[p->stackpos-1], p->next);
194
0
      p->stackpos--;
195
0
      p->next = jv_invalid();
196
0
    } else {
197
      // this case hits on input like {"a", "b"}
198
0
      return "Objects must consist of key:value pairs";
199
0
    }
200
0
    break;
201
202
0
  case ']':
203
0
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
204
0
      return "Unmatched ']'";
205
0
    if (jv_is_valid(p->next)) {
206
0
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
207
0
      p->next = jv_invalid();
208
0
    } else {
209
0
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
210
        // this case hits on input like [1,2,3,]
211
0
        return "Expected another array element";
212
0
      }
213
0
    }
214
0
    jv_free(p->next);
215
0
    p->next = p->stack[--p->stackpos];
216
0
    break;
217
218
0
  case '}':
219
0
    if (p->stackpos == 0)
220
0
      return "Unmatched '}'";
221
0
    if (jv_is_valid(p->next)) {
222
0
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
223
0
        return "Objects must consist of key:value pairs";
224
0
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
225
0
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
226
0
                                              p->stack[p->stackpos-1], p->next);
227
0
      p->stackpos--;
228
0
      p->next = jv_invalid();
229
0
    } else {
230
0
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
231
0
        return "Unmatched '}'";
232
0
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
233
0
        return "Expected another key-value pair";
234
0
    }
235
0
    jv_free(p->next);
236
0
    p->next = p->stack[--p->stackpos];
237
0
    break;
238
0
  }
239
0
  return 0;
240
0
}
241
242
0
static pfunc stream_token(struct jv_parser* p, char ch) {
243
0
  jv_kind k;
244
0
  jv last;
245
246
0
  switch (ch) {
247
0
  case '[':
248
0
    if (jv_is_valid(p->next))
249
0
      return "Expected a separator between values";
250
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
251
      // Looks like {["foo"]}
252
0
      return "Expected string key after '{', not '['";
253
0
    if (p->last_seen == JV_LAST_COMMA) {
254
0
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
255
0
      k = jv_get_kind(last);
256
0
      jv_free(last);
257
0
      if (k != JV_KIND_NUMBER)
258
        // Looks like {"x":"y",["foo"]}
259
0
        return "Expected string key after ',' in object, not '['";
260
0
    }
261
0
    p->path = jv_array_append(p->path, jv_number(0)); // push
262
0
    p->last_seen = JV_LAST_OPEN_ARRAY;
263
0
    p->stacklen++;
264
0
    break;
265
266
0
  case '{':
267
0
    if (p->last_seen == JV_LAST_VALUE)
268
0
      return "Expected a separator between values";
269
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
270
      // Looks like {{"foo":"bar"}}
271
0
      return "Expected string key after '{', not '{'";
272
0
    if (p->last_seen == JV_LAST_COMMA) {
273
0
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
274
0
      k = jv_get_kind(last);
275
0
      jv_free(last);
276
0
      if (k != JV_KIND_NUMBER)
277
        // Looks like {"x":"y",{"foo":"bar"}}
278
0
        return "Expected string key after ',' in object, not '{'";
279
0
    }
280
    // Push object key: null, since we don't know it yet
281
0
    p->path = jv_array_append(p->path, jv_null()); // push
282
0
    p->last_seen = JV_LAST_OPEN_OBJECT;
283
0
    p->stacklen++;
284
0
    break;
285
286
0
  case ':':
287
0
    last = jv_invalid();
288
0
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
289
0
      jv_free(last);
290
0
      return "':' not as part of an object";
291
0
    }
292
0
    jv_free(last);
293
0
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
294
0
      return "Expected string key before ':'";
295
0
    if (jv_get_kind(p->next) != JV_KIND_STRING)
296
0
      return "Object keys must be strings";
297
0
    if (p->last_seen != JV_LAST_VALUE)
298
0
      return "':' should follow a key";
299
0
    p->last_seen = JV_LAST_COLON;
300
0
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
301
0
    p->next = jv_invalid();
302
0
    break;
303
304
0
  case ',':
305
0
    if (p->last_seen != JV_LAST_VALUE)
306
0
      return "Expected value before ','";
307
0
    if (p->stacklen == 0)
308
0
      return "',' not as part of an object or array";
309
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
310
0
    k = jv_get_kind(last);
311
0
    if (k == JV_KIND_NUMBER) {
312
0
      int idx = jv_number_value(last);
313
314
0
      if (jv_is_valid(p->next)) {
315
0
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
316
0
        p->next = jv_invalid();
317
0
      }
318
0
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
319
0
      p->last_seen = JV_LAST_COMMA;
320
0
    } else if (k == JV_KIND_STRING) {
321
0
      if (jv_is_valid(p->next)) {
322
0
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
323
0
        p->next = jv_invalid();
324
0
      }
325
0
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
326
0
      p->last_seen = JV_LAST_COMMA;
327
0
    } else {
328
0
      assert(k == JV_KIND_NULL);
329
      // this case hits on input like {,}
330
      // make sure to handle input like {"a", "b"} and {"a":, ...}
331
0
      jv_free(last);
332
0
      return "Objects must consist of key:value pairs";
333
0
    }
334
0
    jv_free(last);
335
0
    break;
336
337
0
  case ']':
338
0
    if (p->stacklen == 0)
339
0
      return "Unmatched ']' at the top-level";
340
0
    if (p->last_seen == JV_LAST_COMMA)
341
0
      return "Expected another array element";
342
0
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
343
0
      assert(!jv_is_valid(p->next));
344
345
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
346
0
    k = jv_get_kind(last);
347
0
    jv_free(last);
348
349
0
    if (k != JV_KIND_NUMBER)
350
0
      return "Unmatched ']' in the middle of an object";
351
0
    if (jv_is_valid(p->next)) {
352
0
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
353
0
      p->next = jv_invalid();
354
0
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
355
0
      p->output = JV_ARRAY(jv_copy(p->path));
356
0
    }
357
358
0
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
359
    //assert(!jv_is_valid(p->next));
360
0
    jv_free(p->next);
361
0
    p->next = jv_invalid();
362
363
0
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
364
0
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves
365
366
0
    if (p->stacklen == 0)
367
0
      p->last_seen = JV_LAST_NONE;
368
0
    else
369
0
      p->last_seen = JV_LAST_VALUE;
370
0
    break;
371
372
0
  case '}':
373
0
    if (p->stacklen == 0)
374
0
      return "Unmatched '}' at the top-level";
375
0
    if (p->last_seen == JV_LAST_COMMA)
376
0
      return "Expected another key:value pair";
377
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
378
0
      assert(!jv_is_valid(p->next));
379
380
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
381
0
    k = jv_get_kind(last);
382
0
    jv_free(last);
383
0
    if (k == JV_KIND_NUMBER)
384
0
      return "Unmatched '}' in the middle of an array";
385
386
0
    if (jv_is_valid(p->next)) {
387
0
      if (k != JV_KIND_STRING)
388
0
        return "Objects must consist of key:value pairs";
389
0
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
390
0
      p->next = jv_invalid();
391
0
    } else {
392
      // Perhaps {"a":[]}
393
0
      if (p->last_seen == JV_LAST_COLON)
394
        // Looks like {"a":}
395
0
        return "Missing value in key:value pair";
396
0
      if (p->last_seen == JV_LAST_COMMA)
397
        // Looks like {"a":0,}
398
0
        return "Expected another key-value pair";
399
0
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
400
0
        return "Unmatched '}' in the middle of an array";
401
0
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
402
0
        return "Unmatched '}'";
403
0
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
404
0
        p->output = JV_ARRAY(jv_copy(p->path));
405
0
    }
406
0
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
407
0
    jv_free(p->next);
408
0
    p->next = jv_invalid();
409
410
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
411
0
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty arrays are leaves
412
413
0
    if (p->stacklen == 0)
414
0
      p->last_seen = JV_LAST_NONE;
415
0
    else
416
0
      p->last_seen = JV_LAST_VALUE;
417
0
    break;
418
0
  }
419
0
  return 0;
420
0
}
421
422
24.1M
static void tokenadd(struct jv_parser* p, char c) {
423
24.1M
  assert(p->tokenpos <= p->tokenlen);
424
24.1M
  if (p->tokenpos >= (p->tokenlen - 1)) {
425
1.06k
    p->tokenlen = p->tokenlen*2 + 256;
426
1.06k
    p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen);
427
1.06k
  }
428
24.1M
  assert(p->tokenpos < p->tokenlen);
429
24.1M
  p->tokenbuf[p->tokenpos++] = c;
430
24.1M
}
431
432
153
static int unhex4(char* hex) {
433
153
  int r = 0;
434
433
  for (int i=0; i<4; i++) {
435
399
    char c = *hex++;
436
399
    int n;
437
399
    if ('0' <= c && c <= '9') n = c - '0';
438
214
    else if ('a' <= c && c <= 'f') n = c - 'a' + 10;
439
119
    else if ('A' <= c && c <= 'F') n = c - 'A' + 10;
440
119
    else return -1;
441
280
    r <<= 4;
442
280
    r |= n;
443
280
  }
444
34
  return r;
445
153
}
446
447
26.0k
static pfunc found_string(struct jv_parser* p) {
448
26.0k
  char* in = p->tokenbuf;
449
26.0k
  char* out = p->tokenbuf;
450
26.0k
  char* end = p->tokenbuf + p->tokenpos;
451
452
26.2k
  while (in < end) {
453
26.0k
    char c = *in++;
454
26.0k
    if (c == '\\') {
455
26.0k
      if (in >= end)
456
0
        return "Expected escape character at end of string";
457
26.0k
      c = *in++;
458
26.0k
      switch (c) {
459
117
      case '\\':
460
131
      case '"':
461
155
      case '/': *out++ = c;    break;
462
0
      case 'b': *out++ = '\b'; break;
463
0
      case 'f': *out++ = '\f'; break;
464
0
      case 't': *out++ = '\t'; break;
465
0
      case 'n': *out++ = '\n'; break;
466
5
      case 'r': *out++ = '\r'; break;
467
468
198
      case 'u':
469
        /* ahh, the complicated case */
470
198
        if (in + 4 > end)
471
62
          return "Invalid \\uXXXX escape";
472
136
        int hexvalue = unhex4(in);
473
136
        if (hexvalue < 0)
474
102
          return "Invalid characters in \\uXXXX escape";
475
34
        unsigned long codepoint = (unsigned long)hexvalue;
476
34
        in += 4;
477
34
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
478
          /* who thought UTF-16 surrogate pairs were a good idea? */
479
17
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
480
0
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
481
17
          unsigned long surrogate = unhex4(in+2);
482
17
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
483
17
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
484
0
          in += 6;
485
0
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
486
0
                                 |(surrogate - 0xDC00));
487
0
        }
488
17
        if (codepoint > 0x10FFFF)
489
0
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
490
17
        out += jvp_utf8_encode(codepoint, out);
491
17
        break;
492
493
25.7k
      default:
494
25.7k
        return "Invalid escape";
495
26.0k
      }
496
26.0k
    } else {
497
0
      if (!(c & ~0x1F))
498
0
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
499
0
      *out++ = c;
500
0
    }
501
26.0k
  }
502
163
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
503
163
  p->tokenpos = 0;
504
163
  return 0;
505
163
}
506
507
214k
static pfunc check_literal(struct jv_parser* p) {
508
214k
  if (p->tokenpos == 0) return 0;
509
510
188k
  const char* pattern = 0;
511
188k
  int plen;
512
188k
  jv v;
513
188k
  switch (p->tokenbuf[0]) {
514
0
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
515
0
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
516
0
  case '\'':
517
0
    return "Invalid string literal; expected \", but got '";
518
0
  case 'n':
519
    // if it starts with 'n', it could be a literal "nan"
520
0
    if (p->tokenpos > 1 && p->tokenbuf[1] == 'u') {
521
0
      pattern = "null"; plen = 4; v = jv_null();
522
0
    }
523
188k
  }
524
188k
  if (pattern) {
525
0
    if (p->tokenpos != plen) return "Invalid literal";
526
0
    for (int i=0; i<plen; i++)
527
0
      if (p->tokenbuf[i] != pattern[i])
528
0
        return "Invalid literal";
529
0
    TRY(value(p, v));
530
188k
  } else {
531
    // FIXME: better parser
532
188k
    p->tokenbuf[p->tokenpos] = 0;
533
188k
#ifdef USE_DECNUM
534
188k
    jv number = jv_number_with_literal(p->tokenbuf);
535
188k
    if (jv_get_kind(number) == JV_KIND_INVALID) {
536
0
      return "Invalid numeric literal";
537
0
    }
538
188k
    TRY(value(p, number));
539
#else
540
    char *end = 0;
541
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
542
    if (end == 0 || *end != 0) {
543
      return "Invalid numeric literal";
544
    }
545
    TRY(value(p, jv_number(d)));
546
#endif
547
188k
  }
548
188k
  p->tokenpos = 0;
549
188k
  return 0;
550
188k
}
551
552
typedef enum {
553
  LITERAL,
554
  WHITESPACE,
555
  STRUCTURE,
556
  QUOTE,
557
  INVALID
558
} chclass;
559
560
23.7M
static chclass classify(char c) {
561
23.7M
  switch (c) {
562
0
  case ' ':
563
0
  case '\t':
564
0
  case '\r':
565
0
  case '\n':
566
0
    return WHITESPACE;
567
26.1k
  case '"':
568
26.1k
    return QUOTE;
569
0
  case '[':
570
0
  case ',':
571
0
  case ']':
572
0
  case '{':
573
0
  case ':':
574
0
  case '}':
575
0
    return STRUCTURE;
576
23.7M
  default:
577
23.7M
    return LITERAL;
578
23.7M
  }
579
23.7M
}
580
581
582
static const presult OK = "output produced";
583
584
23.7M
static int parse_check_done(struct jv_parser* p, jv* out) {
585
23.7M
  if (p->stackpos == 0 && jv_is_valid(p->next)) {
586
163
    *out = p->next;
587
163
    p->next = jv_invalid();
588
163
    return 1;
589
23.7M
  } else {
590
23.7M
    return 0;
591
23.7M
  }
592
23.7M
}
593
594
0
static int stream_check_done(struct jv_parser* p, jv* out) {
595
0
  if (p->stacklen == 0 && jv_is_valid(p->next)) {
596
0
    *out = JV_ARRAY(jv_copy(p->path),p->next);
597
0
    p->next = jv_invalid();
598
0
    return 1;
599
0
  } else if (jv_is_valid(p->output)) {
600
0
    if (jv_array_length(jv_copy(p->output)) > 2) {
601
      // At end of an array or object, necessitating one more output by
602
      // which to indicate this
603
0
      *out = jv_array_slice(jv_copy(p->output), 0, 2);
604
0
      p->output = jv_array_slice(p->output, 0, 1);      // arrange one more output
605
0
    } else {
606
      // No further processing needed
607
0
      *out = p->output;
608
0
      p->output = jv_invalid();
609
0
    }
610
0
    return 1;
611
0
  } else {
612
0
    return 0;
613
0
  }
614
0
}
615
616
0
static int seq_check_truncation(struct jv_parser* p) {
617
0
  return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER));
618
0
}
619
620
0
static int stream_seq_check_truncation(struct jv_parser* p) {
621
0
  jv_kind k = jv_get_kind(p->next);
622
0
  return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL);
623
0
}
624
625
0
static int parse_is_top_num(struct jv_parser* p) {
626
0
  return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
627
0
}
628
629
0
static int stream_is_top_num(struct jv_parser* p) {
630
0
  return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
631
0
}
632
633
#define check_done(p, o) \
634
23.7M
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))
635
636
#define token(p, ch) \
637
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))
638
639
#define check_truncation(p) \
640
0
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))
641
642
#define is_top_num(p) \
643
0
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
644
645
24.1M
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
646
24.1M
  p->column++;
647
24.1M
  if (ch == '\n') {
648
26.0k
    p->line++;
649
26.0k
    p->column = 0;
650
26.0k
  }
651
24.1M
  if ((p->flags & JV_PARSE_SEQ)
652
0
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
653
0
    if (check_truncation(p)) {
654
0
      if (check_literal(p) == 0 && is_top_num(p))
655
0
        return "Potentially truncated top-level numeric value";
656
0
      return "Truncated value";
657
0
    }
658
0
    TRY(check_literal(p));
659
0
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
660
0
      return OK;
661
    // shouldn't happen?
662
0
    assert(!jv_is_valid(*out));
663
0
    parser_reset(p);
664
0
    jv_free(*out);
665
0
    *out = jv_invalid();
666
0
    return OK;
667
0
  }
668
24.1M
  presult answer = 0;
669
24.1M
  p->last_ch_was_ws = 0;
670
24.1M
  if (p->st == JV_PARSER_NORMAL) {
671
23.7M
    chclass cls = classify(ch);
672
23.7M
    if (cls == WHITESPACE)
673
0
      p->last_ch_was_ws = 1;
674
23.7M
    if (cls != LITERAL) {
675
26.1k
      TRY(check_literal(p));
676
26.1k
      if (check_done(p, out)) answer = OK;
677
26.1k
    }
678
23.7M
    switch (cls) {
679
23.7M
    case LITERAL:
680
23.7M
      tokenadd(p, ch);
681
23.7M
      break;
682
0
    case WHITESPACE:
683
0
      break;
684
26.1k
    case QUOTE:
685
26.1k
      p->st = JV_PARSER_STRING;
686
26.1k
      break;
687
0
    case STRUCTURE:
688
0
      TRY(token(p, ch));
689
0
      break;
690
0
    case INVALID:
691
0
      return "Invalid character";
692
23.7M
    }
693
23.7M
    if (check_done(p, out)) answer = OK;
694
23.7M
  } else {
695
423k
    if (ch == '"' && p->st == JV_PARSER_STRING) {
696
26.0k
      TRY(found_string(p));
697
163
      p->st = JV_PARSER_NORMAL;
698
163
      if (check_done(p, out)) answer = OK;
699
397k
    } else {
700
397k
      tokenadd(p, ch);
701
397k
      if (ch == '\\' && p->st == JV_PARSER_STRING) {
702
150k
        p->st = JV_PARSER_STRING_ESCAPE;
703
247k
      } else {
704
247k
        p->st = JV_PARSER_STRING;
705
247k
      }
706
397k
    }
707
423k
  }
708
24.1M
  return answer;
709
24.1M
}
710
711
0
struct jv_parser* jv_parser_new(int flags) {
712
0
  struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
713
0
  parser_init(p, flags);
714
0
  p->flags = flags;
715
0
  return p;
716
0
}
717
718
0
void jv_parser_free(struct jv_parser* p) {
719
0
  parser_free(p);
720
0
  jv_mem_free(p);
721
0
}
722
723
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
724
725
0
int jv_parser_remaining(struct jv_parser* p) {
726
0
  if (p->curr_buf == 0)
727
0
    return 0;
728
0
  return (p->curr_buf_length - p->curr_buf_pos);
729
0
}
730
731
214k
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
732
214k
  assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
733
214k
         && "previous buffer not exhausted");
734
429k
  while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) {
735
214k
    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
736
      // matched a BOM character
737
0
      buf++;
738
0
      length--;
739
0
      p->bom_strip_position++;
740
214k
    } else {
741
214k
      if (p->bom_strip_position == 0) {
742
        // no BOM in this document
743
214k
        p->bom_strip_position = sizeof(UTF8_BOM);
744
214k
      } else {
745
        // malformed BOM (prefix present, rest missing)
746
0
        p->bom_strip_position = 0xff;
747
0
      }
748
214k
    }
749
214k
  }
750
214k
  p->curr_buf = buf;
751
214k
  p->curr_buf_length = length;
752
214k
  p->curr_buf_pos = 0;
753
214k
  p->curr_buf_is_partial = is_partial;
754
214k
}
755
756
static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3);
757
758
25.9k
static jv make_error(struct jv_parser* p, const char *fmt, ...) {
759
25.9k
  va_list ap;
760
25.9k
  va_start(ap, fmt);
761
25.9k
  jv e = jv_string_vfmt(fmt, ap);
762
25.9k
  va_end(ap);
763
25.9k
  if ((p->flags & JV_PARSE_STREAM_ERRORS))
764
0
    return JV_ARRAY(e, jv_copy(p->path));
765
25.9k
  return jv_invalid_with_msg(e);
766
25.9k
}
767
768
403k
jv jv_parser_next(struct jv_parser* p) {
769
403k
  if (p->eof)
770
188k
    return jv_invalid();
771
214k
  if (!p->curr_buf)
772
0
    return jv_invalid(); // Need a buffer
773
214k
  if (p->bom_strip_position == 0xff) {
774
0
    if (!(p->flags & JV_PARSE_SEQ))
775
0
      return jv_invalid_with_msg(jv_string("Malformed BOM"));
776
0
    p->st =JV_PARSER_WAITING_FOR_RS;
777
0
    parser_reset(p);
778
0
  }
779
214k
  jv value = jv_invalid();
780
214k
  if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value))
781
0
    return value;
782
214k
  char ch;
783
214k
  presult msg = 0;
784
24.3M
  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
785
24.1M
    ch = p->curr_buf[p->curr_buf_pos++];
786
24.1M
    if (p->st == JV_PARSER_WAITING_FOR_RS) {
787
0
      if (ch == '\n') {
788
0
        p->line++;
789
0
        p->column = 0;
790
0
      } else {
791
0
        p->column++;
792
0
      }
793
0
      if (ch == '\036')
794
0
        p->st = JV_PARSER_NORMAL;
795
0
      continue; // need to resync, wait for RS
796
0
    }
797
24.1M
    msg = scan(p, ch, &value);
798
24.1M
  }
799
214k
  if (msg == OK) {
800
163
    return value;
801
214k
  } else if (msg) {
802
25.9k
    jv_free(value);
803
25.9k
    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
804
      // Skip to the next RS
805
0
      p->st = JV_PARSER_WAITING_FOR_RS;
806
0
      value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column);
807
0
      parser_reset(p);
808
0
      return value;
809
0
    }
810
25.9k
    value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column);
811
25.9k
    parser_reset(p);
812
25.9k
    if (!(p->flags & JV_PARSE_SEQ)) {
813
      // We're not parsing a JSON text sequence; throw this buffer away.
814
      // XXX We should fail permanently here.
815
25.9k
      p->curr_buf = 0;
816
25.9k
      p->curr_buf_pos = 0;
817
25.9k
    } // Else ch must be RS; don't clear buf so we can start parsing again after this ch
818
25.9k
    return value;
819
188k
  } else if (p->curr_buf_is_partial) {
820
0
    assert(p->curr_buf_pos == p->curr_buf_length);
821
    // need another buffer
822
0
    return jv_invalid();
823
188k
  } else {
824
    // at EOF
825
188k
    p->eof = 1;
826
188k
    assert(p->curr_buf_pos == p->curr_buf_length);
827
188k
    jv_free(value);
828
188k
    if (p->st == JV_PARSER_WAITING_FOR_RS)
829
0
      return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column);
830
188k
    if (p->st != JV_PARSER_NORMAL) {
831
60
      value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column);
832
60
      parser_reset(p);
833
60
      p->st = JV_PARSER_WAITING_FOR_RS;
834
60
      return value;
835
60
    }
836
188k
    if ((msg = check_literal(p))) {
837
0
      value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column);
838
0
      parser_reset(p);
839
0
      p->st = JV_PARSER_WAITING_FOR_RS;
840
0
      return value;
841
0
    }
842
188k
    if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) ||
843
188k
        (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) {
844
0
      value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column);
845
0
      parser_reset(p);
846
0
      p->st = JV_PARSER_WAITING_FOR_RS;
847
0
      return value;
848
0
    }
849
    // p->next is either invalid (nothing here, but no syntax error)
850
    // or valid (this is the value). either way it's the thing to return
851
188k
    if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) {
852
0
      value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value]
853
188k
    } else {
854
188k
      value = p->next;
855
188k
    }
856
188k
    p->next = jv_invalid();
857
188k
    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
858
0
      jv_free(value);
859
0
      return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column);
860
0
    }
861
188k
    return value;
862
188k
  }
863
214k
}
864
865
214k
jv jv_parse_sized_custom_flags(const char* string, int length, int flags) {
866
214k
  struct jv_parser parser;
867
214k
  parser_init(&parser, flags);
868
214k
  jv_parser_set_buf(&parser, string, length, 0);
869
214k
  jv value = jv_parser_next(&parser);
870
214k
  if (jv_is_valid(value)) {
871
188k
    jv next = jv_parser_next(&parser);
872
188k
    if (jv_is_valid(next)) {
873
      // multiple JSON values, we only wanted one
874
0
      jv_free(value);
875
0
      jv_free(next);
876
0
      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
877
188k
    } else if (jv_invalid_has_msg(jv_copy(next))) {
878
      // parser error after the first JSON value
879
0
      jv_free(value);
880
0
      value = next;
881
188k
    } else {
882
      // a single valid JSON value
883
188k
      jv_free(next);
884
188k
    }
885
188k
  } else if (jv_invalid_has_msg(jv_copy(value))) {
886
    // parse error, we'll return it
887
25.9k
  } else {
888
    // no value at all
889
0
    jv_free(value);
890
0
    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
891
0
  }
892
214k
  parser_free(&parser);
893
894
214k
  if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
895
25.9k
    jv msg = jv_invalid_get_msg(value);
896
25.9k
    value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
897
25.9k
                                              jv_string_value(msg),
898
25.9k
                                              string));
899
25.9k
    jv_free(msg);
900
25.9k
  }
901
214k
  return value;
902
214k
}
903
904
214k
jv jv_parse_sized(const char* string, int length) {
905
214k
  return jv_parse_sized_custom_flags(string, length, 0);
906
214k
}
907
908
0
jv jv_parse(const char* string) {
909
0
  return jv_parse_sized(string, strlen(string));
910
0
}
911
912
0
jv jv_parse_custom_flags(const char* string, int flags) {
913
0
  return jv_parse_sized_custom_flags(string, strlen(string), flags);
914
0
}