Coverage Report

Created: 2025-12-13 06:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jq/src/jv_parse.c
Line
Count
Source
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <string.h>
4
#include <assert.h>
5
#include "jv.h"
6
#include "jv_dtoa.h"
7
#include "jv_unicode.h"
8
#include "jv_alloc.h"
9
#include "jv_dtoa.h"
10
11
typedef const char* presult;
12
13
#ifndef MAX_PARSING_DEPTH
14
0
#define MAX_PARSING_DEPTH (10000)
15
#endif
16
17
964k
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
18
#ifdef __GNUC__
19
#define pfunc __attribute__((warn_unused_result)) presult
20
#else
21
#define pfunc presult
22
#endif
23
24
/*
 * Most recent structural event recorded by the streaming parser.
 * Punctuation members reuse the character code of the token itself;
 * 'V' stands for "a complete value".
 */
enum last_seen {
  JV_LAST_NONE        = 0,
  JV_LAST_OPEN_ARRAY  = '[',
  JV_LAST_OPEN_OBJECT = '{',
  JV_LAST_COLON       = ':',
  JV_LAST_COMMA       = ',',
  JV_LAST_VALUE       = 'V',
};
32
33
struct jv_parser {
34
  const char* curr_buf;
35
  int curr_buf_length;
36
  int curr_buf_pos;
37
  int curr_buf_is_partial;
38
  int eof;
39
  unsigned bom_strip_position;
40
41
  int flags;
42
43
  jv* stack;                   // parser
44
  int stackpos;                // parser
45
  int stacklen;                // both (optimization; it's really pathlen for streaming)
46
  jv path;                     // streamer
47
  enum last_seen last_seen;    // streamer
48
  jv output;                   // streamer
49
  jv next;                     // both
50
51
  char* tokenbuf;
52
  int tokenpos;
53
  int tokenlen;
54
55
  int line, column;
56
57
  struct dtoa_context dtoa;
58
59
  enum {
60
    JV_PARSER_NORMAL,
61
    JV_PARSER_STRING,
62
    JV_PARSER_STRING_ESCAPE,
63
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
64
  } st;
65
  unsigned int last_ch_was_ws:1;
66
};
67
68
69
668k
static void parser_init(struct jv_parser* p, int flags) {
70
668k
  p->flags = flags;
71
668k
  if ((p->flags & JV_PARSE_STREAMING)) {
72
0
    p->path = jv_array();
73
668k
  } else {
74
668k
    p->path = jv_invalid();
75
668k
    p->flags &= ~(JV_PARSE_STREAM_ERRORS);
76
668k
  }
77
668k
  p->stack = 0;
78
668k
  p->stacklen = p->stackpos = 0;
79
668k
  p->last_seen = JV_LAST_NONE;
80
668k
  p->output = jv_invalid();
81
668k
  p->next = jv_invalid();
82
668k
  p->tokenbuf = 0;
83
668k
  p->tokenlen = p->tokenpos = 0;
84
668k
  if ((p->flags & JV_PARSE_SEQ))
85
0
    p->st = JV_PARSER_WAITING_FOR_RS;
86
668k
  else
87
668k
    p->st = JV_PARSER_NORMAL;
88
668k
  p->eof = 0;
89
668k
  p->curr_buf = 0;
90
668k
  p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
91
668k
  p->bom_strip_position = 0;
92
668k
  p->last_ch_was_ws = 0;
93
668k
  p->line = 1;
94
668k
  p->column = 0;
95
668k
  jvp_dtoa_context_init(&p->dtoa);
96
668k
}
97
98
976k
static void parser_reset(struct jv_parser* p) {
99
976k
  if ((p->flags & JV_PARSE_STREAMING)) {
100
0
    jv_free(p->path);
101
0
    p->path = jv_array();
102
0
    p->stacklen = 0;
103
0
  }
104
976k
  p->last_seen = JV_LAST_NONE;
105
976k
  jv_free(p->output);
106
976k
  p->output = jv_invalid();
107
976k
  jv_free(p->next);
108
976k
  p->next = jv_invalid();
109
976k
  for (int i=0; i<p->stackpos; i++)
110
0
    jv_free(p->stack[i]);
111
976k
  p->stackpos = 0;
112
976k
  p->tokenpos = 0;
113
976k
  p->st = JV_PARSER_NORMAL;
114
976k
}
115
116
668k
static void parser_free(struct jv_parser* p) {
117
668k
  parser_reset(p);
118
668k
  jv_free(p->path);
119
668k
  jv_free(p->output);
120
668k
  jv_mem_free(p->stack);
121
668k
  jv_mem_free(p->tokenbuf);
122
668k
  jvp_dtoa_context_free(&p->dtoa);
123
668k
}
124
125
360k
/*
 * Record `val` as the most recently completed value (p->next).
 *
 * Takes ownership of `val`.  Returns 0 on success.  If a previous value is
 * still pending (or, when streaming, the last thing seen was already a
 * value) `val` is freed and an error message is returned.
 */
static pfunc value(struct jv_parser* p, jv val) {
  const int streaming = (p->flags & JV_PARSE_STREAMING) != 0;

  if (jv_is_valid(p->next) ||
      (streaming && p->last_seen == JV_LAST_VALUE)) {
    jv_free(val);
    return "Expected separator between values";
  }

  if (streaming) {
    /* a value at the top level does not need a separator after it */
    p->last_seen = (p->stacklen > 0) ? JV_LAST_VALUE : JV_LAST_NONE;
  }

  jv_free(p->next);
  p->next = val;
  return 0;
}
145
146
0
/*
 * Push `v` onto the parser's value stack, growing the stack when it is full.
 * Takes ownership of `v`.
 */
static void push(struct jv_parser* p, jv v) {
  assert(p->stackpos <= p->stacklen);

  if (p->stackpos == p->stacklen) {
    int grown = p->stacklen * 2 + 10;
    p->stacklen = grown;
    p->stack = jv_mem_realloc(p->stack, grown * sizeof(jv));
  }

  assert(p->stackpos < p->stacklen);
  p->stack[p->stackpos] = v;
  p->stackpos++;
}
155
156
0
static pfunc parse_token(struct jv_parser* p, char ch) {
157
0
  switch (ch) {
158
0
  case '[':
159
0
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
160
0
    if (jv_is_valid(p->next)) return "Expected separator between values";
161
0
    push(p, jv_array());
162
0
    break;
163
164
0
  case '{':
165
0
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
166
0
    if (jv_is_valid(p->next)) return "Expected separator between values";
167
0
    push(p, jv_object());
168
0
    break;
169
170
0
  case ':':
171
0
    if (!jv_is_valid(p->next))
172
0
      return "Expected string key before ':'";
173
0
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
174
0
      return "':' not as part of an object";
175
0
    if (jv_get_kind(p->next) != JV_KIND_STRING)
176
0
      return "Object keys must be strings";
177
0
    push(p, p->next);
178
0
    p->next = jv_invalid();
179
0
    break;
180
181
0
  case ',':
182
0
    if (!jv_is_valid(p->next))
183
0
      return "Expected value before ','";
184
0
    if (p->stackpos == 0)
185
0
      return "',' not as part of an object or array";
186
0
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
187
0
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
188
0
      p->next = jv_invalid();
189
0
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
190
0
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
191
0
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
192
0
                                              p->stack[p->stackpos-1], p->next);
193
0
      p->stackpos--;
194
0
      p->next = jv_invalid();
195
0
    } else {
196
      // this case hits on input like {"a", "b"}
197
0
      return "Objects must consist of key:value pairs";
198
0
    }
199
0
    break;
200
201
0
  case ']':
202
0
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
203
0
      return "Unmatched ']'";
204
0
    if (jv_is_valid(p->next)) {
205
0
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
206
0
      p->next = jv_invalid();
207
0
    } else {
208
0
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
209
        // this case hits on input like [1,2,3,]
210
0
        return "Expected another array element";
211
0
      }
212
0
    }
213
0
    jv_free(p->next);
214
0
    p->next = p->stack[--p->stackpos];
215
0
    break;
216
217
0
  case '}':
218
0
    if (p->stackpos == 0)
219
0
      return "Unmatched '}'";
220
0
    if (jv_is_valid(p->next)) {
221
0
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
222
0
        return "Objects must consist of key:value pairs";
223
0
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
224
0
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
225
0
                                              p->stack[p->stackpos-1], p->next);
226
0
      p->stackpos--;
227
0
      p->next = jv_invalid();
228
0
    } else {
229
0
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
230
0
        return "Unmatched '}'";
231
0
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
232
0
        return "Expected another key-value pair";
233
0
    }
234
0
    jv_free(p->next);
235
0
    p->next = p->stack[--p->stackpos];
236
0
    break;
237
0
  }
238
0
  return 0;
239
0
}
240
241
0
static pfunc stream_token(struct jv_parser* p, char ch) {
242
0
  jv_kind k;
243
0
  jv last;
244
245
0
  switch (ch) {
246
0
  case '[':
247
0
    if (jv_is_valid(p->next))
248
0
      return "Expected a separator between values";
249
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
250
      // Looks like {["foo"]}
251
0
      return "Expected string key after '{', not '['";
252
0
    if (p->last_seen == JV_LAST_COMMA) {
253
0
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
254
0
      k = jv_get_kind(last);
255
0
      jv_free(last);
256
0
      if (k != JV_KIND_NUMBER)
257
        // Looks like {"x":"y",["foo"]}
258
0
        return "Expected string key after ',' in object, not '['";
259
0
    }
260
0
    p->path = jv_array_append(p->path, jv_number(0)); // push
261
0
    p->last_seen = JV_LAST_OPEN_ARRAY;
262
0
    p->stacklen++;
263
0
    break;
264
265
0
  case '{':
266
0
    if (p->last_seen == JV_LAST_VALUE)
267
0
      return "Expected a separator between values";
268
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
269
      // Looks like {{"foo":"bar"}}
270
0
      return "Expected string key after '{', not '{'";
271
0
    if (p->last_seen == JV_LAST_COMMA) {
272
0
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
273
0
      k = jv_get_kind(last);
274
0
      jv_free(last);
275
0
      if (k != JV_KIND_NUMBER)
276
        // Looks like {"x":"y",{"foo":"bar"}}
277
0
        return "Expected string key after ',' in object, not '{'";
278
0
    }
279
    // Push object key: null, since we don't know it yet
280
0
    p->path = jv_array_append(p->path, jv_null()); // push
281
0
    p->last_seen = JV_LAST_OPEN_OBJECT;
282
0
    p->stacklen++;
283
0
    break;
284
285
0
  case ':':
286
0
    last = jv_invalid();
287
0
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
288
0
      jv_free(last);
289
0
      return "':' not as part of an object";
290
0
    }
291
0
    jv_free(last);
292
0
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
293
0
      return "Expected string key before ':'";
294
0
    if (jv_get_kind(p->next) != JV_KIND_STRING)
295
0
      return "Object keys must be strings";
296
0
    if (p->last_seen != JV_LAST_VALUE)
297
0
      return "':' should follow a key";
298
0
    p->last_seen = JV_LAST_COLON;
299
0
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
300
0
    p->next = jv_invalid();
301
0
    break;
302
303
0
  case ',':
304
0
    if (p->last_seen != JV_LAST_VALUE)
305
0
      return "Expected value before ','";
306
0
    if (p->stacklen == 0)
307
0
      return "',' not as part of an object or array";
308
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
309
0
    k = jv_get_kind(last);
310
0
    if (k == JV_KIND_NUMBER) {
311
0
      int idx = jv_number_value(last);
312
313
0
      if (jv_is_valid(p->next)) {
314
0
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
315
0
        p->next = jv_invalid();
316
0
      }
317
0
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
318
0
      p->last_seen = JV_LAST_COMMA;
319
0
    } else if (k == JV_KIND_STRING) {
320
0
      if (jv_is_valid(p->next)) {
321
0
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
322
0
        p->next = jv_invalid();
323
0
      }
324
0
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
325
0
      p->last_seen = JV_LAST_COMMA;
326
0
    } else {
327
0
      assert(k == JV_KIND_NULL);
328
      // this case hits on input like {,}
329
      // make sure to handle input like {"a", "b"} and {"a":, ...}
330
0
      jv_free(last);
331
0
      return "Objects must consist of key:value pairs";
332
0
    }
333
0
    jv_free(last);
334
0
    break;
335
336
0
  case ']':
337
0
    if (p->stacklen == 0)
338
0
      return "Unmatched ']' at the top-level";
339
0
    if (p->last_seen == JV_LAST_COMMA)
340
0
      return "Expected another array element";
341
0
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
342
0
      assert(!jv_is_valid(p->next));
343
344
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
345
0
    k = jv_get_kind(last);
346
0
    jv_free(last);
347
348
0
    if (k != JV_KIND_NUMBER)
349
0
      return "Unmatched ']' in the middle of an object";
350
0
    if (jv_is_valid(p->next)) {
351
0
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
352
0
      p->next = jv_invalid();
353
0
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
354
0
      p->output = JV_ARRAY(jv_copy(p->path));
355
0
    }
356
357
0
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
358
    //assert(!jv_is_valid(p->next));
359
0
    jv_free(p->next);
360
0
    p->next = jv_invalid();
361
362
0
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
363
0
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves
364
365
0
    if (p->stacklen == 0)
366
0
      p->last_seen = JV_LAST_NONE;
367
0
    else
368
0
      p->last_seen = JV_LAST_VALUE;
369
0
    break;
370
371
0
  case '}':
372
0
    if (p->stacklen == 0)
373
0
      return "Unmatched '}' at the top-level";
374
0
    if (p->last_seen == JV_LAST_COMMA)
375
0
      return "Expected another key:value pair";
376
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
377
0
      assert(!jv_is_valid(p->next));
378
379
0
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
380
0
    k = jv_get_kind(last);
381
0
    jv_free(last);
382
0
    if (k == JV_KIND_NUMBER)
383
0
      return "Unmatched '}' in the middle of an array";
384
385
0
    if (jv_is_valid(p->next)) {
386
0
      if (k != JV_KIND_STRING)
387
0
        return "Objects must consist of key:value pairs";
388
0
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
389
0
      p->next = jv_invalid();
390
0
    } else {
391
      // Perhaps {"a":[]}
392
0
      if (p->last_seen == JV_LAST_COLON)
393
        // Looks like {"a":}
394
0
        return "Missing value in key:value pair";
395
0
      if (p->last_seen == JV_LAST_COMMA)
396
        // Looks like {"a":0,}
397
0
        return "Expected another key-value pair";
398
0
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
399
0
        return "Unmatched '}' in the middle of an array";
400
0
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
401
0
        return "Unmatched '}'";
402
0
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
403
0
        p->output = JV_ARRAY(jv_copy(p->path));
404
0
    }
405
0
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
406
0
    jv_free(p->next);
407
0
    p->next = jv_invalid();
408
409
0
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
410
0
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty arrays are leaves
411
412
0
    if (p->stacklen == 0)
413
0
      p->last_seen = JV_LAST_NONE;
414
0
    else
415
0
      p->last_seen = JV_LAST_VALUE;
416
0
    break;
417
0
  }
418
0
  return 0;
419
0
}
420
421
23.1M
static void tokenadd(struct jv_parser* p, char c) {
422
23.1M
  assert(p->tokenpos <= p->tokenlen);
423
23.1M
  if (p->tokenpos >= (p->tokenlen - 1)) {
424
669k
    p->tokenlen = p->tokenlen*2 + 256;
425
669k
    p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen);
426
669k
  }
427
23.1M
  assert(p->tokenpos < p->tokenlen);
428
23.1M
  p->tokenbuf[p->tokenpos++] = c;
429
23.1M
}
430
431
2.96k
/*
 * Decode exactly four hexadecimal digits (either case) starting at `hex`.
 * Returns the value in 0..0xFFFF, or -1 if any of the four characters is
 * not a hex digit.  No terminator is required or examined.
 */
static int unhex4(char* hex) {
  int acc = 0;
  for (const char* q = hex; q != hex + 4; q++) {
    const char c = *q;
    int digit;
    if (c >= '0' && c <= '9')
      digit = c - '0';
    else if (c >= 'a' && c <= 'f')
      digit = c - 'a' + 10;
    else if (c >= 'A' && c <= 'F')
      digit = c - 'A' + 10;
    else
      return -1;
    acc = (acc << 4) | digit;
  }
  return acc;
}
445
446
291k
static pfunc found_string(struct jv_parser* p) {
447
291k
  char* in = p->tokenbuf;
448
291k
  char* out = p->tokenbuf;
449
291k
  char* end = p->tokenbuf + p->tokenpos;
450
451
296k
  while (in < end) {
452
292k
    char c = *in++;
453
292k
    if (c == '\\') {
454
292k
      if (in >= end)
455
0
        return "Expected escape character at end of string";
456
292k
      c = *in++;
457
292k
      switch (c) {
458
2.26k
      case '\\':
459
3.46k
      case '"':
460
3.50k
      case '/': *out++ = c;    break;
461
17
      case 'b': *out++ = '\b'; break;
462
8
      case 'f': *out++ = '\f'; break;
463
1
      case 't': *out++ = '\t'; break;
464
6
      case 'n': *out++ = '\n'; break;
465
403
      case 'r': *out++ = '\r'; break;
466
467
2.69k
      case 'u':
468
        /* ahh, the complicated case */
469
2.69k
        if (in + 4 > end)
470
345
          return "Invalid \\uXXXX escape";
471
2.34k
        int hexvalue = unhex4(in);
472
2.34k
        if (hexvalue < 0)
473
1.15k
          return "Invalid characters in \\uXXXX escape";
474
1.19k
        unsigned long codepoint = (unsigned long)hexvalue;
475
1.19k
        in += 4;
476
1.19k
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
477
          /* who thought UTF-16 surrogate pairs were a good idea? */
478
761
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
479
144
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
480
617
          unsigned long surrogate = unhex4(in+2);
481
617
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
482
489
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
483
128
          in += 6;
484
128
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
485
128
                                 |(surrogate - 0xDC00));
486
128
        }
487
563
        if (codepoint > 0x10FFFF)
488
0
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
489
563
        out += jvp_utf8_encode(codepoint, out);
490
563
        break;
491
492
285k
      default:
493
285k
        return "Invalid escape";
494
292k
      }
495
292k
    } else {
496
0
      if (!(c & ~0x1F))
497
0
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
498
0
      *out++ = c;
499
0
    }
500
292k
  }
501
3.76k
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
502
3.76k
  p->tokenpos = 0;
503
3.76k
  return 0;
504
3.76k
}
505
506
672k
static pfunc check_literal(struct jv_parser* p) {
507
672k
  if (p->tokenpos == 0) return 0;
508
509
356k
  const char* pattern = 0;
510
356k
  int plen;
511
356k
  jv v;
512
356k
  switch (p->tokenbuf[0]) {
513
0
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
514
0
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
515
0
  case '\'':
516
0
    return "Invalid string literal; expected \", but got '";
517
0
  case 'n':
518
    // if it starts with 'n', it could be a literal "nan"
519
0
    if (p->tokenpos > 1 && p->tokenbuf[1] == 'u') {
520
0
      pattern = "null"; plen = 4; v = jv_null();
521
0
    }
522
356k
  }
523
356k
  if (pattern) {
524
0
    if (p->tokenpos != plen) return "Invalid literal";
525
0
    for (int i=0; i<plen; i++)
526
0
      if (p->tokenbuf[i] != pattern[i])
527
0
        return "Invalid literal";
528
0
    TRY(value(p, v));
529
356k
  } else {
530
    // FIXME: better parser
531
356k
    p->tokenbuf[p->tokenpos] = 0;
532
356k
#ifdef USE_DECNUM
533
356k
    jv number = jv_number_with_literal(p->tokenbuf);
534
356k
    if (jv_get_kind(number) == JV_KIND_INVALID) {
535
0
      return "Invalid numeric literal";
536
0
    }
537
356k
    TRY(value(p, number));
538
#else
539
    char *end = 0;
540
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
541
    if (end == 0 || *end != 0) {
542
      return "Invalid numeric literal";
543
    }
544
    TRY(value(p, jv_number(d)));
545
#endif
546
356k
  }
547
356k
  p->tokenpos = 0;
548
356k
  return 0;
549
356k
}
550
551
/* Character classes distinguished by the scanner outside of strings. */
typedef enum {
  LITERAL,
  WHITESPACE,
  STRUCTURE,
  QUOTE,
  INVALID
} chclass;

/*
 * Classify one input byte.  Every byte that is not JSON whitespace, a
 * double quote or structural punctuation counts as part of a bare literal;
 * INVALID is never returned.
 */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;
  if (c == '"')
    return QUOTE;
  if (c == '[' || c == ',' || c == ']' ||
      c == '{' || c == ':' || c == '}')
    return STRUCTURE;
  return LITERAL;
}
579
580
581
static const presult OK = "output produced";
582
583
22.8M
/*
 * Tree mode: if a complete top-level value is pending, move it into *out
 * and return 1; otherwise return 0 and leave *out untouched.
 */
static int parse_check_done(struct jv_parser* p, jv* out) {
  if (p->stackpos != 0 || !jv_is_valid(p->next))
    return 0;

  *out = p->next;
  p->next = jv_invalid();
  return 1;
}
592
593
0
/*
 * Streaming mode: hand out the next pending event, if any.
 *
 * Returns 1 with *out set, or 0 if there is nothing to emit.  A three
 * element p->output ([path, leaf, true]) is emitted in two steps: first
 * [path, leaf], and on the following call the remaining [path].
 */
static int stream_check_done(struct jv_parser* p, jv* out) {
  /* a scalar at the top level */
  if (p->stacklen == 0 && jv_is_valid(p->next)) {
    *out = JV_ARRAY(jv_copy(p->path),p->next);
    p->next = jv_invalid();
    return 1;
  }

  if (!jv_is_valid(p->output))
    return 0;

  if (jv_array_length(jv_copy(p->output)) > 2) {
    // At end of an array or object, necessitating one more output by
    // which to indicate this
    *out = jv_array_slice(jv_copy(p->output), 0, 2);
    p->output = jv_array_slice(p->output, 0, 1);      // arrange one more output
  } else {
    // No further processing needed
    *out = p->output;
    p->output = jv_invalid();
  }
  return 1;
}
614
615
0
static int seq_check_truncation(struct jv_parser* p) {
616
0
  return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER));
617
0
}
618
619
0
static int stream_seq_check_truncation(struct jv_parser* p) {
620
0
  jv_kind k = jv_get_kind(p->next);
621
0
  return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL);
622
0
}
623
624
0
static int parse_is_top_num(struct jv_parser* p) {
625
0
  return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
626
0
}
627
628
0
static int stream_is_top_num(struct jv_parser* p) {
629
0
  return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
630
0
}
631
632
#define check_done(p, o) \
633
22.8M
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))
634
635
#define token(p, ch) \
636
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))
637
638
#define check_truncation(p) \
639
0
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))
640
641
#define is_top_num(p) \
642
0
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
643
644
23.7M
/*
 * Feed one input character to the parser.
 *
 * Returns 0 when more input is needed, OK when a result was stored in
 * *out, or a static error message.
 */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  /* position bookkeeping for error messages */
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  } else {
    p->column++;
  }

  /* ASCII RS starts a new text; see draft-ietf-json-sequence-07 */
  if ((p->flags & JV_PARSE_SEQ) && ch == '\036') {
    if (check_truncation(p)) {
      if (check_literal(p) == 0 && is_top_num(p))
        return "Potentially truncated top-level numeric value";
      return "Truncated value";
    }
    TRY(check_literal(p));
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
      return OK;
    // shouldn't happen?
    assert(!jv_is_valid(*out));
    parser_reset(p);
    jv_free(*out);
    *out = jv_invalid();
    return OK;
  }

  p->last_ch_was_ws = 0;

  if (p->st != JV_PARSER_NORMAL) {
    /* inside a string: only an unescaped quote ends it */
    if (ch == '"' && p->st == JV_PARSER_STRING) {
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      return check_done(p, out) ? OK : 0;
    }
    tokenadd(p, ch);
    if (ch == '\\' && p->st == JV_PARSER_STRING)
      p->st = JV_PARSER_STRING_ESCAPE;   /* next byte is taken verbatim */
    else
      p->st = JV_PARSER_STRING;
    return 0;
  }

  presult answer = 0;
  const chclass cls = classify(ch);

  if (cls == WHITESPACE)
    p->last_ch_was_ws = 1;

  if (cls != LITERAL) {
    /* any non-literal character terminates a bare token in progress */
    TRY(check_literal(p));
    if (check_done(p, out))
      answer = OK;
  }

  switch (cls) {
  case LITERAL:
    tokenadd(p, ch);
    break;
  case WHITESPACE:
    break;
  case QUOTE:
    p->st = JV_PARSER_STRING;
    break;
  case STRUCTURE:
    TRY(token(p, ch));
    break;
  case INVALID:
    return "Invalid character";
  }

  if (check_done(p, out))
    answer = OK;
  return answer;
}
709
710
0
struct jv_parser* jv_parser_new(int flags) {
711
0
  struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
712
0
  parser_init(p, flags);
713
0
  p->flags = flags;
714
0
  return p;
715
0
}
716
717
0
/*
 * Destroy a parser created with jv_parser_new(), including the structure
 * itself.  Passing a null pointer is a no-op, mirroring free().
 */
void jv_parser_free(struct jv_parser* p) {
  if (!p)
    return;
  parser_free(p);
  jv_mem_free(p);
}
721
722
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
723
724
0
int jv_parser_remaining(struct jv_parser* p) {
725
0
  if (p->curr_buf == 0)
726
0
    return 0;
727
0
  return (p->curr_buf_length - p->curr_buf_pos);
728
0
}
729
730
668k
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
731
668k
  assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
732
668k
         && "previous buffer not exhausted");
733
1.33M
  while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) {
734
668k
    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
735
      // matched a BOM character
736
0
      buf++;
737
0
      length--;
738
0
      p->bom_strip_position++;
739
668k
    } else {
740
668k
      if (p->bom_strip_position == 0) {
741
        // no BOM in this document
742
668k
        p->bom_strip_position = sizeof(UTF8_BOM);
743
668k
      } else {
744
        // malformed BOM (prefix present, rest missing)
745
0
        p->bom_strip_position = 0xff;
746
0
      }
747
668k
    }
748
668k
  }
749
668k
  p->curr_buf = buf;
750
668k
  p->curr_buf_length = length;
751
668k
  p->curr_buf_pos = 0;
752
668k
  p->curr_buf_is_partial = is_partial;
753
668k
}
754
755
static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3);
756
757
308k
static jv make_error(struct jv_parser* p, const char *fmt, ...) {
758
308k
  va_list ap;
759
308k
  va_start(ap, fmt);
760
308k
  jv e = jv_string_vfmt(fmt, ap);
761
308k
  va_end(ap);
762
308k
  if ((p->flags & JV_PARSE_STREAM_ERRORS))
763
0
    return JV_ARRAY(e, jv_copy(p->path));
764
308k
  return jv_invalid_with_msg(e);
765
308k
}
766
767
1.02M
jv jv_parser_next(struct jv_parser* p) {
768
1.02M
  if (p->eof)
769
356k
    return jv_invalid();
770
672k
  if (!p->curr_buf)
771
0
    return jv_invalid(); // Need a buffer
772
672k
  if (p->bom_strip_position == 0xff) {
773
0
    if (!(p->flags & JV_PARSE_SEQ))
774
0
      return jv_invalid_with_msg(jv_string("Malformed BOM"));
775
0
    p->st =JV_PARSER_WAITING_FOR_RS;
776
0
    parser_reset(p);
777
0
  }
778
672k
  jv value = jv_invalid();
779
672k
  if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value))
780
0
    return value;
781
672k
  char ch;
782
672k
  presult msg = 0;
783
24.4M
  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
784
23.7M
    ch = p->curr_buf[p->curr_buf_pos++];
785
23.7M
    if (p->st == JV_PARSER_WAITING_FOR_RS) {
786
0
      if (ch == '\n') {
787
0
        p->line++;
788
0
        p->column = 0;
789
0
      } else {
790
0
        p->column++;
791
0
      }
792
0
      if (ch == '\036')
793
0
        p->st = JV_PARSER_NORMAL;
794
0
      continue; // need to resync, wait for RS
795
0
    }
796
23.7M
    msg = scan(p, ch, &value);
797
23.7M
  }
798
672k
  if (msg == OK) {
799
3.76k
    return value;
800
668k
  } else if (msg) {
801
288k
    jv_free(value);
802
288k
    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
803
      // Skip to the next RS
804
0
      p->st = JV_PARSER_WAITING_FOR_RS;
805
0
      value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column);
806
0
      parser_reset(p);
807
0
      return value;
808
0
    }
809
288k
    value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column);
810
288k
    parser_reset(p);
811
288k
    if (!(p->flags & JV_PARSE_SEQ)) {
812
      // We're not parsing a JSON text sequence; throw this buffer away.
813
      // XXX We should fail permanently here.
814
288k
      p->curr_buf = 0;
815
288k
      p->curr_buf_pos = 0;
816
288k
    } // Else ch must be RS; don't clear buf so we can start parsing again after this ch
817
288k
    return value;
818
380k
  } else if (p->curr_buf_is_partial) {
819
0
    assert(p->curr_buf_pos == p->curr_buf_length);
820
    // need another buffer
821
0
    return jv_invalid();
822
380k
  } else {
823
    // at EOF
824
380k
    p->eof = 1;
825
380k
    assert(p->curr_buf_pos == p->curr_buf_length);
826
380k
    jv_free(value);
827
380k
    if (p->st == JV_PARSER_WAITING_FOR_RS)
828
0
      return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column);
829
380k
    if (p->st != JV_PARSER_NORMAL) {
830
20.1k
      value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column);
831
20.1k
      parser_reset(p);
832
20.1k
      p->st = JV_PARSER_WAITING_FOR_RS;
833
20.1k
      return value;
834
20.1k
    }
835
360k
    if ((msg = check_literal(p))) {
836
0
      value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column);
837
0
      parser_reset(p);
838
0
      p->st = JV_PARSER_WAITING_FOR_RS;
839
0
      return value;
840
0
    }
841
360k
    if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) ||
842
360k
        (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) {
843
0
      value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column);
844
0
      parser_reset(p);
845
0
      p->st = JV_PARSER_WAITING_FOR_RS;
846
0
      return value;
847
0
    }
848
    // p->next is either invalid (nothing here, but no syntax error)
849
    // or valid (this is the value). either way it's the thing to return
850
360k
    if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) {
851
0
      value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value]
852
360k
    } else {
853
360k
      value = p->next;
854
360k
    }
855
360k
    p->next = jv_invalid();
856
360k
    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
857
0
      jv_free(value);
858
0
      return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column);
859
0
    }
860
360k
    return value;
861
360k
  }
862
672k
}
863
864
668k
jv jv_parse_sized_custom_flags(const char* string, int length, int flags) {
865
668k
  struct jv_parser parser;
866
668k
  parser_init(&parser, flags);
867
668k
  jv_parser_set_buf(&parser, string, length, 0);
868
668k
  jv value = jv_parser_next(&parser);
869
668k
  if (jv_is_valid(value)) {
870
360k
    jv next = jv_parser_next(&parser);
871
360k
    if (jv_is_valid(next)) {
872
      // multiple JSON values, we only wanted one
873
0
      jv_free(value);
874
0
      jv_free(next);
875
0
      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
876
360k
    } else if (jv_invalid_has_msg(jv_copy(next))) {
877
      // parser error after the first JSON value
878
0
      jv_free(value);
879
0
      value = next;
880
360k
    } else {
881
      // a single valid JSON value
882
360k
      jv_free(next);
883
360k
    }
884
360k
  } else if (jv_invalid_has_msg(jv_copy(value))) {
885
    // parse error, we'll return it
886
308k
  } else {
887
    // no value at all
888
0
    jv_free(value);
889
0
    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
890
0
  }
891
668k
  parser_free(&parser);
892
893
668k
  if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
894
308k
    jv msg = jv_invalid_get_msg(value);
895
308k
    value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
896
308k
                                              jv_string_value(msg),
897
308k
                                              string));
898
308k
    jv_free(msg);
899
308k
  }
900
668k
  return value;
901
668k
}
902
903
668k
/*
 * Parse one JSON value from the first `length` bytes of `string` with no
 * parser flags set.  See jv_parse_sized_custom_flags() for the result.
 */
jv jv_parse_sized(const char* string, int length) {
  const int no_flags = 0;
  return jv_parse_sized_custom_flags(string, length, no_flags);
}
906
907
0
/*
 * Parse one JSON value from a NUL-terminated string with no parser flags
 * set.
 */
jv jv_parse(const char* string) {
  int length = strlen(string);
  return jv_parse_sized(string, length);
}
910
911
0
/*
 * Parse one JSON value from a NUL-terminated string using the given
 * JV_PARSE_* flags.
 */
jv jv_parse_custom_flags(const char* string, int flags) {
  int length = strlen(string);
  return jv_parse_sized_custom_flags(string, length, flags);
}