Coverage Report

Created: 2024-02-11 06:41

/src/jq/src/jv_parse.c
Line
Count
Source (jump to first uncovered line)
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <string.h>
4
#include <assert.h>
5
#include "jv.h"
6
#include "jv_dtoa.h"
7
#include "jv_unicode.h"
8
#include "jv_alloc.h"
9
#include "jv_dtoa.h"
10
11
typedef const char* presult;
12
13
#ifndef MAX_PARSING_DEPTH
14
1.91M
#define MAX_PARSING_DEPTH (256)
15
#endif
16
17
92.2M
#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
18
#ifdef __GNUC__
19
#define pfunc __attribute__((warn_unused_result)) presult
20
#else
21
#define pfunc presult
22
#endif
23
24
enum last_seen {
25
  JV_LAST_NONE = 0,
26
  JV_LAST_OPEN_ARRAY = '[',
27
  JV_LAST_OPEN_OBJECT = '{',
28
  JV_LAST_COLON = ':',
29
  JV_LAST_COMMA = ',',
30
  JV_LAST_VALUE = 'V',
31
};
32
33
struct jv_parser {
34
  const char* curr_buf;
35
  int curr_buf_length;
36
  int curr_buf_pos;
37
  int curr_buf_is_partial;
38
  int eof;
39
  unsigned bom_strip_position;
40
41
  int flags;
42
43
  jv* stack;                   // parser
44
  int stackpos;                // parser
45
  int stacklen;                // both (optimization; it's really pathlen for streaming)
46
  jv path;                     // streamer
47
  enum last_seen last_seen;    // streamer
48
  jv output;                   // streamer
49
  jv next;                     // both
50
51
  char* tokenbuf;
52
  int tokenpos;
53
  int tokenlen;
54
55
  int line, column;
56
57
  struct dtoa_context dtoa;
58
59
  enum {
60
    JV_PARSER_NORMAL,
61
    JV_PARSER_STRING,
62
    JV_PARSER_STRING_ESCAPE,
63
    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
64
  } st;
65
  unsigned int last_ch_was_ws:1;
66
};
67
68
69
1.75M
static void parser_init(struct jv_parser* p, int flags) {
70
1.75M
  p->flags = flags;
71
1.75M
  if ((p->flags & JV_PARSE_STREAMING)) {
72
3.73k
    p->path = jv_array();
73
1.75M
  } else {
74
1.75M
    p->path = jv_invalid();
75
1.75M
    p->flags &= ~(JV_PARSE_STREAM_ERRORS);
76
1.75M
  }
77
1.75M
  p->stack = 0;
78
1.75M
  p->stacklen = p->stackpos = 0;
79
1.75M
  p->last_seen = JV_LAST_NONE;
80
1.75M
  p->output = jv_invalid();
81
1.75M
  p->next = jv_invalid();
82
1.75M
  p->tokenbuf = 0;
83
1.75M
  p->tokenlen = p->tokenpos = 0;
84
1.75M
  if ((p->flags & JV_PARSE_SEQ))
85
293
    p->st = JV_PARSER_WAITING_FOR_RS;
86
1.75M
  else
87
1.75M
    p->st = JV_PARSER_NORMAL;
88
1.75M
  p->eof = 0;
89
1.75M
  p->curr_buf = 0;
90
1.75M
  p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
91
1.75M
  p->bom_strip_position = 0;
92
1.75M
  p->last_ch_was_ws = 0;
93
1.75M
  p->line = 1;
94
1.75M
  p->column = 0;
95
1.75M
  jvp_dtoa_context_init(&p->dtoa);
96
1.75M
}
97
98
1.86M
static void parser_reset(struct jv_parser* p) {
99
1.86M
  if ((p->flags & JV_PARSE_STREAMING)) {
100
5.79k
    jv_free(p->path);
101
5.79k
    p->path = jv_array();
102
5.79k
    p->stacklen = 0;
103
5.79k
  }
104
1.86M
  p->last_seen = JV_LAST_NONE;
105
1.86M
  jv_free(p->output);
106
1.86M
  p->output = jv_invalid();
107
1.86M
  jv_free(p->next);
108
1.86M
  p->next = jv_invalid();
109
1.92M
  for (int i=0; i<p->stackpos; i++)
110
55.1k
    jv_free(p->stack[i]);
111
1.86M
  p->stackpos = 0;
112
1.86M
  p->tokenpos = 0;
113
1.86M
  p->st = JV_PARSER_NORMAL;
114
1.86M
}
115
116
1.75M
static void parser_free(struct jv_parser* p) {
117
1.75M
  parser_reset(p);
118
1.75M
  jv_free(p->path);
119
1.75M
  jv_free(p->output);
120
1.75M
  jv_mem_free(p->stack);
121
1.75M
  jv_mem_free(p->tokenbuf);
122
1.75M
  jvp_dtoa_context_free(&p->dtoa);
123
1.75M
}
124
125
10.2M
static pfunc value(struct jv_parser* p, jv val) {
126
10.2M
  if ((p->flags & JV_PARSE_STREAMING)) {
127
19.0k
    if (jv_is_valid(p->next) || p->last_seen == JV_LAST_VALUE) {
128
109
      jv_free(val);
129
109
      return "Expected separator between values";
130
109
    }
131
18.8k
    if (p->stacklen > 0)
132
16.8k
      p->last_seen = JV_LAST_VALUE;
133
2.00k
    else
134
2.00k
      p->last_seen = JV_LAST_NONE;
135
10.2M
  } else {
136
10.2M
    if (jv_is_valid(p->next)) {
137
105
      jv_free(val);
138
105
      return "Expected separator between values";
139
105
    }
140
10.2M
  }
141
10.2M
  jv_free(p->next);
142
10.2M
  p->next = val;
143
10.2M
  return 0;
144
10.2M
}
145
146
1.96M
static void push(struct jv_parser* p, jv v) {
147
1.96M
  assert(p->stackpos <= p->stacklen);
148
1.96M
  if (p->stackpos == p->stacklen) {
149
14.3k
    p->stacklen = p->stacklen * 2 + 10;
150
14.3k
    p->stack = jv_mem_realloc(p->stack, p->stacklen * sizeof(jv));
151
14.3k
  }
152
1.96M
  assert(p->stackpos < p->stacklen);
153
1.96M
  p->stack[p->stackpos++] = v;
154
1.96M
}
155
156
10.5M
static pfunc parse_token(struct jv_parser* p, char ch) {
157
10.5M
  switch (ch) {
158
56.3k
  case '[':
159
56.3k
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
160
56.3k
    if (jv_is_valid(p->next)) return "Expected separator between values";
161
56.0k
    push(p, jv_array());
162
56.0k
    break;
163
164
1.86M
  case '{':
165
1.86M
    if (p->stackpos >= MAX_PARSING_DEPTH) return "Exceeds depth limit for parsing";
166
1.86M
    if (jv_is_valid(p->next)) return "Expected separator between values";
167
1.86M
    push(p, jv_object());
168
1.86M
    break;
169
170
44.5k
  case ':':
171
44.5k
    if (!jv_is_valid(p->next))
172
427
      return "Expected string key before ':'";
173
44.1k
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
174
1.01k
      return "':' not as part of an object";
175
43.1k
    if (jv_get_kind(p->next) != JV_KIND_STRING)
176
31
      return "Object keys must be strings";
177
43.0k
    push(p, p->next);
178
43.0k
    p->next = jv_invalid();
179
43.0k
    break;
180
181
6.70M
  case ',':
182
6.70M
    if (!jv_is_valid(p->next))
183
8.04k
      return "Expected value before ','";
184
6.69M
    if (p->stackpos == 0)
185
0
      return "',' not as part of an object or array";
186
6.69M
    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
187
6.65M
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
188
6.65M
      p->next = jv_invalid();
189
6.65M
    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
190
35.8k
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
191
35.8k
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
192
35.8k
                                              p->stack[p->stackpos-1], p->next);
193
35.8k
      p->stackpos--;
194
35.8k
      p->next = jv_invalid();
195
35.8k
    } else {
196
      // this case hits on input like {"a", "b"}
197
53
      return "Objects must consist of key:value pairs";
198
53
    }
199
6.69M
    break;
200
201
6.69M
  case ']':
202
28.8k
    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
203
420
      return "Unmatched ']'";
204
28.3k
    if (jv_is_valid(p->next)) {
205
20.7k
      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
206
20.7k
      p->next = jv_invalid();
207
20.7k
    } else {
208
7.58k
      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
209
        // this case hits on input like [1,2,3,]
210
62
        return "Expected another array element";
211
62
      }
212
7.58k
    }
213
28.3k
    jv_free(p->next);
214
28.3k
    p->next = p->stack[--p->stackpos];
215
28.3k
    break;
216
217
1.83M
  case '}':
218
1.83M
    if (p->stackpos == 0)
219
231
      return "Unmatched '}'";
220
1.83M
    if (jv_is_valid(p->next)) {
221
5.35k
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
222
29
        return "Objects must consist of key:value pairs";
223
5.32k
      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
224
5.32k
      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
225
5.32k
                                              p->stack[p->stackpos-1], p->next);
226
5.32k
      p->stackpos--;
227
5.32k
      p->next = jv_invalid();
228
1.83M
    } else {
229
1.83M
      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
230
44
        return "Unmatched '}'";
231
1.83M
      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
232
29
        return "Expected another key-value pair";
233
1.83M
    }
234
1.83M
    jv_free(p->next);
235
1.83M
    p->next = p->stack[--p->stackpos];
236
1.83M
    break;
237
10.5M
  }
238
10.5M
  return 0;
239
10.5M
}
240
241
30.2M
static pfunc stream_token(struct jv_parser* p, char ch) {
242
30.2M
  jv_kind k;
243
30.2M
  jv last;
244
245
30.2M
  switch (ch) {
246
30.2M
  case '[':
247
30.2M
    if (jv_is_valid(p->next))
248
58
      return "Expected a separator between values";
249
30.2M
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
250
      // Looks like {["foo"]}
251
8
      return "Expected string key after '{', not '['";
252
30.2M
    if (p->last_seen == JV_LAST_COMMA) {
253
36
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
254
36
      k = jv_get_kind(last);
255
36
      jv_free(last);
256
36
      if (k != JV_KIND_NUMBER)
257
        // Looks like {"x":"y",["foo"]}
258
2
        return "Expected string key after ',' in object, not '['";
259
36
    }
260
30.2M
    p->path = jv_array_append(p->path, jv_number(0)); // push
261
30.2M
    p->last_seen = JV_LAST_OPEN_ARRAY;
262
30.2M
    p->stacklen++;
263
30.2M
    break;
264
265
10.5k
  case '{':
266
10.5k
    if (p->last_seen == JV_LAST_VALUE)
267
5
      return "Expected a separator between values";
268
10.5k
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
269
      // Looks like {{"foo":"bar"}}
270
11
      return "Expected string key after '{', not '{'";
271
10.5k
    if (p->last_seen == JV_LAST_COMMA) {
272
14
      last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
273
14
      k = jv_get_kind(last);
274
14
      jv_free(last);
275
14
      if (k != JV_KIND_NUMBER)
276
        // Looks like {"x":"y",{"foo":"bar"}}
277
2
        return "Expected string key after ',' in object, not '{'";
278
14
    }
279
    // Push object key: null, since we don't know it yet
280
10.5k
    p->path = jv_array_append(p->path, jv_null()); // push
281
10.5k
    p->last_seen = JV_LAST_OPEN_OBJECT;
282
10.5k
    p->stacklen++;
283
10.5k
    break;
284
285
16.4k
  case ':':
286
16.4k
    last = jv_invalid();
287
16.4k
    if (p->stacklen == 0 || jv_get_kind(last = jv_array_get(jv_copy(p->path), p->stacklen - 1)) == JV_KIND_NUMBER) {
288
38
      jv_free(last);
289
38
      return "':' not as part of an object";
290
38
    }
291
16.3k
    jv_free(last);
292
16.3k
    if (!jv_is_valid(p->next) || p->last_seen == JV_LAST_NONE)
293
26
      return "Expected string key before ':'";
294
16.3k
    if (jv_get_kind(p->next) != JV_KIND_STRING)
295
7
      return "Object keys must be strings";
296
16.3k
    if (p->last_seen != JV_LAST_VALUE)
297
0
      return "':' should follow a key";
298
16.3k
    p->last_seen = JV_LAST_COLON;
299
16.3k
    p->path = jv_array_set(p->path, p->stacklen - 1, p->next);
300
16.3k
    p->next = jv_invalid();
301
16.3k
    break;
302
303
261
  case ',':
304
261
    if (p->last_seen != JV_LAST_VALUE)
305
14
      return "Expected value before ','";
306
247
    if (p->stacklen == 0)
307
0
      return "',' not as part of an object or array";
308
247
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
309
247
    k = jv_get_kind(last);
310
247
    if (k == JV_KIND_NUMBER) {
311
180
      int idx = jv_number_value(last);
312
313
180
      if (jv_is_valid(p->next)) {
314
169
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
315
169
        p->next = jv_invalid();
316
169
      }
317
180
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_number(idx + 1));
318
180
      p->last_seen = JV_LAST_COMMA;
319
180
    } else if (k == JV_KIND_STRING) {
320
62
      if (jv_is_valid(p->next)) {
321
33
        p->output = JV_ARRAY(jv_copy(p->path), p->next);
322
33
        p->next = jv_invalid();
323
33
      }
324
62
      p->path = jv_array_set(p->path, p->stacklen - 1, jv_null()); // ready for another key:value pair
325
62
      p->last_seen = JV_LAST_COMMA;
326
62
    } else {
327
5
      assert(k == JV_KIND_NULL);
328
      // this case hits on input like {,}
329
      // make sure to handle input like {"a", "b"} and {"a":, ...}
330
5
      jv_free(last);
331
5
      return "Objects must consist of key:value pairs";
332
5
    }
333
242
    jv_free(last);
334
242
    break;
335
336
287
  case ']':
337
287
    if (p->stacklen == 0)
338
9
      return "Unmatched ']' at the top-level";
339
278
    if (p->last_seen == JV_LAST_COMMA)
340
2
      return "Expected another array element";
341
276
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
342
246
      assert(!jv_is_valid(p->next));
343
344
276
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
345
276
    k = jv_get_kind(last);
346
276
    jv_free(last);
347
348
276
    if (k != JV_KIND_NUMBER)
349
8
      return "Unmatched ']' in the middle of an object";
350
268
    if (jv_is_valid(p->next)) {
351
18
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
352
18
      p->next = jv_invalid();
353
250
    } else if (p->last_seen != JV_LAST_OPEN_ARRAY) {
354
4
      p->output = JV_ARRAY(jv_copy(p->path));
355
4
    }
356
357
268
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
358
    //assert(!jv_is_valid(p->next));
359
268
    jv_free(p->next);
360
268
    p->next = jv_invalid();
361
362
268
    if (p->last_seen == JV_LAST_OPEN_ARRAY)
363
246
      p->output = JV_ARRAY(jv_copy(p->path), jv_array()); // Empty arrays are leaves
364
365
268
    if (p->stacklen == 0)
366
26
      p->last_seen = JV_LAST_NONE;
367
242
    else
368
242
      p->last_seen = JV_LAST_VALUE;
369
268
    break;
370
371
196
  case '}':
372
196
    if (p->stacklen == 0)
373
9
      return "Unmatched '}' at the top-level";
374
187
    if (p->last_seen == JV_LAST_COMMA)
375
2
      return "Expected another key:value pair";
376
185
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
377
142
      assert(!jv_is_valid(p->next));
378
379
185
    last = jv_array_get(jv_copy(p->path), p->stacklen - 1);
380
185
    k = jv_get_kind(last);
381
185
    jv_free(last);
382
185
    if (k == JV_KIND_NUMBER)
383
23
      return "Unmatched '}' in the middle of an array";
384
385
162
    if (jv_is_valid(p->next)) {
386
12
      if (k != JV_KIND_STRING)
387
6
        return "Objects must consist of key:value pairs";
388
6
      p->output = JV_ARRAY(jv_copy(p->path), p->next, jv_true());
389
6
      p->next = jv_invalid();
390
150
    } else {
391
      // Perhaps {"a":[]}
392
150
      if (p->last_seen == JV_LAST_COLON)
393
        // Looks like {"a":}
394
6
        return "Missing value in key:value pair";
395
144
      if (p->last_seen == JV_LAST_COMMA)
396
        // Looks like {"a":0,}
397
0
        return "Expected another key-value pair";
398
144
      if (p->last_seen == JV_LAST_OPEN_ARRAY)
399
0
        return "Unmatched '}' in the middle of an array";
400
144
      if (p->last_seen != JV_LAST_VALUE && p->last_seen != JV_LAST_OPEN_OBJECT)
401
0
        return "Unmatched '}'";
402
144
      if (p->last_seen != JV_LAST_OPEN_OBJECT)
403
2
        p->output = JV_ARRAY(jv_copy(p->path));
404
144
    }
405
150
    p->path = jv_array_slice(p->path, 0, --(p->stacklen)); // pop
406
150
    jv_free(p->next);
407
150
    p->next = jv_invalid();
408
409
150
    if (p->last_seen == JV_LAST_OPEN_OBJECT)
410
142
      p->output = JV_ARRAY(jv_copy(p->path), jv_object()); // Empty arrays are leaves
411
412
150
    if (p->stacklen == 0)
413
39
      p->last_seen = JV_LAST_NONE;
414
111
    else
415
111
      p->last_seen = JV_LAST_VALUE;
416
150
    break;
417
30.2M
  }
418
30.2M
  return 0;
419
30.2M
}
420
421
350M
static void tokenadd(struct jv_parser* p, char c) {
422
350M
  assert(p->tokenpos <= p->tokenlen);
423
350M
  if (p->tokenpos >= (p->tokenlen - 1)) {
424
1.76M
    p->tokenlen = p->tokenlen*2 + 256;
425
1.76M
    p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen);
426
1.76M
  }
427
350M
  assert(p->tokenpos < p->tokenlen);
428
350M
  p->tokenbuf[p->tokenpos++] = c;
429
350M
}
430
431
25.7k
static int unhex4(char* hex) {
432
25.7k
  int r = 0;
433
122k
  for (int i=0; i<4; i++) {
434
99.5k
    char c = *hex++;
435
99.5k
    int n;
436
99.5k
    if ('0' <= c && c <= '9') n = c - '0';
437
49.1k
    else if ('a' <= c && c <= 'f') n = c - 'a' + 10;
438
21.6k
    else if ('A' <= c && c <= 'F') n = c - 'A' + 10;
439
2.50k
    else return -1;
440
97.0k
    r <<= 4;
441
97.0k
    r |= n;
442
97.0k
  }
443
23.2k
  return r;
444
25.7k
}
445
446
166k
static pfunc found_string(struct jv_parser* p) {
447
166k
  char* in = p->tokenbuf;
448
166k
  char* out = p->tokenbuf;
449
166k
  char* end = p->tokenbuf + p->tokenpos;
450
451
77.9M
  while (in < end) {
452
77.8M
    char c = *in++;
453
77.8M
    if (c == '\\') {
454
81.7k
      if (in >= end)
455
0
        return "Expected escape character at end of string";
456
81.7k
      c = *in++;
457
81.7k
      switch (c) {
458
10.3k
      case '\\':
459
11.6k
      case '"':
460
12.9k
      case '/': *out++ = c;    break;
461
1.67k
      case 'b': *out++ = '\b'; break;
462
1.55k
      case 'f': *out++ = '\f'; break;
463
1.22k
      case 't': *out++ = '\t'; break;
464
2.10k
      case 'n': *out++ = '\n'; break;
465
1.28k
      case 'r': *out++ = '\r'; break;
466
467
20.3k
      case 'u':
468
        /* ahh, the complicated case */
469
20.3k
        if (in + 4 > end)
470
1.01k
          return "Invalid \\uXXXX escape";
471
19.3k
        int hexvalue = unhex4(in);
472
19.3k
        if (hexvalue < 0)
473
1.35k
          return "Invalid characters in \\uXXXX escape";
474
17.9k
        unsigned long codepoint = (unsigned long)hexvalue;
475
17.9k
        in += 4;
476
17.9k
        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
477
          /* who thought UTF-16 surrogate pairs were a good idea? */
478
11.4k
          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
479
4.97k
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
480
6.44k
          unsigned long surrogate = unhex4(in+2);
481
6.44k
          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
482
3.29k
            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
483
3.15k
          in += 6;
484
3.15k
          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
485
3.15k
                                 |(surrogate - 0xDC00));
486
3.15k
        }
487
9.68k
        if (codepoint > 0x10FFFF)
488
0
          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
489
9.68k
        out += jvp_utf8_encode(codepoint, out);
490
9.68k
        break;
491
492
40.6k
      default:
493
40.6k
        return "Invalid escape";
494
81.7k
      }
495
77.7M
    } else {
496
77.7M
      if (c >= 0 && c <= 0x001f)
497
214
        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
498
77.7M
      *out++ = c;
499
77.7M
    }
500
77.8M
  }
501
115k
  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
502
115k
  p->tokenpos = 0;
503
115k
  return 0;
504
115k
}
505
506
42.7M
static pfunc check_literal(struct jv_parser* p) {
507
42.7M
  if (p->tokenpos == 0) return 0;
508
509
10.1M
  const char* pattern = 0;
510
10.1M
  int plen;
511
10.1M
  jv v;
512
10.1M
  switch (p->tokenbuf[0]) {
513
1.75k
  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
514
6.74k
  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
515
56.5k
  case 'n':
516
    // if it starts with 'n', it could be a literal "nan"
517
56.5k
    if (p->tokenbuf[1] == 'u') {
518
54.6k
      pattern = "null"; plen = 4; v = jv_null();
519
54.6k
    }
520
10.1M
  }
521
10.1M
  if (pattern) {
522
63.1k
    if (p->tokenpos != plen) return "Invalid literal";
523
318k
    for (int i=0; i<plen; i++)
524
256k
      if (p->tokenbuf[i] != pattern[i])
525
227
        return "Invalid literal";
526
62.3k
    TRY(value(p, v));
527
10.0M
  } else {
528
    // FIXME: better parser
529
10.0M
    p->tokenbuf[p->tokenpos] = 0;
530
10.0M
#ifdef USE_DECNUM
531
10.0M
    jv number = jv_number_with_literal(p->tokenbuf);
532
10.0M
    if (jv_get_kind(number) == JV_KIND_INVALID) {
533
12.5k
      return "Invalid numeric literal";
534
12.5k
    }
535
10.0M
    TRY(value(p, number));
536
#else
537
    char *end = 0;
538
    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
539
    if (end == 0 || *end != 0) {
540
      return "Invalid numeric literal";
541
    }
542
    TRY(value(p, jv_number(d)));
543
#endif
544
10.0M
  }
545
10.1M
  p->tokenpos = 0;
546
10.1M
  return 0;
547
10.1M
}
548
549
typedef enum {
550
  LITERAL,
551
  WHITESPACE,
552
  STRUCTURE,
553
  QUOTE,
554
  INVALID
555
} chclass;
556
557
300M
static chclass classify(char c) {
558
300M
  switch (c) {
559
17.9k
  case ' ':
560
22.6k
  case '\t':
561
36.5k
  case '\r':
562
51.3k
  case '\n':
563
51.3k
    return WHITESPACE;
564
193k
  case '"':
565
193k
    return QUOTE;
566
30.3M
  case '[':
567
37.0M
  case ',':
568
37.0M
  case ']':
569
38.9M
  case '{':
570
38.9M
  case ':':
571
40.8M
  case '}':
572
40.8M
    return STRUCTURE;
573
259M
  default:
574
259M
    return LITERAL;
575
300M
  }
576
300M
}
577
578
579
static const presult OK = "output produced";
580
581
230M
static int parse_check_done(struct jv_parser* p, jv* out) {
582
230M
  if (p->stackpos == 0 && jv_is_valid(p->next)) {
583
3.67M
    *out = p->next;
584
3.67M
    p->next = jv_invalid();
585
3.67M
    return 1;
586
226M
  } else {
587
226M
    return 0;
588
226M
  }
589
230M
}
590
591
112M
static int stream_check_done(struct jv_parser* p, jv* out) {
592
112M
  if (p->stacklen == 0 && jv_is_valid(p->next)) {
593
794
    *out = JV_ARRAY(jv_copy(p->path),p->next);
594
794
    p->next = jv_invalid();
595
794
    return 1;
596
112M
  } else if (jv_is_valid(p->output)) {
597
628
    if (jv_array_length(jv_copy(p->output)) > 2) {
598
      // At end of an array or object, necessitating one more output by
599
      // which to indicate this
600
24
      *out = jv_array_slice(jv_copy(p->output), 0, 2);
601
24
      p->output = jv_array_slice(p->output, 0, 1);      // arrange one more output
602
604
    } else {
603
      // No further processing needed
604
604
      *out = p->output;
605
604
      p->output = jv_invalid();
606
604
    }
607
628
    return 1;
608
112M
  } else {
609
112M
    return 0;
610
112M
  }
611
112M
}
612
613
37
static int seq_check_truncation(struct jv_parser* p) {
614
37
  return (!p->last_ch_was_ws && (p->stackpos > 0 || p->tokenpos > 0 || jv_get_kind(p->next) == JV_KIND_NUMBER));
615
37
}
616
617
43
static int stream_seq_check_truncation(struct jv_parser* p) {
618
43
  jv_kind k = jv_get_kind(p->next);
619
43
  return (p->stacklen > 0 || k == JV_KIND_NUMBER || k == JV_KIND_TRUE || k == JV_KIND_FALSE || k == JV_KIND_NULL);
620
43
}
621
622
11
static int parse_is_top_num(struct jv_parser* p) {
623
11
  return (p->stackpos == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
624
11
}
625
626
19
static int stream_is_top_num(struct jv_parser* p) {
627
19
  return (p->stacklen == 0 && jv_get_kind(p->next) == JV_KIND_NUMBER);
628
19
}
629
630
#define check_done(p, o) \
631
342M
   (((p)->flags & JV_PARSE_STREAMING) ? stream_check_done((p), (o)) : parse_check_done((p), (o)))
632
633
#define token(p, ch) \
634
   (((p)->flags & JV_PARSE_STREAMING) ? stream_token((p), (ch)) : parse_token((p), (ch)))
635
636
#define check_truncation(p) \
637
80
   (((p)->flags & JV_PARSE_STREAMING) ? stream_seq_check_truncation((p)) : seq_check_truncation((p)))
638
639
#define is_top_num(p) \
640
30
   (((p)->flags & JV_PARSE_STREAMING) ? stream_is_top_num((p)) : parse_is_top_num((p)))
641
642
392M
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
643
392M
  p->column++;
644
392M
  if (ch == '\n') {
645
193k
    p->line++;
646
193k
    p->column = 0;
647
193k
  }
648
392M
  if ((p->flags & JV_PARSE_SEQ)
649
392M
      && ch == '\036' /* ASCII RS; see draft-ietf-json-sequence-07 */) {
650
80
    if (check_truncation(p)) {
651
51
      if (check_literal(p) == 0 && is_top_num(p))
652
1
        return "Potentially truncated top-level numeric value";
653
50
      return "Truncated value";
654
51
    }
655
29
    TRY(check_literal(p));
656
16
    if (p->st == JV_PARSER_NORMAL && check_done(p, out))
657
4
      return OK;
658
    // shouldn't happen?
659
12
    assert(!jv_is_valid(*out));
660
12
    parser_reset(p);
661
12
    jv_free(*out);
662
12
    *out = jv_invalid();
663
12
    return OK;
664
12
  }
665
392M
  presult answer = 0;
666
392M
  p->last_ch_was_ws = 0;
667
392M
  if (p->st == JV_PARSER_NORMAL) {
668
300M
    chclass cls = classify(ch);
669
300M
    if (cls == WHITESPACE)
670
51.3k
      p->last_ch_was_ws = 1;
671
300M
    if (cls != LITERAL) {
672
41.0M
      TRY(check_literal(p));
673
41.0M
      if (check_done(p, out)) answer = OK;
674
41.0M
    }
675
300M
    switch (cls) {
676
259M
    case LITERAL:
677
259M
      tokenadd(p, ch);
678
259M
      break;
679
51.2k
    case WHITESPACE:
680
51.2k
      break;
681
193k
    case QUOTE:
682
193k
      p->st = JV_PARSER_STRING;
683
193k
      break;
684
40.8M
    case STRUCTURE:
685
40.8M
      TRY(token(p, ch));
686
40.8M
      break;
687
40.8M
    case INVALID:
688
0
      return "Invalid character";
689
300M
    }
690
300M
    if (check_done(p, out)) answer = OK;
691
300M
  } else {
692
91.2M
    if (ch == '"' && p->st == JV_PARSER_STRING) {
693
166k
      TRY(found_string(p));
694
115k
      p->st = JV_PARSER_NORMAL;
695
115k
      if (check_done(p, out)) answer = OK;
696
91.0M
    } else {
697
91.0M
      tokenadd(p, ch);
698
91.0M
      if (ch == '\\' && p->st == JV_PARSER_STRING) {
699
1.03M
        p->st = JV_PARSER_STRING_ESCAPE;
700
90.0M
      } else {
701
90.0M
        p->st = JV_PARSER_STRING;
702
90.0M
      }
703
91.0M
    }
704
91.2M
  }
705
392M
  return answer;
706
392M
}
707
708
3.13k
struct jv_parser* jv_parser_new(int flags) {
709
3.13k
  struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
710
3.13k
  parser_init(p, flags);
711
3.13k
  p->flags = flags;
712
3.13k
  return p;
713
3.13k
}
714
715
3.13k
void jv_parser_free(struct jv_parser* p) {
716
3.13k
  parser_free(p);
717
3.13k
  jv_mem_free(p);
718
3.13k
}
719
720
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
721
722
0
int jv_parser_remaining(struct jv_parser* p) {
723
0
  if (p->curr_buf == 0)
724
0
    return 0;
725
0
  return (p->curr_buf_length - p->curr_buf_pos);
726
0
}
727
728
1.77M
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
729
1.77M
  assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
730
1.77M
         && "previous buffer not exhausted");
731
3.52M
  while (length > 0 && p->bom_strip_position < sizeof(UTF8_BOM)) {
732
1.75M
    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
733
      // matched a BOM character
734
1.25k
      buf++;
735
1.25k
      length--;
736
1.25k
      p->bom_strip_position++;
737
1.75M
    } else {
738
1.75M
      if (p->bom_strip_position == 0) {
739
        // no BOM in this document
740
1.75M
        p->bom_strip_position = sizeof(UTF8_BOM);
741
1.75M
      } else {
742
        // malformed BOM (prefix present, rest missing)
743
1.18k
        p->bom_strip_position = 0xff;
744
1.18k
      }
745
1.75M
    }
746
1.75M
  }
747
1.77M
  p->curr_buf = buf;
748
1.77M
  p->curr_buf_length = length;
749
1.77M
  p->curr_buf_pos = 0;
750
1.77M
  p->curr_buf_is_partial = is_partial;
751
1.77M
}
752
753
static jv make_error(struct jv_parser*, const char *, ...) JV_PRINTF_LIKE(2, 3);
754
755
107k
static jv make_error(struct jv_parser* p, const char *fmt, ...) {
756
107k
  va_list ap;
757
107k
  va_start(ap, fmt);
758
107k
  jv e = jv_string_vfmt(fmt, ap);
759
107k
  va_end(ap);
760
107k
  if ((p->flags & JV_PARSE_STREAM_ERRORS))
761
516
    return JV_ARRAY(e, jv_copy(p->path));
762
107k
  return jv_invalid_with_msg(e);
763
107k
}
764
765
7.07M
jv jv_parser_next(struct jv_parser* p) {
766
7.07M
  if (p->eof)
767
1.63M
    return jv_invalid();
768
5.44M
  if (!p->curr_buf)
769
57
    return jv_invalid(); // Need a buffer
770
5.44M
  if (p->bom_strip_position == 0xff) {
771
1.22k
    if (!(p->flags & JV_PARSE_SEQ))
772
1.10k
      return jv_invalid_with_msg(jv_string("Malformed BOM"));
773
116
    p->st =JV_PARSER_WAITING_FOR_RS;
774
116
    parser_reset(p);
775
116
  }
776
5.44M
  jv value = jv_invalid();
777
5.44M
  if ((p->flags & JV_PARSE_STREAMING) && stream_check_done(p, &value))
778
8
    return value;
779
5.44M
  char ch;
780
5.44M
  presult msg = 0;
781
400M
  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
782
395M
    ch = p->curr_buf[p->curr_buf_pos++];
783
395M
    if (p->st == JV_PARSER_WAITING_FOR_RS) {
784
3.17M
      if (ch == '\n') {
785
4.22k
        p->line++;
786
4.22k
        p->column = 0;
787
3.17M
      } else {
788
3.17M
        p->column++;
789
3.17M
      }
790
3.17M
      if (ch == '\036')
791
106
        p->st = JV_PARSER_NORMAL;
792
3.17M
      continue; // need to resync, wait for RS
793
3.17M
    }
794
392M
    msg = scan(p, ch, &value);
795
392M
  }
796
5.44M
  if (msg == OK) {
797
3.67M
    return value;
798
3.67M
  } else if (msg) {
799
69.8k
    jv_free(value);
800
69.8k
    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
801
      // Skip to the next RS
802
126
      p->st = JV_PARSER_WAITING_FOR_RS;
803
126
      value = make_error(p, "%s at line %d, column %d (need RS to resync)", msg, p->line, p->column);
804
126
      parser_reset(p);
805
126
      return value;
806
126
    }
807
69.7k
    value = make_error(p, "%s at line %d, column %d", msg, p->line, p->column);
808
69.7k
    parser_reset(p);
809
69.7k
    if (!(p->flags & JV_PARSE_SEQ)) {
810
      // We're not parsing a JSON text sequence; throw this buffer away.
811
      // XXX We should fail permanently here.
812
69.6k
      p->curr_buf = 0;
813
69.6k
      p->curr_buf_pos = 0;
814
69.6k
    } // Else ch must be RS; don't clear buf so we can start parsing again after this ch
815
69.7k
    return value;
816
1.70M
  } else if (p->curr_buf_is_partial) {
817
11.9k
    assert(p->curr_buf_pos == p->curr_buf_length);
818
    // need another buffer
819
11.9k
    return jv_invalid();
820
1.68M
  } else {
821
    // at EOF
822
1.68M
    p->eof = 1;
823
1.68M
    assert(p->curr_buf_pos == p->curr_buf_length);
824
1.68M
    jv_free(value);
825
1.68M
    if (p->st == JV_PARSER_WAITING_FOR_RS)
826
107
      return make_error(p, "Unfinished abandoned text at EOF at line %d, column %d", p->line, p->column);
827
1.68M
    if (p->st != JV_PARSER_NORMAL) {
828
26.8k
      value = make_error(p, "Unfinished string at EOF at line %d, column %d", p->line, p->column);
829
26.8k
      parser_reset(p);
830
26.8k
      p->st = JV_PARSER_WAITING_FOR_RS;
831
26.8k
      return value;
832
26.8k
    }
833
1.66M
    if ((msg = check_literal(p))) {
834
6.11k
      value = make_error(p, "%s at EOF at line %d, column %d", msg, p->line, p->column);
835
6.11k
      parser_reset(p);
836
6.11k
      p->st = JV_PARSER_WAITING_FOR_RS;
837
6.11k
      return value;
838
6.11k
    }
839
1.65M
    if (((p->flags & JV_PARSE_STREAMING) && p->stacklen != 0) ||
840
1.65M
        (!(p->flags & JV_PARSE_STREAMING) && p->stackpos != 0)) {
841
5.06k
      value = make_error(p, "Unfinished JSON term at EOF at line %d, column %d", p->line, p->column);
842
5.06k
      parser_reset(p);
843
5.06k
      p->st = JV_PARSER_WAITING_FOR_RS;
844
5.06k
      return value;
845
5.06k
    }
846
    // p->next is either invalid (nothing here, but no syntax error)
847
    // or valid (this is the value). either way it's the thing to return
848
1.65M
    if ((p->flags & JV_PARSE_STREAMING) && jv_is_valid(p->next)) {
849
1.20k
      value = JV_ARRAY(jv_copy(p->path), p->next); // except in streaming mode we've got to make it [path,value]
850
1.64M
    } else {
851
1.64M
      value = p->next;
852
1.64M
    }
853
1.65M
    p->next = jv_invalid();
854
1.65M
    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
855
4
      jv_free(value);
856
4
      return make_error(p, "Potentially truncated top-level numeric value at EOF at line %d, column %d", p->line, p->column);
857
4
    }
858
1.65M
    return value;
859
1.65M
  }
860
5.44M
}
861
862
1.75M
jv jv_parse_sized_custom_flags(const char* string, int length, int flags) {
863
1.75M
  struct jv_parser parser;
864
1.75M
  parser_init(&parser, flags);
865
1.75M
  jv_parser_set_buf(&parser, string, length, 0);
866
1.75M
  jv value = jv_parser_next(&parser);
867
1.75M
  if (jv_is_valid(value)) {
868
1.65M
    jv next = jv_parser_next(&parser);
869
1.65M
    if (jv_is_valid(next)) {
870
      // multiple JSON values, we only wanted one
871
794
      jv_free(value);
872
794
      jv_free(next);
873
794
      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
874
1.65M
    } else if (jv_invalid_has_msg(jv_copy(next))) {
875
      // parser error after the first JSON value
876
6.11k
      jv_free(value);
877
6.11k
      value = next;
878
1.64M
    } else {
879
      // a single valid JSON value
880
1.64M
      jv_free(next);
881
1.64M
    }
882
1.65M
  } else if (jv_invalid_has_msg(jv_copy(value))) {
883
    // parse error, we'll return it
884
100k
  } else {
885
    // no value at all
886
3.11k
    jv_free(value);
887
3.11k
    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
888
3.11k
  }
889
1.75M
  parser_free(&parser);
890
891
1.75M
  if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
892
111k
    jv msg = jv_invalid_get_msg(value);
893
111k
    value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
894
111k
                                              jv_string_value(msg),
895
111k
                                              string));
896
111k
    jv_free(msg);
897
111k
  }
898
1.75M
  return value;
899
1.75M
}
900
901
1.75M
jv jv_parse_sized(const char* string, int length) {
902
1.75M
  return jv_parse_sized_custom_flags(string, length, 0);
903
1.75M
}
904
905
48.5k
jv jv_parse(const char* string) {
906
48.5k
  return jv_parse_sized(string, strlen(string));
907
48.5k
}
908
909
6.59k
jv jv_parse_custom_flags(const char* string, int flags) {
910
6.59k
  return jv_parse_sized_custom_flags(string, strlen(string), flags);
911
6.59k
}