/src/postgres/src/common/jsonapi.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * jsonapi.c |
4 | | * JSON parser and lexer interfaces |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * IDENTIFICATION |
10 | | * src/common/jsonapi.c |
11 | | * |
12 | | *------------------------------------------------------------------------- |
13 | | */ |
14 | | #ifndef FRONTEND |
15 | | #include "postgres.h" |
16 | | #else |
17 | | #include "postgres_fe.h" |
18 | | #endif |
19 | | |
20 | | #include "common/jsonapi.h" |
21 | | #include "mb/pg_wchar.h" |
22 | | #include "port/pg_lfind.h" |
23 | | |
24 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
25 | | #include "pqexpbuffer.h" |
26 | | #else |
27 | | #include "lib/stringinfo.h" |
28 | | #include "miscadmin.h" |
29 | | #endif |
30 | | |
31 | | /* |
32 | | * By default, we will use palloc/pfree along with StringInfo. In libpq, |
33 | | * use malloc and PQExpBuffer, and return JSON_OUT_OF_MEMORY on out-of-memory. |
34 | | */ |
35 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
36 | | |
37 | | #define STRDUP(s) strdup(s) |
38 | | #define ALLOC(size) malloc(size) |
39 | | #define ALLOC0(size) calloc(1, size) |
40 | | #define REALLOC realloc |
41 | | #define FREE(s) free(s) |
42 | | |
43 | | #define jsonapi_appendStringInfo appendPQExpBuffer |
44 | | #define jsonapi_appendBinaryStringInfo appendBinaryPQExpBuffer |
45 | | #define jsonapi_appendStringInfoChar appendPQExpBufferChar |
46 | | /* XXX should we add a macro version to PQExpBuffer? */ |
47 | | #define jsonapi_appendStringInfoCharMacro appendPQExpBufferChar |
48 | | #define jsonapi_makeStringInfo createPQExpBuffer |
49 | | #define jsonapi_initStringInfo initPQExpBuffer |
50 | | #define jsonapi_resetStringInfo resetPQExpBuffer |
51 | | #define jsonapi_termStringInfo termPQExpBuffer |
52 | | #define jsonapi_destroyStringInfo destroyPQExpBuffer |
53 | | |
54 | | #else /* !JSONAPI_USE_PQEXPBUFFER */ |
55 | | |
56 | 0 | #define STRDUP(s) pstrdup(s) |
57 | 0 | #define ALLOC(size) palloc(size) |
58 | 1.53k | #define ALLOC0(size) palloc0(size) |
59 | 0 | #define REALLOC repalloc |
60 | | |
61 | | #ifdef FRONTEND |
62 | | #define FREE pfree |
63 | | #else |
64 | | /* |
65 | | * Backend pfree() doesn't handle NULL pointers like the frontend's does; smooth |
66 | | * that over to reduce mental gymnastics. Avoid multiple evaluation of the macro |
67 | | * argument to avoid future hair-pulling. |
68 | | */ |
69 | 2 | #define FREE(s) do { \ |
70 | 2 | void *__v = (s); \ |
71 | 2 | if (__v) \ |
72 | 2 | pfree(__v); \ |
73 | 2 | } while (0) |
74 | | #endif |
75 | | |
76 | 0 | #define jsonapi_appendStringInfo appendStringInfo |
77 | 13.2k | #define jsonapi_appendBinaryStringInfo appendBinaryStringInfo |
78 | 29.4k | #define jsonapi_appendStringInfoChar appendStringInfoChar |
79 | 0 | #define jsonapi_appendStringInfoCharMacro appendStringInfoCharMacro |
80 | 1.53k | #define jsonapi_makeStringInfo makeStringInfo |
81 | 0 | #define jsonapi_initStringInfo initStringInfo |
82 | 1.43k | #define jsonapi_resetStringInfo resetStringInfo |
83 | 0 | #define jsonapi_termStringInfo(s) pfree((s)->data) |
84 | 0 | #define jsonapi_destroyStringInfo destroyStringInfo |
85 | | |
86 | | #endif /* JSONAPI_USE_PQEXPBUFFER */ |
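/*
 * Editor's sketch (not part of jsonapi.c): with the abstraction above, shared
 * code can build buffers without knowing which allocator is underneath.  A
 * hypothetical helper like this compiles to PQExpBuffer calls (OOM reported
 * in-band) under JSONAPI_USE_PQEXPBUFFER, and to StringInfo calls (OOM
 * throws or exits) otherwise.
 */
static void
append_quoted(jsonapi_StrValType *buf, const char *s)
{
	jsonapi_appendStringInfoChar(buf, '"');
	jsonapi_appendStringInfo(buf, "%s", s);
	jsonapi_appendStringInfoChar(buf, '"');
}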
87 | | |
88 | | /* |
89 | | * The context of the parser is maintained by the recursive descent |
90 | | * mechanism, but is passed explicitly to the error reporting routine |
91 | | * for better diagnostics. |
92 | | */ |
93 | | typedef enum /* contexts of JSON parser */ |
94 | | { |
95 | | JSON_PARSE_VALUE, /* expecting a value */ |
96 | | JSON_PARSE_STRING, /* expecting a string (for a field name) */ |
97 | | JSON_PARSE_ARRAY_START, /* saw '[', expecting value or ']' */ |
98 | | JSON_PARSE_ARRAY_NEXT, /* saw array element, expecting ',' or ']' */ |
99 | | JSON_PARSE_OBJECT_START, /* saw '{', expecting label or '}' */ |
100 | | JSON_PARSE_OBJECT_LABEL, /* saw object label, expecting ':' */ |
101 | | JSON_PARSE_OBJECT_NEXT, /* saw object value, expecting ',' or '}' */ |
102 | | JSON_PARSE_OBJECT_COMMA, /* saw object ',', expecting next label */ |
103 | | JSON_PARSE_END, /* saw the end of a document, expect nothing */ |
104 | | } JsonParseContext; |
105 | | |
106 | | /* |
107 | | * Setup for table-driven parser. |
108 | | * These enums need to be separate from the JsonTokenType and from each other |
109 | | * so we can have all of them on the prediction stack, which consists of |
110 | | * tokens, non-terminals, and semantic action markers. |
111 | | */ |
112 | | |
113 | | enum JsonNonTerminal |
114 | | { |
115 | | JSON_NT_JSON = 32, |
116 | | JSON_NT_ARRAY_ELEMENTS, |
117 | | JSON_NT_MORE_ARRAY_ELEMENTS, |
118 | | JSON_NT_KEY_PAIRS, |
119 | | JSON_NT_MORE_KEY_PAIRS, |
120 | | }; |
121 | | |
122 | | enum JsonParserSem |
123 | | { |
124 | | JSON_SEM_OSTART = 64, |
125 | | JSON_SEM_OEND, |
126 | | JSON_SEM_ASTART, |
127 | | JSON_SEM_AEND, |
128 | | JSON_SEM_OFIELD_INIT, |
129 | | JSON_SEM_OFIELD_START, |
130 | | JSON_SEM_OFIELD_END, |
131 | | JSON_SEM_AELEM_START, |
132 | | JSON_SEM_AELEM_END, |
133 | | JSON_SEM_SCALAR_INIT, |
134 | | JSON_SEM_SCALAR_CALL, |
135 | | }; |
136 | | |
137 | | /* |
138 | | * struct containing the 3 stacks used in non-recursive parsing, |
139 | | * and the token and value for scalars that need to be preserved |
140 | | * across calls. |
141 | | * |
142 | | * typedef appears in jsonapi.h |
143 | | */ |
144 | | struct JsonParserStack |
145 | | { |
146 | | int stack_size; |
147 | | char *prediction; |
148 | | size_t pred_index; |
149 | | /* these two are indexed by lex_level */ |
150 | | char **fnames; |
151 | | bool *fnull; |
152 | | JsonTokenType scalar_tok; |
153 | | char *scalar_val; |
154 | | }; |
155 | | |
156 | | /* |
157 | | * struct containing state used when there is a possible partial token at the |
158 | | * end of a json chunk when we are doing incremental parsing. |
159 | | * |
160 | | * typedef appears in jsonapi.h |
161 | | */ |
162 | | struct JsonIncrementalState |
163 | | { |
164 | | bool started; |
165 | | bool is_last_chunk; |
166 | | bool partial_completed; |
167 | | jsonapi_StrValType partial_token; |
168 | | }; |
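/*
 * Editor's note: if, say, {"key": "value"} arrives as the two chunks
 * '{"key": "val' and 'ue"}', the lexer cannot tell whether "val ends the
 * string, so those bytes are saved in partial_token.  The next call appends
 * input until an unescaped '"' completes the token, which is then lexed via
 * a recursive call to json_lex() (see the partial-token handling below).
 */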
169 | | |
170 | | /* |
171 | | * constants and macros used in the nonrecursive parser |
172 | | */ |
173 | | #define JSON_NUM_TERMINALS 13 |
174 | | #define JSON_NUM_NONTERMINALS 5 |
175 | 0 | #define JSON_NT_OFFSET JSON_NT_JSON |
176 | | /* for indexing the table */ |
177 | 0 | #define OFS(NT) (NT) - JSON_NT_OFFSET |
178 | | /* classify items we get off the stack */ |
179 | 0 | #define IS_SEM(x) ((x) & 0x40) |
180 | 0 | #define IS_NT(x) ((x) & 0x20) |
181 | | |
182 | | /* |
183 | | * These productions are stored in reverse order right to left so that when |
184 | | * they are pushed on the stack what we expect next is at the top of the stack. |
185 | | */ |
186 | | static char JSON_PROD_EPSILON[] = {0}; /* epsilon - an empty production */ |
187 | | |
188 | | /* JSON -> string */ |
189 | | static char JSON_PROD_SCALAR_STRING[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_STRING, JSON_SEM_SCALAR_INIT, 0}; |
190 | | |
191 | | /* JSON -> number */ |
192 | | static char JSON_PROD_SCALAR_NUMBER[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NUMBER, JSON_SEM_SCALAR_INIT, 0}; |
193 | | |
194 | | /* JSON -> 'true' */ |
195 | | static char JSON_PROD_SCALAR_TRUE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_TRUE, JSON_SEM_SCALAR_INIT, 0}; |
196 | | |
197 | | /* JSON -> 'false' */ |
198 | | static char JSON_PROD_SCALAR_FALSE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_FALSE, JSON_SEM_SCALAR_INIT, 0}; |
199 | | |
200 | | /* JSON -> 'null' */ |
201 | | static char JSON_PROD_SCALAR_NULL[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NULL, JSON_SEM_SCALAR_INIT, 0}; |
202 | | |
203 | | /* JSON -> '{' KEY_PAIRS '}' */ |
204 | | static char JSON_PROD_OBJECT[] = {JSON_SEM_OEND, JSON_TOKEN_OBJECT_END, JSON_NT_KEY_PAIRS, JSON_TOKEN_OBJECT_START, JSON_SEM_OSTART, 0}; |
205 | | |
206 | | /* JSON -> '[' ARRAY_ELEMENTS ']' */ |
207 | | static char JSON_PROD_ARRAY[] = {JSON_SEM_AEND, JSON_TOKEN_ARRAY_END, JSON_NT_ARRAY_ELEMENTS, JSON_TOKEN_ARRAY_START, JSON_SEM_ASTART, 0}; |
208 | | |
209 | | /* ARRAY_ELEMENTS -> JSON MORE_ARRAY_ELEMENTS */ |
210 | | static char JSON_PROD_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, 0}; |
211 | | |
212 | | /* MORE_ARRAY_ELEMENTS -> ',' JSON MORE_ARRAY_ELEMENTS */ |
213 | | static char JSON_PROD_MORE_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, JSON_TOKEN_COMMA, 0}; |
214 | | |
215 | | /* KEY_PAIRS -> string ':' JSON MORE_KEY_PAIRS */ |
216 | | static char JSON_PROD_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, 0}; |
217 | | |
218 | | /* MORE_KEY_PAIRS -> ',' string ':' JSON MORE_KEY_PAIRS */ |
219 | | static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, JSON_TOKEN_COMMA, 0}; |
220 | | |
221 | | /* |
222 | | * Note: there are also epsilon productions for ARRAY_ELEMENTS, |
223 | | * MORE_ARRAY_ELEMENTS, KEY_PAIRS and MORE_KEY_PAIRS. |
224 | | * They are all the same, as none require any semantic actions. |
225 | | */ |
226 | | |
227 | | /* |
228 | | * Table connecting the productions with their director sets of |
229 | | * terminal symbols. |
230 | | * Any combination not specified here represents an error. |
231 | | */ |
232 | | |
233 | | typedef struct |
234 | | { |
235 | | size_t len; |
236 | | char *prod; |
237 | | } td_entry; |
238 | | |
239 | 0 | #define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) } |
240 | | |
241 | | static td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] = |
242 | | { |
243 | | /* JSON */ |
244 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING), |
245 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER), |
246 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE), |
247 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE), |
248 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL), |
249 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY), |
250 | | [OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT), |
251 | | /* ARRAY_ELEMENTS */ |
252 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
253 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
254 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
255 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
256 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
257 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
258 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS), |
259 | | [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON), |
260 | | /* MORE_ARRAY_ELEMENTS */ |
261 | | [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS), |
262 | | [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON), |
263 | | /* KEY_PAIRS */ |
264 | | [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS), |
265 | | [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON), |
266 | | /* MORE_KEY_PAIRS */ |
267 | | [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS), |
268 | | [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON), |
269 | | }; |
270 | | |
271 | | /* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */ |
272 | | static char JSON_PROD_GOAL[] = {JSON_TOKEN_END, JSON_NT_JSON, 0}; |
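/*
 * Editor's worked example (illustration only): parsing "[1]" with the
 * productions above.  TD_ENTRY drops the trailing 0, and pop_prediction()
 * consumes from the end of the array, so the rightmost symbol of each
 * production is what the parser expects next.
 *
 *	pop JSON_NT_JSON, lookahead '['   -> push JSON_PROD_ARRAY
 *	pop JSON_SEM_ASTART               -> array_start callback, lex_level++
 *	pop JSON_TOKEN_ARRAY_START        -> matches '[', advance to '1'
 *	pop JSON_NT_ARRAY_ELEMENTS        -> push JSON_PROD_ARRAY_ELEMENTS
 *	pop JSON_SEM_AELEM_START          -> array_element_start callback
 *	pop JSON_NT_JSON, lookahead '1'   -> push JSON_PROD_SCALAR_NUMBER
 *	pop JSON_SEM_SCALAR_INIT          -> stash "1" as the pending scalar
 *	pop JSON_TOKEN_NUMBER             -> matches, advance to ']'
 *	pop JSON_SEM_SCALAR_CALL          -> scalar callback
 *	pop JSON_SEM_AELEM_END            -> array_element_end callback
 *	pop JSON_NT_MORE_ARRAY_ELEMENTS   -> lookahead ']', push epsilon
 *	pop JSON_TOKEN_ARRAY_END          -> matches, advance to end of input
 *	pop JSON_SEM_AEND                 -> lex_level--, array_end callback
 *	pop JSON_TOKEN_END                -> matches; stack empty, JSON_SUCCESS
 */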
273 | | |
274 | | static inline JsonParseErrorType json_lex_string(JsonLexContext *lex); |
275 | | static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, const char *s, |
276 | | bool *num_err, size_t *total_len); |
277 | | static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, const JsonSemAction *sem); |
278 | | static JsonParseErrorType parse_object_field(JsonLexContext *lex, const JsonSemAction *sem); |
279 | | static JsonParseErrorType parse_object(JsonLexContext *lex, const JsonSemAction *sem); |
280 | | static JsonParseErrorType parse_array_element(JsonLexContext *lex, const JsonSemAction *sem); |
281 | | static JsonParseErrorType parse_array(JsonLexContext *lex, const JsonSemAction *sem); |
282 | | static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex); |
283 | | static bool allocate_incremental_state(JsonLexContext *lex); |
284 | | static inline void set_fname(JsonLexContext *lex, char *fname); |
285 | | |
286 | | /* the null action object used for pure validation */ |
287 | | const JsonSemAction nullSemAction = |
288 | | { |
289 | | NULL, NULL, NULL, NULL, NULL, |
290 | | NULL, NULL, NULL, NULL, NULL |
291 | | }; |
292 | | |
293 | | /* sentinels used for out-of-memory conditions */ |
294 | | static JsonLexContext failed_oom; |
295 | | static JsonIncrementalState failed_inc_oom; |
296 | | |
297 | | /* Parser support routines */ |
298 | | |
299 | | /* |
300 | | * lex_peek |
301 | | * |
302 | | * what is the current look_ahead token? |
303 | | */ |
304 | | static inline JsonTokenType |
305 | | lex_peek(JsonLexContext *lex) |
306 | 10.5k | { |
307 | 10.5k | return lex->token_type; |
308 | 10.5k | } |
309 | | |
310 | | /* |
311 | | * lex_expect |
312 | | * |
313 | | * move the lexer to the next token if the current look_ahead token matches |
314 | | * the parameter token. Otherwise, report an error. |
315 | | */ |
316 | | static inline JsonParseErrorType |
317 | | lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token) |
318 | 3.17k | { |
319 | 3.17k | if (lex_peek(lex) == token) |
320 | 3.13k | return json_lex(lex); |
321 | 45 | else |
322 | 45 | return report_parse_error(ctx, lex); |
323 | 3.17k | } |
324 | | |
325 | | /* chars to consider as part of an alphanumeric token */ |
326 | | #define JSON_ALPHANUMERIC_CHAR(c) \ |
327 | 4.75M | (((c) >= 'a' && (c) <= 'z') || \ |
328 | 4.75M | ((c) >= 'A' && (c) <= 'Z') || \ |
329 | 4.75M | ((c) >= '0' && (c) <= '9') || \ |
330 | 4.75M | (c) == '_' || \ |
331 | 4.75M | IS_HIGHBIT_SET(c)) |
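/*
 * Editor's note: scanning with this macro makes the lexer treat a run such
 * as "falsetto" or "12a" as one bad token, rather than "false" or "12"
 * followed by junk, so error reports cover the whole offending lexeme.
 */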
332 | | |
333 | | /* |
334 | | * Utility function to check if a string is a valid JSON number. |
335 | | * |
336 | | * str is of length len, and need not be null-terminated. |
337 | | */ |
338 | | bool |
339 | | IsValidJsonNumber(const char *str, size_t len) |
340 | 0 | { |
341 | 0 | bool numeric_error; |
342 | 0 | size_t total_len; |
343 | 0 | JsonLexContext dummy_lex = {0}; |
344 | |
345 | 0 | if (len <= 0) |
346 | 0 | return false; |
347 | | |
348 | | /* |
349 | | * json_lex_number expects a leading '-' to have been eaten already. |
350 | | * |
351 | | * having to cast away the constness of str is ugly, but there's not much |
352 | | * easy alternative. |
353 | | */ |
354 | 0 | if (*str == '-') |
355 | 0 | { |
356 | 0 | dummy_lex.input = str + 1; |
357 | 0 | dummy_lex.input_length = len - 1; |
358 | 0 | } |
359 | 0 | else |
360 | 0 | { |
361 | 0 | dummy_lex.input = str; |
362 | 0 | dummy_lex.input_length = len; |
363 | 0 | } |
364 | |
365 | 0 | dummy_lex.token_start = dummy_lex.input; |
366 | |
367 | 0 | json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len); |
368 | |
369 | 0 | return (!numeric_error) && (total_len == dummy_lex.input_length); |
370 | 0 | } |
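/*
 * Editor's usage sketch: the input need not be null-terminated, so callers
 * always pass an explicit length.
 *
 *		IsValidJsonNumber("-12.5e3", 7)	=> true
 *		IsValidJsonNumber("0x1F", 4)	=> false (hex is not JSON)
 *		IsValidJsonNumber("1.", 2)		=> false (digits required after '.')
 */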
371 | | |
372 | | /* |
373 | | * makeJsonLexContextCstringLen |
374 | | * Initialize the given JsonLexContext object, or create one |
375 | | * |
376 | | * If a valid 'lex' pointer is given, it is initialized. This can |
377 | | * be used for stack-allocated structs, saving overhead. If NULL is |
378 | | * given, a new struct is allocated. |
379 | | * |
380 | | * If need_escapes is true, ->strval stores the unescaped lexemes. |
381 | | * Unescaping is expensive, so only request it when necessary. |
382 | | * |
383 | | * If need_escapes is true or lex was given as NULL, then caller is |
384 | | * responsible for freeing the returned struct, either by calling |
385 | | * freeJsonLexContext() or (in backend environment) via memory context |
386 | | * cleanup. |
387 | | * |
388 | | * In shlib code, any out-of-memory failures will be deferred to time |
389 | | * of use; this function is guaranteed to return a valid JsonLexContext. |
390 | | */ |
391 | | JsonLexContext * |
392 | | makeJsonLexContextCstringLen(JsonLexContext *lex, const char *json, |
393 | | size_t len, int encoding, bool need_escapes) |
394 | 1.53k | { |
395 | 1.53k | if (lex == NULL) |
396 | 1.53k | { |
397 | 1.53k | lex = ALLOC0(sizeof(JsonLexContext)); |
398 | 1.53k | if (!lex) |
399 | 0 | return &failed_oom; |
400 | 1.53k | lex->flags |= JSONLEX_FREE_STRUCT; |
401 | 1.53k | } |
402 | 0 | else |
403 | 0 | memset(lex, 0, sizeof(JsonLexContext)); |
404 | | |
405 | 1.53k | lex->errormsg = NULL; |
406 | 1.53k | lex->input = lex->token_terminator = lex->line_start = json; |
407 | 1.53k | lex->line_number = 1; |
408 | 1.53k | lex->input_length = len; |
409 | 1.53k | lex->input_encoding = encoding; |
410 | 1.53k | lex->need_escapes = need_escapes; |
411 | 1.53k | if (need_escapes) |
412 | 1.53k | { |
413 | | /* |
414 | | * This call can fail in shlib code. We defer error handling to time |
415 | | * of use (json_lex_string()) since we might not need to parse any |
416 | | * strings anyway. |
417 | | */ |
418 | 1.53k | lex->strval = jsonapi_makeStringInfo(); |
419 | 1.53k | lex->flags |= JSONLEX_FREE_STRVAL; |
420 | 1.53k | } |
421 | | |
422 | 1.53k | return lex; |
423 | 1.53k | } |
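/*
 * Editor's sketch of typical non-incremental use (assumes PG_UTF8 from
 * pg_wchar.h, included above; the helper name is hypothetical): validate a
 * complete document using the null semantic action.
 */
static bool
json_is_valid(const char *str, size_t len)
{
	JsonLexContext *lex;
	JsonParseErrorType result;

	/* need_escapes = false: we only validate, never inspect string values */
	lex = makeJsonLexContextCstringLen(NULL, str, len, PG_UTF8, false);
	result = pg_parse_json(lex, &nullSemAction);
	freeJsonLexContext(lex);

	return result == JSON_SUCCESS;
}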
424 | | |
425 | | /* |
426 | | * Allocates the internal bookkeeping structures for incremental parsing. This |
427 | | * can only fail in-band with shlib code. |
428 | | */ |
429 | 0 | #define JS_STACK_CHUNK_SIZE 64 |
430 | 0 | #define JS_MAX_PROD_LEN 10 /* more than we need */ |
431 | 0 | #define JSON_TD_MAX_STACK 6400 /* hard coded for now - this is a REALLY high |
432 | | * number */ |
433 | | static bool |
434 | | allocate_incremental_state(JsonLexContext *lex) |
435 | 0 | { |
436 | 0 | void *pstack, |
437 | 0 | *prediction, |
438 | 0 | *fnames, |
439 | 0 | *fnull; |
440 | |
441 | 0 | lex->inc_state = ALLOC0(sizeof(JsonIncrementalState)); |
442 | 0 | pstack = ALLOC0(sizeof(JsonParserStack)); |
443 | 0 | prediction = ALLOC(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN); |
444 | 0 | fnames = ALLOC(JS_STACK_CHUNK_SIZE * sizeof(char *)); |
445 | 0 | fnull = ALLOC(JS_STACK_CHUNK_SIZE * sizeof(bool)); |
446 | |
447 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
448 | | if (!lex->inc_state |
449 | | || !pstack |
450 | | || !prediction |
451 | | || !fnames |
452 | | || !fnull) |
453 | | { |
454 | | FREE(lex->inc_state); |
455 | | FREE(pstack); |
456 | | FREE(prediction); |
457 | | FREE(fnames); |
458 | | FREE(fnull); |
459 | | |
460 | | lex->inc_state = &failed_inc_oom; |
461 | | return false; |
462 | | } |
463 | | #endif |
464 | |
465 | 0 | jsonapi_initStringInfo(&(lex->inc_state->partial_token)); |
466 | 0 | lex->pstack = pstack; |
467 | 0 | lex->pstack->stack_size = JS_STACK_CHUNK_SIZE; |
468 | 0 | lex->pstack->prediction = prediction; |
469 | 0 | lex->pstack->fnames = fnames; |
470 | 0 | lex->pstack->fnull = fnull; |
471 | | |
472 | | /* |
473 | | * fnames between 0 and lex_level must always be defined so that |
474 | | * freeJsonLexContext() can handle them safely. inc/dec_lex_level() handle |
475 | | * the rest. |
476 | | */ |
477 | 0 | Assert(lex->lex_level == 0); |
478 | 0 | lex->pstack->fnames[0] = NULL; |
479 | |
480 | 0 | lex->incremental = true; |
481 | 0 | return true; |
482 | 0 | } |
483 | | |
484 | | |
485 | | /* |
486 | | * makeJsonLexContextIncremental |
487 | | * |
488 | | * Similar to above but set up for use in incremental parsing. That means we |
489 | | * need explicit stacks for predictions, field names and null indicators, but |
490 | | * we don't need the input, that will be handed in bit by bit to the |
491 | | * parse routine. We also need an accumulator for partial tokens in case |
492 | | * the boundary between chunks happens to fall in the middle of a token. |
493 | | * |
494 | | * In shlib code, any out-of-memory failures will be deferred to time of use; |
495 | | * this function is guaranteed to return a valid JsonLexContext. |
496 | | */ |
497 | | JsonLexContext * |
498 | | makeJsonLexContextIncremental(JsonLexContext *lex, int encoding, |
499 | | bool need_escapes) |
500 | 0 | { |
501 | 0 | if (lex == NULL) |
502 | 0 | { |
503 | 0 | lex = ALLOC0(sizeof(JsonLexContext)); |
504 | 0 | if (!lex) |
505 | 0 | return &failed_oom; |
506 | | |
507 | 0 | lex->flags |= JSONLEX_FREE_STRUCT; |
508 | 0 | } |
509 | 0 | else |
510 | 0 | memset(lex, 0, sizeof(JsonLexContext)); |
511 | | |
512 | 0 | lex->line_number = 1; |
513 | 0 | lex->input_encoding = encoding; |
514 | |
|
515 | 0 | if (!allocate_incremental_state(lex)) |
516 | 0 | { |
517 | 0 | if (lex->flags & JSONLEX_FREE_STRUCT) |
518 | 0 | { |
519 | 0 | FREE(lex); |
520 | 0 | return &failed_oom; |
521 | 0 | } |
522 | | |
523 | | /* lex->inc_state tracks the OOM failure; we can return here. */ |
524 | 0 | return lex; |
525 | 0 | } |
526 | | |
527 | 0 | lex->need_escapes = need_escapes; |
528 | 0 | if (need_escapes) |
529 | 0 | { |
530 | | /* |
531 | | * This call can fail in shlib code. We defer error handling to time |
532 | | * of use (json_lex_string()) since we might not need to parse any |
533 | | * strings anyway. |
534 | | */ |
535 | 0 | lex->strval = jsonapi_makeStringInfo(); |
536 | 0 | lex->flags |= JSONLEX_FREE_STRVAL; |
537 | 0 | } |
538 | |
539 | 0 | return lex; |
540 | 0 | } |
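/*
 * Editor's sketch of incremental use: feed the document in chunks, passing
 * is_last = true with the final one.  JSON_INCOMPLETE (declared in
 * jsonapi.h) is the expected result for a document that is valid so far but
 * unfinished; the helper name is hypothetical.
 */
static bool
json_is_valid_chunked(const char **chunks, size_t nchunks)
{
	JsonLexContext *lex;
	JsonParseErrorType result = JSON_INCOMPLETE;

	lex = makeJsonLexContextIncremental(NULL, PG_UTF8, false);
	for (size_t i = 0; i < nchunks; i++)
	{
		result = pg_parse_json_incremental(lex, &nullSemAction,
										   chunks[i], strlen(chunks[i]),
										   i == nchunks - 1);
		if (result != JSON_SUCCESS && result != JSON_INCOMPLETE)
			break;				/* hard parse error */
	}
	freeJsonLexContext(lex);
	return result == JSON_SUCCESS;
}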
541 | | |
542 | | void |
543 | | setJsonLexContextOwnsTokens(JsonLexContext *lex, bool owned_by_context) |
544 | 0 | { |
545 | 0 | if (lex->incremental && lex->inc_state->started) |
546 | 0 | { |
547 | | /* |
548 | | * Switching this flag after parsing has already started is a |
549 | | * programming error. |
550 | | */ |
551 | 0 | Assert(false); |
552 | 0 | return; |
553 | 0 | } |
554 | | |
555 | 0 | if (owned_by_context) |
556 | 0 | lex->flags |= JSONLEX_CTX_OWNS_TOKENS; |
557 | 0 | else |
558 | 0 | lex->flags &= ~JSONLEX_CTX_OWNS_TOKENS; |
559 | 0 | } |
560 | | |
561 | | static inline bool |
562 | | inc_lex_level(JsonLexContext *lex) |
563 | 0 | { |
564 | 0 | if (lex->incremental && (lex->lex_level + 1) >= lex->pstack->stack_size) |
565 | 0 | { |
566 | 0 | size_t new_stack_size; |
567 | 0 | char *new_prediction; |
568 | 0 | char **new_fnames; |
569 | 0 | bool *new_fnull; |
570 | |
571 | 0 | new_stack_size = lex->pstack->stack_size + JS_STACK_CHUNK_SIZE; |
572 | |
573 | 0 | new_prediction = REALLOC(lex->pstack->prediction, |
574 | 0 | new_stack_size * JS_MAX_PROD_LEN); |
575 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
576 | | if (!new_prediction) |
577 | | return false; |
578 | | #endif |
579 | 0 | lex->pstack->prediction = new_prediction; |
580 | |
581 | 0 | new_fnames = REALLOC(lex->pstack->fnames, |
582 | 0 | new_stack_size * sizeof(char *)); |
583 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
584 | | if (!new_fnames) |
585 | | return false; |
586 | | #endif |
587 | 0 | lex->pstack->fnames = new_fnames; |
588 | |
589 | 0 | new_fnull = REALLOC(lex->pstack->fnull, new_stack_size * sizeof(bool)); |
590 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
591 | | if (!new_fnull) |
592 | | return false; |
593 | | #endif |
594 | 0 | lex->pstack->fnull = new_fnull; |
595 | |
596 | 0 | lex->pstack->stack_size = new_stack_size; |
597 | 0 | } |
598 | |
599 | 0 | lex->lex_level += 1; |
600 | |
601 | 0 | if (lex->incremental) |
602 | 0 | { |
603 | | /* |
604 | | * Ensure freeJsonLexContext() remains safe even if no fname is |
605 | | * assigned at this level. |
606 | | */ |
607 | 0 | lex->pstack->fnames[lex->lex_level] = NULL; |
608 | 0 | } |
609 | |
610 | 0 | return true; |
611 | 0 | } |
612 | | |
613 | | static inline void |
614 | | dec_lex_level(JsonLexContext *lex) |
615 | 0 | { |
616 | 0 | set_fname(lex, NULL); /* free the current level's fname, if needed */ |
617 | 0 | lex->lex_level -= 1; |
618 | 0 | } |
619 | | |
620 | | static inline void |
621 | | push_prediction(JsonParserStack *pstack, td_entry entry) |
622 | 0 | { |
623 | 0 | memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len); |
624 | 0 | pstack->pred_index += entry.len; |
625 | 0 | } |
626 | | |
627 | | static inline char |
628 | | pop_prediction(JsonParserStack *pstack) |
629 | 0 | { |
630 | 0 | Assert(pstack->pred_index > 0); |
631 | 0 | return pstack->prediction[--pstack->pred_index]; |
632 | 0 | } |
633 | | |
634 | | static inline char |
635 | | next_prediction(JsonParserStack *pstack) |
636 | 0 | { |
637 | 0 | Assert(pstack->pred_index > 0); |
638 | 0 | return pstack->prediction[pstack->pred_index - 1]; |
639 | 0 | } |
640 | | |
641 | | static inline bool |
642 | | have_prediction(JsonParserStack *pstack) |
643 | 0 | { |
644 | 0 | return pstack->pred_index > 0; |
645 | 0 | } |
646 | | |
647 | | static inline void |
648 | | set_fname(JsonLexContext *lex, char *fname) |
649 | 0 | { |
650 | 0 | if (lex->flags & JSONLEX_CTX_OWNS_TOKENS) |
651 | 0 | { |
652 | | /* |
653 | | * Don't leak prior fnames. If one hasn't been assigned yet, |
654 | | * inc_lex_level ensured that it's NULL (and therefore safe to free). |
655 | | */ |
656 | 0 | FREE(lex->pstack->fnames[lex->lex_level]); |
657 | 0 | } |
658 | |
659 | 0 | lex->pstack->fnames[lex->lex_level] = fname; |
660 | 0 | } |
661 | | |
662 | | static inline char * |
663 | | get_fname(JsonLexContext *lex) |
664 | 0 | { |
665 | 0 | return lex->pstack->fnames[lex->lex_level]; |
666 | 0 | } |
667 | | |
668 | | static inline void |
669 | | set_fnull(JsonLexContext *lex, bool fnull) |
670 | 0 | { |
671 | 0 | lex->pstack->fnull[lex->lex_level] = fnull; |
672 | 0 | } |
673 | | |
674 | | static inline bool |
675 | | get_fnull(JsonLexContext *lex) |
676 | 0 | { |
677 | 0 | return lex->pstack->fnull[lex->lex_level]; |
678 | 0 | } |
679 | | |
680 | | /* |
681 | | * Free memory in a JsonLexContext. |
682 | | * |
683 | | * There's no need for this if a *lex pointer was given when the object was |
684 | | * made, need_escapes was false, and json_errdetail() was not called; or if (in |
685 | | * backend environment) a memory context delete/reset is imminent. |
686 | | */ |
687 | | void |
688 | | freeJsonLexContext(JsonLexContext *lex) |
689 | 0 | { |
690 | 0 | static const JsonLexContext empty = {0}; |
691 | |
692 | 0 | if (!lex || lex == &failed_oom) |
693 | 0 | return; |
694 | | |
695 | 0 | if (lex->flags & JSONLEX_FREE_STRVAL) |
696 | 0 | jsonapi_destroyStringInfo(lex->strval); |
697 | |
698 | 0 | if (lex->errormsg) |
699 | 0 | jsonapi_destroyStringInfo(lex->errormsg); |
700 | |
701 | 0 | if (lex->incremental) |
702 | 0 | { |
703 | 0 | jsonapi_termStringInfo(&lex->inc_state->partial_token); |
704 | 0 | FREE(lex->inc_state); |
705 | 0 | FREE(lex->pstack->prediction); |
706 | |
707 | 0 | if (lex->flags & JSONLEX_CTX_OWNS_TOKENS) |
708 | 0 | { |
709 | 0 | int i; |
710 | | |
711 | | /* Clean up any tokens that were left behind. */ |
712 | 0 | for (i = 0; i <= lex->lex_level; i++) |
713 | 0 | FREE(lex->pstack->fnames[i]); |
714 | 0 | } |
715 | |
716 | 0 | FREE(lex->pstack->fnames); |
717 | 0 | FREE(lex->pstack->fnull); |
718 | 0 | FREE(lex->pstack->scalar_val); |
719 | 0 | FREE(lex->pstack); |
720 | 0 | } |
721 | |
722 | 0 | if (lex->flags & JSONLEX_FREE_STRUCT) |
723 | 0 | FREE(lex); |
724 | 0 | else |
725 | 0 | *lex = empty; |
726 | 0 | } |
727 | | |
728 | | /* |
729 | | * pg_parse_json |
730 | | * |
731 | | * Publicly visible entry point for the JSON parser. |
732 | | * |
733 | | * lex is a lexing context, set up for the json to be processed by calling |
734 | | * makeJsonLexContext(). sem is a structure of function pointers to semantic |
735 | | * action routines to be called at appropriate spots during parsing, and a |
736 | | * pointer to a state object to be passed to those routines. |
737 | | * |
738 | | * If FORCE_JSON_PSTACK is defined then the routine will call the non-recursive |
739 | | * JSON parser. This is a useful way to validate that it's doing the right |
740 | | * thing at least for non-incremental cases. If this is on we expect to see |
741 | | * regression diffs relating to error messages about stack depth, but no |
742 | | * other differences. |
743 | | */ |
744 | | JsonParseErrorType |
745 | | pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem) |
746 | 1.53k | { |
747 | | #ifdef FORCE_JSON_PSTACK |
748 | | /* |
749 | | * We don't need partial token processing, there is only one chunk. But we |
750 | | * still need to init the partial token string so that freeJsonLexContext |
751 | | * works, so perform the full incremental initialization. |
752 | | */ |
753 | | if (!allocate_incremental_state(lex)) |
754 | | return JSON_OUT_OF_MEMORY; |
755 | | |
756 | | return pg_parse_json_incremental(lex, sem, lex->input, lex->input_length, true); |
757 | | |
758 | | #else |
759 | | |
760 | 1.53k | JsonTokenType tok; |
761 | 1.53k | JsonParseErrorType result; |
762 | | |
763 | 1.53k | if (lex == &failed_oom) |
764 | 0 | return JSON_OUT_OF_MEMORY; |
765 | 1.53k | if (lex->incremental) |
766 | 0 | return JSON_INVALID_LEXER_TYPE; |
767 | | |
768 | | /* get the initial token */ |
769 | 1.53k | result = json_lex(lex); |
770 | 1.53k | if (result != JSON_SUCCESS) |
771 | 1.24k | return result; |
772 | | |
773 | 293 | tok = lex_peek(lex); |
774 | | |
775 | | /* parse by recursive descent */ |
776 | 293 | switch (tok) |
777 | 293 | { |
778 | 8 | case JSON_TOKEN_OBJECT_START: |
779 | 8 | result = parse_object(lex, sem); |
780 | 8 | break; |
781 | 18 | case JSON_TOKEN_ARRAY_START: |
782 | 18 | result = parse_array(lex, sem); |
783 | 18 | break; |
784 | 267 | default: |
785 | 267 | result = parse_scalar(lex, sem); /* json can be a bare scalar */ |
786 | 293 | } |
787 | | |
788 | 290 | if (result == JSON_SUCCESS) |
789 | 41 | result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END); |
790 | | |
791 | 290 | return result; |
792 | 293 | #endif |
793 | 293 | } |
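/*
 * Editor's sketch of supplying semantic actions: count the scalar values in
 * a document.  The callback shape follows the json_scalar_action typedef in
 * jsonapi.h; the struct and function names here are hypothetical.
 */
typedef struct ScalarCounter
{
	int			count;
} ScalarCounter;

static JsonParseErrorType
count_scalar(void *semstate, char *token, JsonTokenType tokentype)
{
	((ScalarCounter *) semstate)->count++;
	return JSON_SUCCESS;
}

static JsonParseErrorType
count_scalars(JsonLexContext *lex, int *count)
{
	ScalarCounter state = {0};
	JsonSemAction sem = nullSemAction;	/* start from all-NULL callbacks */
	JsonParseErrorType result;

	sem.semstate = &state;
	sem.scalar = count_scalar;
	result = pg_parse_json(lex, &sem);
	*count = state.count;
	return result;
}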
794 | | |
795 | | /* |
796 | | * json_count_array_elements |
797 | | * |
798 | | * Returns number of array elements in lex context at start of array token |
799 | | * until end of array token at same nesting level. |
800 | | * |
801 | | * Designed to be called from array_start routines. |
802 | | */ |
803 | | JsonParseErrorType |
804 | | json_count_array_elements(JsonLexContext *lex, int *elements) |
805 | 0 | { |
806 | 0 | JsonLexContext copylex; |
807 | 0 | int count; |
808 | 0 | JsonParseErrorType result; |
809 | |
810 | 0 | if (lex == &failed_oom) |
811 | 0 | return JSON_OUT_OF_MEMORY; |
812 | | |
813 | | /* |
814 | | * It's safe to do this with a shallow copy because the lexical routines |
815 | | * don't scribble on the input. They do scribble on the other pointers |
816 | | * etc, so doing this with a copy makes that safe. |
817 | | */ |
818 | 0 | memcpy(©lex, lex, sizeof(JsonLexContext)); |
819 | 0 | copylex.need_escapes = false; /* not interested in values here */ |
820 | 0 | copylex.lex_level++; |
821 | |
822 | 0 | count = 0; |
823 | 0 | result = lex_expect(JSON_PARSE_ARRAY_START, ©lex, |
824 | 0 | JSON_TOKEN_ARRAY_START); |
825 | 0 | if (result != JSON_SUCCESS) |
826 | 0 | return result; |
827 | 0 | if (lex_peek(©lex) != JSON_TOKEN_ARRAY_END) |
828 | 0 | { |
829 | 0 | while (1) |
830 | 0 | { |
831 | 0 | count++; |
832 | 0 | result = parse_array_element(©lex, &nullSemAction); |
833 | 0 | if (result != JSON_SUCCESS) |
834 | 0 | return result; |
835 | 0 | if (copylex.token_type != JSON_TOKEN_COMMA) |
836 | 0 | break; |
837 | 0 | result = json_lex(©lex); |
838 | 0 | if (result != JSON_SUCCESS) |
839 | 0 | return result; |
840 | 0 | } |
841 | 0 | } |
842 | 0 | result = lex_expect(JSON_PARSE_ARRAY_NEXT, ©lex, |
843 | 0 | JSON_TOKEN_ARRAY_END); |
844 | 0 | if (result != JSON_SUCCESS) |
845 | 0 | return result; |
846 | | |
847 | 0 | *elements = count; |
848 | 0 | return JSON_SUCCESS; |
849 | 0 | } |
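/*
 * Editor's sketch: json_count_array_elements() is designed for use inside an
 * array_start callback, where lex is positioned at the '[' token.  The state
 * struct and names are hypothetical.
 */
typedef struct ArrayStats
{
	JsonLexContext *lex;		/* the same context being parsed */
	int			nelems;
} ArrayStats;

static JsonParseErrorType
stats_array_start(void *semstate)
{
	ArrayStats *stats = (ArrayStats *) semstate;

	return json_count_array_elements(stats->lex, &stats->nelems);
}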
850 | | |
851 | | /* |
852 | | * pg_parse_json_incremental |
853 | | * |
854 | | * Routine for incremental parsing of json. This uses the non-recursive top |
855 | | * down method of the Dragon Book Algorithm 4.3. It's somewhat slower than |
856 | | * the Recursive Descent pattern used above, so we only use it for incremental |
857 | | * parsing of JSON. |
858 | | * |
859 | | * The lexing context needs to be set up by a call to |
860 | | * makeJsonLexContextIncremental(). sem is a structure of function pointers |
861 | | * to semantic action routines, which should function exactly as those used |
862 | | * in the recursive descent parser. |
863 | | * |
864 | | * This routine can be called repeatedly with chunks of JSON. On the final |
865 | | * chunk is_last must be set to true. len is the length of the json chunk, |
866 | | * which does not need to be null terminated. |
867 | | */ |
868 | | JsonParseErrorType |
869 | | pg_parse_json_incremental(JsonLexContext *lex, |
870 | | const JsonSemAction *sem, |
871 | | const char *json, |
872 | | size_t len, |
873 | | bool is_last) |
874 | 0 | { |
875 | 0 | JsonTokenType tok; |
876 | 0 | JsonParseErrorType result; |
877 | 0 | JsonParseContext ctx = JSON_PARSE_VALUE; |
878 | 0 | JsonParserStack *pstack = lex->pstack; |
879 | |
880 | 0 | if (lex == &failed_oom || lex->inc_state == &failed_inc_oom) |
881 | 0 | return JSON_OUT_OF_MEMORY; |
882 | 0 | if (!lex->incremental) |
883 | 0 | return JSON_INVALID_LEXER_TYPE; |
884 | | |
885 | 0 | lex->input = lex->token_terminator = lex->line_start = json; |
886 | 0 | lex->input_length = len; |
887 | 0 | lex->inc_state->is_last_chunk = is_last; |
888 | 0 | lex->inc_state->started = true; |
889 | | |
890 | | /* get the initial token */ |
891 | 0 | result = json_lex(lex); |
892 | 0 | if (result != JSON_SUCCESS) |
893 | 0 | return result; |
894 | | |
895 | 0 | tok = lex_peek(lex); |
896 | | |
897 | | /* use prediction stack for incremental parsing */ |
898 | |
899 | 0 | if (!have_prediction(pstack)) |
900 | 0 | { |
901 | 0 | td_entry goal = TD_ENTRY(JSON_PROD_GOAL); |
902 | |
903 | 0 | push_prediction(pstack, goal); |
904 | 0 | } |
905 | |
906 | 0 | while (have_prediction(pstack)) |
907 | 0 | { |
908 | 0 | char top = pop_prediction(pstack); |
909 | 0 | td_entry entry; |
910 | | |
911 | | /* |
912 | | * these first two branches are the guts of the Table Driven method |
913 | | */ |
914 | 0 | if (top == tok) |
915 | 0 | { |
916 | | /* |
917 | | * tok can only be a terminal symbol, so top must be too. the |
918 | | * token matches the top of the stack, so get the next token. |
919 | | */ |
920 | 0 | if (tok < JSON_TOKEN_END) |
921 | 0 | { |
922 | 0 | result = json_lex(lex); |
923 | 0 | if (result != JSON_SUCCESS) |
924 | 0 | return result; |
925 | 0 | tok = lex_peek(lex); |
926 | 0 | } |
927 | 0 | } |
928 | 0 | else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL) |
929 | 0 | { |
930 | | /* |
931 | | * the token is in the director set for a production of the |
932 | | * non-terminal at the top of the stack, so push the reversed RHS |
933 | | * of the production onto the stack. |
934 | | */ |
935 | 0 | push_prediction(pstack, entry); |
936 | 0 | } |
937 | 0 | else if (IS_SEM(top)) |
938 | 0 | { |
939 | | /* |
940 | | * top is a semantic action marker, so take action accordingly. |
941 | | * It's important to have these markers in the prediction stack |
942 | | * before any token they might need so we don't advance the token |
943 | | * prematurely. Note in a couple of cases we need to do something |
944 | | * both before and after the token. |
945 | | */ |
946 | 0 | switch (top) |
947 | 0 | { |
948 | 0 | case JSON_SEM_OSTART: |
949 | 0 | { |
950 | 0 | json_struct_action ostart = sem->object_start; |
951 | |
952 | 0 | if (lex->lex_level >= JSON_TD_MAX_STACK) |
953 | 0 | return JSON_NESTING_TOO_DEEP; |
954 | | |
955 | 0 | if (ostart != NULL) |
956 | 0 | { |
957 | 0 | result = (*ostart) (sem->semstate); |
958 | 0 | if (result != JSON_SUCCESS) |
959 | 0 | return result; |
960 | 0 | } |
961 | | |
962 | 0 | if (!inc_lex_level(lex)) |
963 | 0 | return JSON_OUT_OF_MEMORY; |
964 | 0 | } |
965 | 0 | break; |
966 | 0 | case JSON_SEM_OEND: |
967 | 0 | { |
968 | 0 | json_struct_action oend = sem->object_end; |
969 | |
970 | 0 | dec_lex_level(lex); |
971 | 0 | if (oend != NULL) |
972 | 0 | { |
973 | 0 | result = (*oend) (sem->semstate); |
974 | 0 | if (result != JSON_SUCCESS) |
975 | 0 | return result; |
976 | 0 | } |
977 | 0 | } |
978 | 0 | break; |
979 | 0 | case JSON_SEM_ASTART: |
980 | 0 | { |
981 | 0 | json_struct_action astart = sem->array_start; |
982 | |
983 | 0 | if (lex->lex_level >= JSON_TD_MAX_STACK) |
984 | 0 | return JSON_NESTING_TOO_DEEP; |
985 | | |
986 | 0 | if (astart != NULL) |
987 | 0 | { |
988 | 0 | result = (*astart) (sem->semstate); |
989 | 0 | if (result != JSON_SUCCESS) |
990 | 0 | return result; |
991 | 0 | } |
992 | | |
993 | 0 | if (!inc_lex_level(lex)) |
994 | 0 | return JSON_OUT_OF_MEMORY; |
995 | 0 | } |
996 | 0 | break; |
997 | 0 | case JSON_SEM_AEND: |
998 | 0 | { |
999 | 0 | json_struct_action aend = sem->array_end; |
1000 | |
1001 | 0 | dec_lex_level(lex); |
1002 | 0 | if (aend != NULL) |
1003 | 0 | { |
1004 | 0 | result = (*aend) (sem->semstate); |
1005 | 0 | if (result != JSON_SUCCESS) |
1006 | 0 | return result; |
1007 | 0 | } |
1008 | 0 | } |
1009 | 0 | break; |
1010 | 0 | case JSON_SEM_OFIELD_INIT: |
1011 | 0 | { |
1012 | | /* |
1013 | | * all we do here is save out the field name. We have |
1014 | | * to wait to get past the ':' to see if the next |
1015 | | * value is null so we can call the semantic routine |
1016 | | */ |
1017 | 0 | char *fname = NULL; |
1018 | 0 | json_ofield_action ostart = sem->object_field_start; |
1019 | 0 | json_ofield_action oend = sem->object_field_end; |
1020 | |
1021 | 0 | if ((ostart != NULL || oend != NULL) && lex->need_escapes) |
1022 | 0 | { |
1023 | 0 | fname = STRDUP(lex->strval->data); |
1024 | 0 | if (fname == NULL) |
1025 | 0 | return JSON_OUT_OF_MEMORY; |
1026 | 0 | } |
1027 | 0 | set_fname(lex, fname); |
1028 | 0 | } |
1029 | 0 | break; |
1030 | 0 | case JSON_SEM_OFIELD_START: |
1031 | 0 | { |
1032 | | /* |
1033 | | * the current token should be the first token of the |
1034 | | * value |
1035 | | */ |
1036 | 0 | bool isnull = tok == JSON_TOKEN_NULL; |
1037 | 0 | json_ofield_action ostart = sem->object_field_start; |
1038 | |
1039 | 0 | set_fnull(lex, isnull); |
1040 | |
1041 | 0 | if (ostart != NULL) |
1042 | 0 | { |
1043 | 0 | char *fname = get_fname(lex); |
1044 | |
1045 | 0 | result = (*ostart) (sem->semstate, fname, isnull); |
1046 | 0 | if (result != JSON_SUCCESS) |
1047 | 0 | return result; |
1048 | 0 | } |
1049 | 0 | } |
1050 | 0 | break; |
1051 | 0 | case JSON_SEM_OFIELD_END: |
1052 | 0 | { |
1053 | 0 | json_ofield_action oend = sem->object_field_end; |
1054 | |
1055 | 0 | if (oend != NULL) |
1056 | 0 | { |
1057 | 0 | char *fname = get_fname(lex); |
1058 | 0 | bool isnull = get_fnull(lex); |
1059 | |
1060 | 0 | result = (*oend) (sem->semstate, fname, isnull); |
1061 | 0 | if (result != JSON_SUCCESS) |
1062 | 0 | return result; |
1063 | 0 | } |
1064 | 0 | } |
1065 | 0 | break; |
1066 | 0 | case JSON_SEM_AELEM_START: |
1067 | 0 | { |
1068 | 0 | json_aelem_action astart = sem->array_element_start; |
1069 | 0 | bool isnull = tok == JSON_TOKEN_NULL; |
1070 | |
1071 | 0 | set_fnull(lex, isnull); |
1072 | |
1073 | 0 | if (astart != NULL) |
1074 | 0 | { |
1075 | 0 | result = (*astart) (sem->semstate, isnull); |
1076 | 0 | if (result != JSON_SUCCESS) |
1077 | 0 | return result; |
1078 | 0 | } |
1079 | 0 | } |
1080 | 0 | break; |
1081 | 0 | case JSON_SEM_AELEM_END: |
1082 | 0 | { |
1083 | 0 | json_aelem_action aend = sem->array_element_end; |
1084 | |
1085 | 0 | if (aend != NULL) |
1086 | 0 | { |
1087 | 0 | bool isnull = get_fnull(lex); |
1088 | |
1089 | 0 | result = (*aend) (sem->semstate, isnull); |
1090 | 0 | if (result != JSON_SUCCESS) |
1091 | 0 | return result; |
1092 | 0 | } |
1093 | 0 | } |
1094 | 0 | break; |
1095 | 0 | case JSON_SEM_SCALAR_INIT: |
1096 | 0 | { |
1097 | 0 | json_scalar_action sfunc = sem->scalar; |
1098 | |
1099 | 0 | pstack->scalar_val = NULL; |
1100 | |
1101 | 0 | if (sfunc != NULL) |
1102 | 0 | { |
1103 | | /* |
1104 | | * extract the de-escaped string value, or the raw |
1105 | | * lexeme |
1106 | | */ |
1107 | | /* |
1108 | | * XXX copied from RD parser but looks like a |
1109 | | * buglet |
1110 | | */ |
1111 | 0 | if (tok == JSON_TOKEN_STRING) |
1112 | 0 | { |
1113 | 0 | if (lex->need_escapes) |
1114 | 0 | { |
1115 | 0 | pstack->scalar_val = STRDUP(lex->strval->data); |
1116 | 0 | if (pstack->scalar_val == NULL) |
1117 | 0 | return JSON_OUT_OF_MEMORY; |
1118 | 0 | } |
1119 | 0 | } |
1120 | 0 | else |
1121 | 0 | { |
1122 | 0 | ptrdiff_t tlen = (lex->token_terminator - lex->token_start); |
1123 | |
1124 | 0 | pstack->scalar_val = ALLOC(tlen + 1); |
1125 | 0 | if (pstack->scalar_val == NULL) |
1126 | 0 | return JSON_OUT_OF_MEMORY; |
1127 | | |
1128 | 0 | memcpy(pstack->scalar_val, lex->token_start, tlen); |
1129 | 0 | pstack->scalar_val[tlen] = '\0'; |
1130 | 0 | } |
1131 | 0 | pstack->scalar_tok = tok; |
1132 | 0 | } |
1133 | 0 | } |
1134 | 0 | break; |
1135 | 0 | case JSON_SEM_SCALAR_CALL: |
1136 | 0 | { |
1137 | | /* |
1138 | | * We'd like to be able to get rid of this business of |
1139 | | * two bits of scalar action, but we can't. It breaks |
1140 | | * certain semantic actions which expect that when |
1141 | | * called the lexer has consumed the item. See for |
1142 | | * example get_scalar() in jsonfuncs.c. |
1143 | | */ |
1144 | 0 | json_scalar_action sfunc = sem->scalar; |
1145 | |
1146 | 0 | if (sfunc != NULL) |
1147 | 0 | { |
1148 | 0 | result = (*sfunc) (sem->semstate, pstack->scalar_val, pstack->scalar_tok); |
1149 | | |
1150 | | /* |
1151 | | * Either ownership of the token passed to the |
1152 | | * callback, or we need to free it now. Either |
1153 | | * way, clear our pointer to it so it doesn't get |
1154 | | * freed in the future. |
1155 | | */ |
1156 | 0 | if (lex->flags & JSONLEX_CTX_OWNS_TOKENS) |
1157 | 0 | FREE(pstack->scalar_val); |
1158 | 0 | pstack->scalar_val = NULL; |
1159 | |
1160 | 0 | if (result != JSON_SUCCESS) |
1161 | 0 | return result; |
1162 | 0 | } |
1163 | 0 | } |
1164 | 0 | break; |
1165 | 0 | default: |
1166 | | /* should not happen */ |
1167 | 0 | break; |
1168 | 0 | } |
1169 | 0 | } |
1170 | 0 | else |
1171 | 0 | { |
1172 | | /* |
1173 | | * The token didn't match the stack top if it's a terminal nor a |
1174 | | * production for the stack top if it's a non-terminal. |
1175 | | * |
1176 | | * Various cases here are Asserted to be not possible, as the |
1177 | | * token would not appear at the top of the prediction stack |
1178 | | * unless the lookahead matched. |
1179 | | */ |
1180 | 0 | switch (top) |
1181 | 0 | { |
1182 | 0 | case JSON_TOKEN_STRING: |
1183 | 0 | if (next_prediction(pstack) == JSON_TOKEN_COLON) |
1184 | 0 | ctx = JSON_PARSE_STRING; |
1185 | 0 | else |
1186 | 0 | { |
1187 | 0 | Assert(false); |
1188 | 0 | ctx = JSON_PARSE_VALUE; |
1189 | 0 | } |
1190 | 0 | break; |
1191 | 0 | case JSON_TOKEN_NUMBER: |
1192 | 0 | case JSON_TOKEN_TRUE: |
1193 | 0 | case JSON_TOKEN_FALSE: |
1194 | 0 | case JSON_TOKEN_NULL: |
1195 | 0 | case JSON_TOKEN_ARRAY_START: |
1196 | 0 | case JSON_TOKEN_OBJECT_START: |
1197 | 0 | Assert(false); |
1198 | 0 | ctx = JSON_PARSE_VALUE; |
1199 | 0 | break; |
1200 | 0 | case JSON_TOKEN_ARRAY_END: |
1201 | 0 | Assert(false); |
1202 | 0 | ctx = JSON_PARSE_ARRAY_NEXT; |
1203 | 0 | break; |
1204 | 0 | case JSON_TOKEN_OBJECT_END: |
1205 | 0 | Assert(false); |
1206 | 0 | ctx = JSON_PARSE_OBJECT_NEXT; |
1207 | 0 | break; |
1208 | 0 | case JSON_TOKEN_COMMA: |
1209 | 0 | Assert(false); |
1210 | 0 | if (next_prediction(pstack) == JSON_TOKEN_STRING) |
1211 | 0 | ctx = JSON_PARSE_OBJECT_NEXT; |
1212 | 0 | else |
1213 | 0 | ctx = JSON_PARSE_ARRAY_NEXT; |
1214 | 0 | break; |
1215 | 0 | case JSON_TOKEN_COLON: |
1216 | 0 | ctx = JSON_PARSE_OBJECT_LABEL; |
1217 | 0 | break; |
1218 | 0 | case JSON_TOKEN_END: |
1219 | 0 | ctx = JSON_PARSE_END; |
1220 | 0 | break; |
1221 | 0 | case JSON_NT_MORE_ARRAY_ELEMENTS: |
1222 | 0 | ctx = JSON_PARSE_ARRAY_NEXT; |
1223 | 0 | break; |
1224 | 0 | case JSON_NT_ARRAY_ELEMENTS: |
1225 | 0 | ctx = JSON_PARSE_ARRAY_START; |
1226 | 0 | break; |
1227 | 0 | case JSON_NT_MORE_KEY_PAIRS: |
1228 | 0 | ctx = JSON_PARSE_OBJECT_NEXT; |
1229 | 0 | break; |
1230 | 0 | case JSON_NT_KEY_PAIRS: |
1231 | 0 | ctx = JSON_PARSE_OBJECT_START; |
1232 | 0 | break; |
1233 | 0 | default: |
1234 | 0 | ctx = JSON_PARSE_VALUE; |
1235 | 0 | } |
1236 | 0 | return report_parse_error(ctx, lex); |
1237 | 0 | } |
1238 | 0 | } |
1239 | | |
1240 | 0 | return JSON_SUCCESS; |
1241 | 0 | } |
1242 | | |
1243 | | /* |
1244 | | * Recursive Descent parse routines. There is one for each structural |
1245 | | * element in a json document: |
1246 | | * - scalar (string, number, true, false, null) |
1247 | | * - array ( [ ] ) |
1248 | | * - array element |
1249 | | * - object ( { } ) |
1250 | | * - object field |
1251 | | */ |
1252 | | static inline JsonParseErrorType |
1253 | | parse_scalar(JsonLexContext *lex, const JsonSemAction *sem) |
1254 | 281 | { |
1255 | 281 | char *val = NULL; |
1256 | 281 | json_scalar_action sfunc = sem->scalar; |
1257 | 281 | JsonTokenType tok = lex_peek(lex); |
1258 | 281 | JsonParseErrorType result; |
1259 | | |
1260 | | /* a scalar must be a string, a number, true, false, or null */ |
1261 | 281 | if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER && |
1262 | 281 | tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE && |
1263 | 281 | tok != JSON_TOKEN_NULL) |
1264 | 5 | return report_parse_error(JSON_PARSE_VALUE, lex); |
1265 | | |
1266 | | /* if no semantic function, just consume the token */ |
1267 | 276 | if (sfunc == NULL) |
1268 | 276 | return json_lex(lex); |
1269 | | |
1270 | | /* extract the de-escaped string value, or the raw lexeme */ |
1271 | 0 | if (lex_peek(lex) == JSON_TOKEN_STRING) |
1272 | 0 | { |
1273 | 0 | if (lex->need_escapes) |
1274 | 0 | { |
1275 | 0 | val = STRDUP(lex->strval->data); |
1276 | 0 | if (val == NULL) |
1277 | 0 | return JSON_OUT_OF_MEMORY; |
1278 | 0 | } |
1279 | 0 | } |
1280 | 0 | else |
1281 | 0 | { |
1282 | 0 | int len = (lex->token_terminator - lex->token_start); |
1283 | |
1284 | 0 | val = ALLOC(len + 1); |
1285 | 0 | if (val == NULL) |
1286 | 0 | return JSON_OUT_OF_MEMORY; |
1287 | | |
1288 | 0 | memcpy(val, lex->token_start, len); |
1289 | 0 | val[len] = '\0'; |
1290 | 0 | } |
1291 | | |
1292 | | /* consume the token */ |
1293 | 0 | result = json_lex(lex); |
1294 | 0 | if (result != JSON_SUCCESS) |
1295 | 0 | { |
1296 | 0 | FREE(val); |
1297 | 0 | return result; |
1298 | 0 | } |
1299 | | |
1300 | | /* |
1301 | | * invoke the callback, which may take ownership of val. For string |
1302 | | * values, val is NULL if need_escapes is false. |
1303 | | */ |
1304 | 0 | result = (*sfunc) (sem->semstate, val, tok); |
1305 | |
1306 | 0 | if (lex->flags & JSONLEX_CTX_OWNS_TOKENS) |
1307 | 0 | FREE(val); |
1308 | |
1309 | 0 | return result; |
1310 | 0 | } |
1311 | | |
1312 | | static JsonParseErrorType |
1313 | | parse_object_field(JsonLexContext *lex, const JsonSemAction *sem) |
1314 | 563 | { |
1315 | | /* |
1316 | | * An object field is "fieldname" : value where value can be a scalar, |
1317 | | * object or array. Note: in user-facing docs and error messages, we |
1318 | | * generally call a field name a "key". |
1319 | | */ |
1320 | | |
1321 | 563 | char *fname = NULL; |
1322 | 563 | json_ofield_action ostart = sem->object_field_start; |
1323 | 563 | json_ofield_action oend = sem->object_field_end; |
1324 | 563 | bool isnull; |
1325 | 563 | JsonTokenType tok; |
1326 | 563 | JsonParseErrorType result; |
1327 | | |
1328 | 563 | if (lex_peek(lex) != JSON_TOKEN_STRING) |
1329 | 0 | return report_parse_error(JSON_PARSE_STRING, lex); |
1330 | 563 | if ((ostart != NULL || oend != NULL) && lex->need_escapes) |
1331 | 0 | { |
1332 | | /* fname is NULL if need_escapes is false */ |
1333 | 0 | fname = STRDUP(lex->strval->data); |
1334 | 0 | if (fname == NULL) |
1335 | 0 | return JSON_OUT_OF_MEMORY; |
1336 | 0 | } |
1337 | 563 | result = json_lex(lex); |
1338 | 563 | if (result != JSON_SUCCESS) |
1339 | 0 | { |
1340 | 0 | FREE(fname); |
1341 | 0 | return result; |
1342 | 0 | } |
1343 | | |
1344 | 563 | result = lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON); |
1345 | 563 | if (result != JSON_SUCCESS) |
1346 | 2 | { |
1347 | 2 | FREE(fname); |
1348 | 2 | return result; |
1349 | 2 | } |
1350 | | |
1351 | 561 | tok = lex_peek(lex); |
1352 | 561 | isnull = tok == JSON_TOKEN_NULL; |
1353 | | |
1354 | 561 | if (ostart != NULL) |
1355 | 0 | { |
1356 | 0 | result = (*ostart) (sem->semstate, fname, isnull); |
1357 | 0 | if (result != JSON_SUCCESS) |
1358 | 0 | goto ofield_cleanup; |
1359 | 0 | } |
1360 | | |
1361 | 561 | switch (tok) |
1362 | 561 | { |
1363 | 161 | case JSON_TOKEN_OBJECT_START: |
1364 | 161 | result = parse_object(lex, sem); |
1365 | 161 | break; |
1366 | 399 | case JSON_TOKEN_ARRAY_START: |
1367 | 399 | result = parse_array(lex, sem); |
1368 | 399 | break; |
1369 | 1 | default: |
1370 | 1 | result = parse_scalar(lex, sem); |
1371 | 561 | } |
1372 | 3 | if (result != JSON_SUCCESS) |
1373 | 3 | goto ofield_cleanup; |
1374 | | |
1375 | 0 | if (oend != NULL) |
1376 | 0 | { |
1377 | 0 | result = (*oend) (sem->semstate, fname, isnull); |
1378 | 0 | if (result != JSON_SUCCESS) |
1379 | 0 | goto ofield_cleanup; |
1380 | 0 | } |
1381 | | |
1382 | 3 | ofield_cleanup: |
1383 | 3 | if (lex->flags & JSONLEX_CTX_OWNS_TOKENS) |
1384 | 0 | FREE(fname); |
1385 | 3 | return result; |
1386 | 0 | } |
1387 | | |
1388 | | static JsonParseErrorType |
1389 | | parse_object(JsonLexContext *lex, const JsonSemAction *sem) |
1390 | 569 | { |
1391 | | /* |
1392 | | * an object is a possibly empty sequence of object fields, separated by |
1393 | | * commas and surrounded by curly braces. |
1394 | | */ |
1395 | 569 | json_struct_action ostart = sem->object_start; |
1396 | 569 | json_struct_action oend = sem->object_end; |
1397 | 569 | JsonTokenType tok; |
1398 | 569 | JsonParseErrorType result; |
1399 | | |
1400 | 569 | #ifndef FRONTEND |
1401 | | |
1402 | | /* |
1403 | | * TODO: clients need some way to put a bound on stack growth. Parse level |
1404 | | * limits maybe? |
1405 | | */ |
1406 | 569 | check_stack_depth(); |
1407 | 569 | #endif |
1408 | | |
1409 | 569 | if (ostart != NULL) |
1410 | 0 | { |
1411 | 0 | result = (*ostart) (sem->semstate); |
1412 | 0 | if (result != JSON_SUCCESS) |
1413 | 0 | return result; |
1414 | 0 | } |
1415 | | |
1416 | | /* |
1417 | | * Data inside an object is at a higher nesting level than the object |
1418 | | * itself. Note that we increment this after we call the semantic routine |
1419 | | * for the object start and restore it before we call the routine for the |
1420 | | * object end. |
1421 | | */ |
1422 | 569 | lex->lex_level++; |
1423 | | |
1424 | 569 | Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START); |
1425 | 569 | result = json_lex(lex); |
1426 | 569 | if (result != JSON_SUCCESS) |
1427 | 4 | return result; |
1428 | | |
1429 | 565 | tok = lex_peek(lex); |
1430 | 565 | switch (tok) |
1431 | 565 | { |
1432 | 563 | case JSON_TOKEN_STRING: |
1433 | 563 | result = parse_object_field(lex, sem); |
1434 | 563 | while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA) |
1435 | 0 | { |
1436 | 0 | result = json_lex(lex); |
1437 | 0 | if (result != JSON_SUCCESS) |
1438 | 0 | break; |
1439 | 0 | result = parse_object_field(lex, sem); |
1440 | 0 | } |
1441 | 563 | break; |
1442 | 0 | case JSON_TOKEN_OBJECT_END: |
1443 | 0 | break; |
1444 | 2 | default: |
1445 | | /* case of an invalid initial token inside the object */ |
1446 | 2 | result = report_parse_error(JSON_PARSE_OBJECT_START, lex); |
1447 | 565 | } |
1448 | 7 | if (result != JSON_SUCCESS) |
1449 | 7 | return result; |
1450 | | |
1451 | 0 | result = lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END); |
1452 | 0 | if (result != JSON_SUCCESS) |
1453 | 0 | return result; |
1454 | | |
1455 | 0 | lex->lex_level--; |
1456 | |
1457 | 0 | if (oend != NULL) |
1458 | 0 | { |
1459 | 0 | result = (*oend) (sem->semstate); |
1460 | 0 | if (result != JSON_SUCCESS) |
1461 | 0 | return result; |
1462 | 0 | } |
1463 | | |
1464 | 0 | return JSON_SUCCESS; |
1465 | 0 | } |
1466 | | |
1467 | | static JsonParseErrorType |
1468 | | parse_array_element(JsonLexContext *lex, const JsonSemAction *sem) |
1469 | 2.57k | { |
1470 | 2.57k | json_aelem_action astart = sem->array_element_start; |
1471 | 2.57k | json_aelem_action aend = sem->array_element_end; |
1472 | 2.57k | JsonTokenType tok = lex_peek(lex); |
1473 | 2.57k | JsonParseErrorType result; |
1474 | 2.57k | bool isnull; |
1475 | | |
1476 | 2.57k | isnull = tok == JSON_TOKEN_NULL; |
1477 | | |
1478 | 2.57k | if (astart != NULL) |
1479 | 0 | { |
1480 | 0 | result = (*astart) (sem->semstate, isnull); |
1481 | 0 | if (result != JSON_SUCCESS) |
1482 | 0 | return result; |
1483 | 0 | } |
1484 | | |
1485 | | /* an array element is any object, array or scalar */ |
1486 | 2.57k | switch (tok) |
1487 | 2.57k | { |
1488 | 400 | case JSON_TOKEN_OBJECT_START: |
1489 | 400 | result = parse_object(lex, sem); |
1490 | 400 | break; |
1491 | 2.15k | case JSON_TOKEN_ARRAY_START: |
1492 | 2.15k | result = parse_array(lex, sem); |
1493 | 2.15k | break; |
1494 | 13 | default: |
1495 | 13 | result = parse_scalar(lex, sem); |
1496 | 2.57k | } |
1497 | | |
1498 | 24 | if (result != JSON_SUCCESS) |
1499 | 15 | return result; |
1500 | | |
1501 | 9 | if (aend != NULL) |
1502 | 0 | { |
1503 | 0 | result = (*aend) (sem->semstate, isnull); |
1504 | 0 | if (result != JSON_SUCCESS) |
1505 | 0 | return result; |
1506 | 0 | } |
1507 | | |
1508 | 9 | return JSON_SUCCESS; |
1509 | 9 | } |
1510 | | |
1511 | | static JsonParseErrorType |
1512 | | parse_array(JsonLexContext *lex, const JsonSemAction *sem) |
1513 | 2.57k | { |
1514 | | /* |
1515 | | * an array is a possibly empty sequence of array elements, separated by |
1516 | | * commas and surrounded by square brackets. |
1517 | | */ |
1518 | 2.57k | json_struct_action astart = sem->array_start; |
1519 | 2.57k | json_struct_action aend = sem->array_end; |
1520 | 2.57k | JsonParseErrorType result; |
1521 | | |
1522 | 2.57k | #ifndef FRONTEND |
1523 | 2.57k | check_stack_depth(); |
1524 | 2.57k | #endif |
1525 | | |
1526 | 2.57k | if (astart != NULL) |
1527 | 0 | { |
1528 | 0 | result = (*astart) (sem->semstate); |
1529 | 0 | if (result != JSON_SUCCESS) |
1530 | 0 | return result; |
1531 | 0 | } |
1532 | | |
1533 | | /* |
1534 | | * Data inside an array is at a higher nesting level than the array |
1535 | | * itself. Note that we increment this after we call the semantic routine |
1536 | | * for the array start and restore it before we call the routine for the |
1537 | | * array end. |
1538 | | */ |
1539 | 2.57k | lex->lex_level++; |
1540 | | |
1541 | 2.57k | result = lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START); |
1542 | 2.57k | if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END) |
1543 | 2.56k | { |
1544 | 2.56k | result = parse_array_element(lex, sem); |
1545 | | |
1546 | 2.57k | while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA) |
1547 | 7 | { |
1548 | 7 | result = json_lex(lex); |
1549 | 7 | if (result != JSON_SUCCESS) |
1550 | 0 | break; |
1551 | 7 | result = parse_array_element(lex, sem); |
1552 | 7 | } |
1553 | 2.56k | } |
1554 | 2.57k | if (result != JSON_SUCCESS) |
1555 | 22 | return result; |
1556 | | |
1557 | 2.55k | result = lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END); |
1558 | 2.55k | if (result != JSON_SUCCESS) |
1559 | 3 | return result; |
1560 | | |
1561 | 2.55k | lex->lex_level--; |
1562 | | |
1563 | 2.55k | if (aend != NULL) |
1564 | 0 | { |
1565 | 0 | result = (*aend) (sem->semstate); |
1566 | 0 | if (result != JSON_SUCCESS) |
1567 | 0 | return result; |
1568 | 0 | } |
1569 | | |
1570 | 2.55k | return JSON_SUCCESS; |
1571 | 2.55k | } |
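/*
 * Editor's note: the two parsers bound nesting differently.  The recursive
 * descent routines above rely on check_stack_depth() (backend only), while
 * the table-driven parser caps lex_level at JSON_TD_MAX_STACK and returns
 * JSON_NESTING_TOO_DEEP instead.
 */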
1572 | | |
1573 | | /* |
1574 | | * Lex one token from the input stream. |
1575 | | * |
1576 | | * When doing incremental parsing, we can reach the end of the input string |
1577 | | * without having (or knowing we have) a complete token. If it's not the |
1578 | | * final chunk of input, the partial token is saved to the lex |
1579 | | * structure's partial_token StringInfo. On subsequent calls, input is appended to this |
1580 | | * buffer until we have something that we think is a complete token, |
1581 | | * which is then lexed using a recursive call to json_lex. Processing then |
1582 | | * continues as normal on subsequent calls. |
1583 | | * |
1584 | | * Note that when doing incremental processing, lex->prev_token_terminator |
1585 | | * should not be relied on. It could point into a previous input chunk or |
1586 | | * worse. |
1587 | | */ |
1588 | | JsonParseErrorType |
1589 | | json_lex(JsonLexContext *lex) |
1590 | 6.08k | { |
1591 | 6.08k | const char *s; |
1592 | 6.08k | const char *const end = lex->input + lex->input_length; |
1593 | 6.08k | JsonParseErrorType result; |
1594 | | |
1595 | 6.08k | if (lex == &failed_oom || lex->inc_state == &failed_inc_oom) |
1596 | 0 | return JSON_OUT_OF_MEMORY; |
1597 | | |
1598 | 6.08k | if (lex->incremental) |
1599 | 0 | { |
1600 | 0 | if (lex->inc_state->partial_completed) |
1601 | 0 | { |
1602 | | /* |
1603 | | * We just lexed a completed partial token on the last call, so |
1604 | | * reset everything |
1605 | | */ |
1606 | 0 | jsonapi_resetStringInfo(&(lex->inc_state->partial_token)); |
1607 | 0 | lex->token_terminator = lex->input; |
1608 | 0 | lex->inc_state->partial_completed = false; |
1609 | 0 | } |
1610 | |
1611 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
1612 | | /* Make sure our partial token buffer is valid before using it below. */ |
1613 | | if (PQExpBufferDataBroken(lex->inc_state->partial_token)) |
1614 | | return JSON_OUT_OF_MEMORY; |
1615 | | #endif |
1616 | 0 | } |
1617 | | |
1618 | 6.08k | s = lex->token_terminator; |
1619 | | |
1620 | 6.08k | if (lex->incremental && lex->inc_state->partial_token.len) |
1621 | 0 | { |
1622 | | /* |
1623 | | * We have a partial token. Extend it and, once it is complete, lex it |
1624 | | * via a recursive call. |
1625 | | */ |
1626 | 0 | jsonapi_StrValType *ptok = &(lex->inc_state->partial_token); |
1627 | 0 | size_t added = 0; |
1628 | 0 | bool tok_done = false; |
1629 | 0 | JsonLexContext dummy_lex = {0}; |
1630 | 0 | JsonParseErrorType partial_result; |
1631 | |
1632 | 0 | if (ptok->data[0] == '"') |
1633 | 0 | { |
1634 | | /* |
1635 | | * It's a string. Accumulate characters until we reach an |
1636 | | * unescaped '"'. |
1637 | | */ |
1638 | 0 | int escapes = 0; |
1639 | |
1640 | 0 | for (int i = ptok->len - 1; i > 0; i--) |
1641 | 0 | { |
1642 | | /* count the trailing backslashes on the partial token */ |
1643 | 0 | if (ptok->data[i] == '\\') |
1644 | 0 | escapes++; |
1645 | 0 | else |
1646 | 0 | break; |
1647 | 0 | } |
1648 | |
1649 | 0 | for (size_t i = 0; i < lex->input_length; i++) |
1650 | 0 | { |
1651 | 0 | char c = lex->input[i]; |
1652 | |
1653 | 0 | jsonapi_appendStringInfoCharMacro(ptok, c); |
1654 | 0 | added++; |
1655 | 0 | if (c == '"' && escapes % 2 == 0) |
1656 | 0 | { |
1657 | 0 | tok_done = true; |
1658 | 0 | break; |
1659 | 0 | } |
1660 | 0 | if (c == '\\') |
1661 | 0 | escapes++; |
1662 | 0 | else |
1663 | 0 | escapes = 0; |
1664 | 0 | } |
1665 | 0 | } |
1666 | 0 | else |
1667 | 0 | { |
1668 | | /* not a string */ |
1669 | 0 | char c = ptok->data[0]; |
1670 | |
|
1671 | 0 | if (c == '-' || (c >= '0' && c <= '9')) |
1672 | 0 | { |
1673 | | /* for numbers look for possible numeric continuations */ |
1674 | |
1675 | 0 | bool numend = false; |
1676 | |
1677 | 0 | for (size_t i = 0; i < lex->input_length && !numend; i++) |
1678 | 0 | { |
1679 | 0 | char cc = lex->input[i]; |
1680 | |
1681 | 0 | switch (cc) |
1682 | 0 | { |
1683 | 0 | case '+': |
1684 | 0 | case '-': |
1685 | 0 | case 'e': |
1686 | 0 | case 'E': |
1687 | 0 | case '0': |
1688 | 0 | case '1': |
1689 | 0 | case '2': |
1690 | 0 | case '3': |
1691 | 0 | case '4': |
1692 | 0 | case '5': |
1693 | 0 | case '6': |
1694 | 0 | case '7': |
1695 | 0 | case '8': |
1696 | 0 | case '9': |
1697 | 0 | { |
1698 | 0 | jsonapi_appendStringInfoCharMacro(ptok, cc); |
1699 | 0 | added++; |
1700 | 0 | } |
1701 | 0 | break; |
1702 | 0 | default: |
1703 | 0 | numend = true; |
1704 | 0 | } |
1705 | 0 | } |
1706 | 0 | } |
1707 | | |
1708 | | /* |
1709 | | * Add any remaining alphanumeric chars. This takes care of the |
1710 | | * {null, false, true} literals as well as any trailing |
1711 | | * alphanumeric junk on non-string tokens. |
1712 | | */ |
1713 | 0 | for (size_t i = added; i < lex->input_length; i++) |
1714 | 0 | { |
1715 | 0 | char cc = lex->input[i]; |
1716 | |
|
1717 | 0 | if (JSON_ALPHANUMERIC_CHAR(cc)) |
1718 | 0 | { |
1719 | 0 | jsonapi_appendStringInfoCharMacro(ptok, cc); |
1720 | 0 | added++; |
1721 | 0 | } |
1722 | 0 | else |
1723 | 0 | { |
1724 | 0 | tok_done = true; |
1725 | 0 | break; |
1726 | 0 | } |
1727 | 0 | } |
1728 | 0 | if (added == lex->input_length && |
1729 | 0 | lex->inc_state->is_last_chunk) |
1730 | 0 | { |
1731 | 0 | tok_done = true; |
1732 | 0 | } |
1733 | 0 | } |
1734 | | |
1735 | 0 | if (!tok_done) |
1736 | 0 | { |
1737 | | /* We should have consumed the whole chunk in this case. */ |
1738 | 0 | Assert(added == lex->input_length); |
1739 | |
1740 | 0 | if (!lex->inc_state->is_last_chunk) |
1741 | 0 | return JSON_INCOMPLETE; |
1742 | | |
1743 | | /* json_errdetail() needs access to the accumulated token. */ |
1744 | 0 | lex->token_start = ptok->data; |
1745 | 0 | lex->token_terminator = ptok->data + ptok->len; |
1746 | 0 | return JSON_INVALID_TOKEN; |
1747 | 0 | } |
1748 | | |
1749 | | /* |
1750 | | * Everything up to lex->input[added] has been added to the partial |
1751 | | * token, so move the input past it. |
1752 | | */ |
1753 | 0 | lex->input += added; |
1754 | 0 | lex->input_length -= added; |
1755 | |
1756 | 0 | dummy_lex.input = dummy_lex.token_terminator = |
1757 | 0 | dummy_lex.line_start = ptok->data; |
1758 | 0 | dummy_lex.line_number = lex->line_number; |
1759 | 0 | dummy_lex.input_length = ptok->len; |
1760 | 0 | dummy_lex.input_encoding = lex->input_encoding; |
1761 | 0 | dummy_lex.incremental = false; |
1762 | 0 | dummy_lex.need_escapes = lex->need_escapes; |
1763 | 0 | dummy_lex.strval = lex->strval; |
1764 | |
1765 | 0 | partial_result = json_lex(&dummy_lex); |
1766 | | |
1767 | | /* |
1768 | | * We either have a complete token or an error. In either case we need |
1769 | | * to point to the partial token data for the semantic or error |
1770 | | * routines. If it's not an error we'll readjust on the next call to |
1771 | | * json_lex. |
1772 | | */ |
1773 | 0 | lex->token_type = dummy_lex.token_type; |
1774 | 0 | lex->line_number = dummy_lex.line_number; |
1775 | | |
1776 | | /* |
1777 | | * We know the prev_token_terminator must be back in some previous |
1778 | | * piece of input, so we just make it NULL. |
1779 | | */ |
1780 | 0 | lex->prev_token_terminator = NULL; |
1781 | | |
1782 | | /* |
1783 | | * Normally token_start would be ptok->data, but it could be later, |
1784 | | * see json_lex_string's handling of invalid escapes. |
1785 | | */ |
1786 | 0 | lex->token_start = dummy_lex.token_start; |
1787 | 0 | lex->token_terminator = dummy_lex.token_terminator; |
1788 | 0 | if (partial_result == JSON_SUCCESS) |
1789 | 0 | { |
1790 | | /* make sure we've used all the input */ |
1791 | 0 | if (lex->token_terminator - lex->token_start != ptok->len) |
1792 | 0 | { |
1793 | 0 | Assert(false); |
1794 | 0 | return JSON_INVALID_TOKEN; |
1795 | 0 | } |
1796 | | |
1797 | 0 | lex->inc_state->partial_completed = true; |
1798 | 0 | } |
1799 | 0 | return partial_result; |
1800 | | /* end of partial token processing */ |
1801 | 0 | } |
1802 | | |
1803 | | /* Skip leading whitespace. */ |
1804 | 7.64k | while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')) |
1805 | 1.55k | { |
1806 | 1.55k | if (*s++ == '\n') |
1807 | 198 | { |
1808 | 198 | ++lex->line_number; |
1809 | 198 | lex->line_start = s; |
1810 | 198 | } |
1811 | 1.55k | } |
1812 | 6.08k | lex->token_start = s; |
1813 | | |
1814 | | /* Determine token type. */ |
1815 | 6.08k | if (s >= end) |
1816 | 0 | { |
1817 | 0 | lex->token_start = NULL; |
1818 | 0 | lex->prev_token_terminator = lex->token_terminator; |
1819 | 0 | lex->token_terminator = s; |
1820 | 0 | lex->token_type = JSON_TOKEN_END; |
1821 | 0 | } |
1822 | 6.08k | else |
1823 | 6.08k | { |
1824 | 6.08k | switch (*s) |
1825 | 6.08k | { |
1826 | | /* Single-character token, some kind of punctuation mark. */ |
1827 | 572 | case '{': |
1828 | 572 | lex->prev_token_terminator = lex->token_terminator; |
1829 | 572 | lex->token_terminator = s + 1; |
1830 | 572 | lex->token_type = JSON_TOKEN_OBJECT_START; |
1831 | 572 | break; |
1832 | 2 | case '}': |
1833 | 2 | lex->prev_token_terminator = lex->token_terminator; |
1834 | 2 | lex->token_terminator = s + 1; |
1835 | 2 | lex->token_type = JSON_TOKEN_OBJECT_END; |
1836 | 2 | break; |
1837 | 2.57k | case '[': |
1838 | 2.57k | lex->prev_token_terminator = lex->token_terminator; |
1839 | 2.57k | lex->token_terminator = s + 1; |
1840 | 2.57k | lex->token_type = JSON_TOKEN_ARRAY_START; |
1841 | 2.57k | break; |
1842 | 2 | case ']': |
1843 | 2 | lex->prev_token_terminator = lex->token_terminator; |
1844 | 2 | lex->token_terminator = s + 1; |
1845 | 2 | lex->token_type = JSON_TOKEN_ARRAY_END; |
1846 | 2 | break; |
1847 | 13 | case ',': |
1848 | 13 | lex->prev_token_terminator = lex->token_terminator; |
1849 | 13 | lex->token_terminator = s + 1; |
1850 | 13 | lex->token_type = JSON_TOKEN_COMMA; |
1851 | 13 | break; |
1852 | 566 | case ':': |
1853 | 566 | lex->prev_token_terminator = lex->token_terminator; |
1854 | 566 | lex->token_terminator = s + 1; |
1855 | 566 | lex->token_type = JSON_TOKEN_COLON; |
1856 | 566 | break; |
1857 | 1.43k | case '"': |
1858 | | /* string */ |
1859 | 1.43k | result = json_lex_string(lex); |
1860 | 1.43k | if (result != JSON_SUCCESS) |
1861 | 843 | return result; |
1862 | 596 | lex->token_type = JSON_TOKEN_STRING; |
1863 | 596 | break; |
1864 | 97 | case '-': |
1865 | | /* Negative number. */ |
1866 | 97 | result = json_lex_number(lex, s + 1, NULL, NULL); |
1867 | 97 | if (result != JSON_SUCCESS) |
1868 | 67 | return result; |
1869 | 30 | lex->token_type = JSON_TOKEN_NUMBER; |
1870 | 30 | break; |
1871 | 59 | case '0': |
1872 | 109 | case '1': |
1873 | 161 | case '2': |
1874 | 204 | case '3': |
1875 | 229 | case '4': |
1876 | 282 | case '5': |
1877 | 334 | case '6': |
1878 | 375 | case '7': |
1879 | 415 | case '8': |
1880 | 444 | case '9': |
1881 | | /* Positive number. */ |
1882 | 444 | result = json_lex_number(lex, s, NULL, NULL); |
1883 | 444 | if (result != JSON_SUCCESS) |
1884 | 213 | return result; |
1885 | 231 | lex->token_type = JSON_TOKEN_NUMBER; |
1886 | 231 | break; |
1887 | 376 | default: |
1888 | 376 | { |
1889 | 376 | const char *p; |
1890 | | |
1891 | | /* |
1892 | | * We're not dealing with a string, number, legal |
1893 | | * punctuation mark, or end of string. The only legal |
1894 | | * tokens we might find here are true, false, and null, |
1895 | | * but for error reporting purposes we scan until we see a |
1896 | | * non-alphanumeric character. That way, we can report |
1897 | | * the whole word as an unexpected token, rather than just |
1898 | | * some unintuitive prefix thereof. |
1899 | | */ |
1900 | 4.19M | for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++) |
1901 | 4.19M | /* skip */ ; |
1902 | | |
1903 | | /* |
1904 | | * We got some sort of unexpected punctuation or an |
1905 | | * otherwise unexpected character, so just complain about |
1906 | | * that one character. |
1907 | | */ |
1908 | 376 | if (p == s) |
1909 | 202 | { |
1910 | 202 | lex->prev_token_terminator = lex->token_terminator; |
1911 | 202 | lex->token_terminator = s + 1; |
1912 | 202 | return JSON_INVALID_TOKEN; |
1913 | 202 | } |
1914 | | |
1915 | 174 | if (lex->incremental && !lex->inc_state->is_last_chunk && |
1916 | 174 | p == lex->input + lex->input_length) |
1917 | 0 | { |
1918 | 0 | jsonapi_appendBinaryStringInfo(&(lex->inc_state->partial_token), s, end - s); |
1919 | 0 | return JSON_INCOMPLETE; |
1920 | 0 | } |
1921 | | |
1922 | | /* |
1923 | | * We've got a real alphanumeric token here. If it |
1924 | | * happens to be true, false, or null, all is well. If |
1925 | | * not, error out. |
1926 | | */ |
1927 | 174 | lex->prev_token_terminator = lex->token_terminator; |
1928 | 174 | lex->token_terminator = p; |
1929 | 174 | if (p - s == 4) |
1930 | 43 | { |
1931 | 43 | if (memcmp(s, "true", 4) == 0) |
1932 | 6 | lex->token_type = JSON_TOKEN_TRUE; |
1933 | 37 | else if (memcmp(s, "null", 4) == 0) |
1934 | 6 | lex->token_type = JSON_TOKEN_NULL; |
1935 | 31 | else |
1936 | 31 | return JSON_INVALID_TOKEN; |
1937 | 43 | } |
1938 | 131 | else if (p - s == 5 && memcmp(s, "false", 5) == 0) |
1939 | 3 | lex->token_type = JSON_TOKEN_FALSE; |
1940 | 128 | else |
1941 | 128 | return JSON_INVALID_TOKEN; |
1942 | 174 | } |
1943 | 6.08k | } /* end of switch */ |
1944 | 6.08k | } |
1945 | | |
1946 | 4.60k | if (lex->incremental && lex->token_type == JSON_TOKEN_END && !lex->inc_state->is_last_chunk) |
1947 | 0 | return JSON_INCOMPLETE; |
1948 | 4.60k | else |
1949 | 4.60k | return JSON_SUCCESS; |
1950 | 4.60k | } |
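
A hedged sketch of the chunk-boundary behavior described in json_lex's header comment, assuming the incremental entry points declared in common/jsonapi.h (makeJsonLexContextIncremental, pg_parse_json_incremental, and the exported nullSemAction); the literal "null" below is deliberately split across chunks so the lexer must buffer the "nu" fragment in inc_state->partial_token:

    static JsonParseErrorType
    demo_parse_in_chunks(void)
    {
        JsonLexContext lex = {0};
        const char *chunks[] = {"[nu", "ll]"};  /* "null" split mid-token */
        JsonParseErrorType res = JSON_INCOMPLETE;

        makeJsonLexContextIncremental(&lex, PG_UTF8, false);
        for (int i = 0; i < 2; i++)
        {
            res = pg_parse_json_incremental(&lex, &nullSemAction,
                                            chunks[i], strlen(chunks[i]),
                                            i == 1 /* is_last */);
            if (res != JSON_INCOMPLETE && res != JSON_SUCCESS)
                break;          /* a real parse error */
        }
        freeJsonLexContext(&lex);
        return res;             /* JSON_SUCCESS if both chunks lexed cleanly */
    }

The first call returns JSON_INCOMPLETE after stashing the partial token; the second call extends it, re-lexes it via the dummy_lex recursion above, and completes the parse.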
1951 | | |
1952 | | /* |
1953 | | * The next token in the input stream is known to be a string; lex it. |
1954 | | * |
1955 | | * If lex->need_escapes is set, fill lex->strval with the decoded string. |
1956 | | * Set lex->token_terminator to the end of the decoded input, and in |
1957 | | * success cases, transfer its previous value to lex->prev_token_terminator. |
1958 | | * Return JSON_SUCCESS or an error code. |
1959 | | * |
1960 | | * Note: be careful that all error exits advance lex->token_terminator |
1961 | | * to the point after the character we detected the error on. |
1962 | | */ |
1963 | | static inline JsonParseErrorType |
1964 | | json_lex_string(JsonLexContext *lex) |
1965 | 1.43k | { |
1966 | 1.43k | const char *s; |
1967 | 1.43k | const char *const end = lex->input + lex->input_length; |
1968 | 1.43k | int hi_surrogate = -1; |
1969 | | |
1970 | | /* Convenience macros for error exits */ |
1971 | 1.43k | #define FAIL_OR_INCOMPLETE_AT_CHAR_START(code) \ |
1972 | 1.43k | do { \ |
1973 | 0 | if (lex->incremental && !lex->inc_state->is_last_chunk) \ |
1974 | 0 | { \ |
1975 | 0 | jsonapi_appendBinaryStringInfo(&lex->inc_state->partial_token, \ |
1976 | 0 | lex->token_start, \ |
1977 | 0 | end - lex->token_start); \ |
1978 | 0 | return JSON_INCOMPLETE; \ |
1979 | 0 | } \ |
1980 | 0 | lex->token_terminator = s; \ |
1981 | 0 | return code; \ |
1982 | 0 | } while (0) |
1983 | 1.43k | #define FAIL_AT_CHAR_END(code) \ |
1984 | 1.43k | do { \ |
1985 | 643 | ptrdiff_t remaining = end - s; \ |
1986 | 643 | int charlen; \ |
1987 | 643 | charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \ |
1988 | 643 | s, remaining); \ |
1989 | 643 | lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \ |
1990 | 643 | return code; \ |
1991 | 643 | } while (0) |
1992 | | |
1993 | 1.43k | if (lex->need_escapes) |
1994 | 1.43k | { |
1995 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
1996 | | /* make sure initialization succeeded */ |
1997 | | if (lex->strval == NULL) |
1998 | | return JSON_OUT_OF_MEMORY; |
1999 | | #endif |
2000 | 1.43k | jsonapi_resetStringInfo(lex->strval); |
2001 | 1.43k | } |
2002 | | |
2003 | 1.43k | Assert(lex->input_length > 0); |
2004 | 1.43k | s = lex->token_start; |
2005 | 1.43k | for (;;) |
2006 | 45.9k | { |
2007 | 45.9k | s++; |
2008 | | /* Premature end of the string. */ |
2009 | 45.9k | if (s >= end) |
2010 | 0 | FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN); |
2011 | 45.9k | else if (*s == '"') |
2012 | 607 | break; |
2013 | 45.3k | else if (*s == '\\') |
2014 | 31.8k | { |
2015 | | /* OK, we have an escape character. */ |
2016 | 31.8k | s++; |
2017 | 31.8k | if (s >= end) |
2018 | 0 | FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN); |
2019 | 31.8k | else if (*s == 'u') |
2020 | 2.23k | { |
2021 | 2.23k | int i; |
2022 | 2.23k | int ch = 0; |
2023 | | |
2024 | 10.8k | for (i = 1; i <= 4; i++) |
2025 | 8.67k | { |
2026 | 8.67k | s++; |
2027 | 8.67k | if (s >= end) |
2028 | 0 | FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN); |
2029 | 8.67k | else if (*s >= '0' && *s <= '9') |
2030 | 6.47k | ch = (ch * 16) + (*s - '0'); |
2031 | 2.20k | else if (*s >= 'a' && *s <= 'f') |
2032 | 1.10k | ch = (ch * 16) + (*s - 'a') + 10; |
2033 | 1.09k | else if (*s >= 'A' && *s <= 'F') |
2034 | 978 | ch = (ch * 16) + (*s - 'A') + 10; |
2035 | 115 | else |
2036 | 115 | FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT); |
2037 | 8.67k | } |
2038 | 2.12k | if (lex->need_escapes) |
2039 | 2.12k | { |
2040 | | /* |
2041 | | * Combine surrogate pairs. |
2042 | | */ |
2043 | 2.12k | if (is_utf16_surrogate_first(ch)) |
2044 | 292 | { |
2045 | 292 | if (hi_surrogate != -1) |
2046 | 50 | FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE); |
2047 | 242 | hi_surrogate = ch; |
2048 | 242 | continue; |
2049 | 292 | } |
2050 | 1.83k | else if (is_utf16_surrogate_second(ch)) |
2051 | 78 | { |
2052 | 78 | if (hi_surrogate == -1) |
2053 | 53 | FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); |
2054 | 25 | ch = surrogate_pair_to_codepoint(hi_surrogate, ch); |
2055 | 25 | hi_surrogate = -1; |
2056 | 25 | } |
2057 | | |
2058 | 1.77k | if (hi_surrogate != -1) |
2059 | 47 | FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); |
2060 | | |
2061 | | /* |
2062 | | * Reject invalid cases. We can't have a value above |
2063 | | * 0xFFFF here (since we only accepted 4 hex digits |
2064 | | * above), so no need to test for out-of-range chars. |
2065 | | */ |
2066 | 1.73k | if (ch == 0) |
2067 | 40 | { |
2068 | | /* We can't allow this, since our TEXT type can't store it */ |
2069 | 40 | FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO); |
2070 | 40 | } |
2071 | | |
2072 | | /* |
2073 | | * Add the represented character to lex->strval. In the |
2074 | | * backend, we can let pg_unicode_to_server_noerror() |
2075 | | * handle any required character set conversion; in |
2076 | | * frontend, we can only deal with trivial conversions. |
2077 | | */ |
2078 | 1.69k | #ifndef FRONTEND |
2079 | 1.69k | { |
2080 | 1.69k | char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; |
2081 | | |
2082 | 1.69k | if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf)) |
2083 | 106 | FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE); |
2084 | 1.58k | appendStringInfoString(lex->strval, cbuf); |
2085 | 1.58k | } |
2086 | | #else |
2087 | | if (lex->input_encoding == PG_UTF8) |
2088 | | { |
2089 | | /* OK, we can map the code point to UTF8 easily */ |
2090 | | char utf8str[5]; |
2091 | | int utf8len; |
2092 | | |
2093 | | unicode_to_utf8(ch, (unsigned char *) utf8str); |
2094 | | utf8len = pg_utf_mblen((unsigned char *) utf8str); |
2095 | | jsonapi_appendBinaryStringInfo(lex->strval, utf8str, utf8len); |
2096 | | } |
2097 | | else if (ch <= 0x007f) |
2098 | | { |
2099 | | /* The ASCII range is the same in all encodings */ |
2100 | | jsonapi_appendStringInfoChar(lex->strval, (char) ch); |
2101 | | } |
2102 | | else |
2103 | | FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE); |
2104 | | #endif /* FRONTEND */ |
2105 | 1.58k | } |
2106 | 2.12k | } |
2107 | 29.6k | else if (lex->need_escapes) |
2108 | 29.6k | { |
2109 | 29.6k | if (hi_surrogate != -1) |
2110 | 51 | FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); |
2111 | | |
2112 | 29.5k | switch (*s) |
2113 | 29.5k | { |
2114 | 557 | case '"': |
2115 | 12.8k | case '\\': |
2116 | 13.2k | case '/': |
2117 | 13.2k | jsonapi_appendStringInfoChar(lex->strval, *s); |
2118 | 13.2k | break; |
2119 | 9.47k | case 'b': |
2120 | 9.47k | jsonapi_appendStringInfoChar(lex->strval, '\b'); |
2121 | 9.47k | break; |
2122 | 5.05k | case 'f': |
2123 | 5.05k | jsonapi_appendStringInfoChar(lex->strval, '\f'); |
2124 | 5.05k | break; |
2125 | 444 | case 'n': |
2126 | 444 | jsonapi_appendStringInfoChar(lex->strval, '\n'); |
2127 | 444 | break; |
2128 | 808 | case 'r': |
2129 | 808 | jsonapi_appendStringInfoChar(lex->strval, '\r'); |
2130 | 808 | break; |
2131 | 376 | case 't': |
2132 | 376 | jsonapi_appendStringInfoChar(lex->strval, '\t'); |
2133 | 376 | break; |
2134 | 125 | default: |
2135 | | |
2136 | | /* |
2137 | | * Not a valid string escape, so signal error. We |
2138 | | * adjust token_start so that just the escape sequence |
2139 | | * is reported, not the whole string. |
2140 | | */ |
2141 | 125 | lex->token_start = s; |
2142 | 125 | FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); |
2143 | 29.5k | } |
2144 | 29.5k | } |
2145 | 0 | else if (strchr("\"\\/bfnrt", *s) == NULL) |
2146 | 0 | { |
2147 | | /* |
2148 | | * Simpler processing if we're not bothered about de-escaping |
2149 | | * |
2150 | | * It's very tempting to remove the strchr() call here and |
2151 | | * replace it with a switch statement, but testing so far has |
2152 | | * shown it's not a performance win. |
2153 | | */ |
2154 | 0 | lex->token_start = s; |
2155 | 0 | FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); |
2156 | 0 | } |
2157 | 31.8k | } |
2158 | 13.5k | else |
2159 | 13.5k | { |
2160 | 13.5k | const char *p = s; |
2161 | | |
2162 | 13.5k | if (hi_surrogate != -1) |
2163 | 56 | FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); |
2164 | | |
2165 | | /* |
2166 | | * Skip to the first byte that requires special handling, so we |
2167 | | * can batch calls to jsonapi_appendBinaryStringInfo. |
2168 | | */ |
2169 | 435k | while (p < end - sizeof(Vector8) && |
2170 | 435k | !pg_lfind8('\\', (uint8 *) p, sizeof(Vector8)) && |
2171 | 435k | !pg_lfind8('"', (uint8 *) p, sizeof(Vector8)) && |
2172 | 435k | !pg_lfind8_le(31, (uint8 *) p, sizeof(Vector8))) |
2173 | 422k | p += sizeof(Vector8); |
2174 | | |
2175 | 44.2k | for (; p < end; p++) |
2176 | 44.2k | { |
2177 | 44.2k | if (*p == '\\' || *p == '"') |
2178 | 13.2k | break; |
2179 | 30.9k | else if ((unsigned char) *p <= 31) |
2180 | 189 | { |
2181 | | /* Per RFC4627, these characters MUST be escaped. */ |
2182 | | /* |
2183 | | * Since *p isn't printable, exclude it from the context |
2184 | | * string |
2185 | | */ |
2186 | 189 | lex->token_terminator = p; |
2187 | 189 | return JSON_ESCAPING_REQUIRED; |
2188 | 189 | } |
2189 | 44.2k | } |
2190 | | |
2191 | 13.2k | if (lex->need_escapes) |
2192 | 13.2k | jsonapi_appendBinaryStringInfo(lex->strval, s, p - s); |
2193 | | |
2194 | | /* |
2195 | | * s will be incremented at the top of the loop, so set it to just |
2196 | | * behind our lookahead position |
2197 | | */ |
2198 | 13.2k | s = p - 1; |
2199 | 13.2k | } |
2200 | 45.9k | } |
2201 | | |
2202 | 607 | if (hi_surrogate != -1) |
2203 | 11 | { |
2204 | 11 | lex->token_terminator = s + 1; |
2205 | 11 | return JSON_UNICODE_LOW_SURROGATE; |
2206 | 11 | } |
2207 | | |
2208 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
2209 | | if (lex->need_escapes && PQExpBufferBroken(lex->strval)) |
2210 | | return JSON_OUT_OF_MEMORY; |
2211 | | #endif |
2212 | | |
2213 | | /* Hooray, we found the end of the string! */ |
2214 | 596 | lex->prev_token_terminator = lex->token_terminator; |
2215 | 596 | lex->token_terminator = s + 1; |
2216 | 596 | return JSON_SUCCESS; |
2217 | | |
2218 | 607 | #undef FAIL_OR_INCOMPLETE_AT_CHAR_START |
2219 | 607 | #undef FAIL_AT_CHAR_END |
2220 | 607 | } |
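
The surrogate-pair recombination in json_lex_string follows the standard UTF-16 decoding rule; surrogate_pair_to_codepoint (from mb/pg_wchar.h) performs the equivalent computation. A worked sketch of the arithmetic, for illustration only:

    static inline unsigned int
    demo_surrogate_pair_to_codepoint(unsigned int hi, unsigned int lo)
    {
        /* requires hi in [0xD800, 0xDBFF] and lo in [0xDC00, 0xDFFF] */
        return ((hi & 0x3FF) << 10) + (lo & 0x3FF) + 0x10000;
    }

    /*
     * Example: "\uD83D\uDE00" decodes to U+1F600:
     *   (0xD83D & 0x3FF) << 10    = 0x0F400
     *   (0xDE00 & 0x3FF)          = 0x00200
     *   0x0F400 + 0x200 + 0x10000 = 0x1F600
     */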
2221 | | |
2222 | | /* |
2223 | | * The next token in the input stream is known to be a number; lex it. |
2224 | | * |
2225 | | * In JSON, a number consists of four parts: |
2226 | | * |
2227 | | * (1) An optional minus sign ('-'). |
2228 | | * |
2229 | | * (2) Either a single '0', or a string of one or more digits that does not |
2230 | | * begin with a '0'. |
2231 | | * |
2232 | | * (3) An optional decimal part, consisting of a period ('.') followed by |
2233 | | * one or more digits. (Note: While this part can be omitted |
2234 | | * completely, it's not OK to have only the decimal point without |
2235 | | * any digits afterwards.) |
2236 | | * |
2237 | | * (4) An optional exponent part, consisting of 'e' or 'E', optionally |
2238 | | * followed by '+' or '-', followed by one or more digits. (Note: |
2239 | | * As with the decimal part, if 'e' or 'E' is present, it must be |
2240 | | * followed by at least one digit.) |
2241 | | * |
2242 | | * The 's' argument to this function points to the ostensible beginning |
2243 | | * of part 2 - i.e. the character after any optional minus sign, or the |
2244 | | * first character of the string if there is none. |
2245 | | * |
2246 | | * If num_err is not NULL, we return an error flag to *num_err rather than |
2247 | | * raising an error for a badly-formed number. Also, if total_len is not NULL |
2248 | | * the distance from lex->input to the token end+1 is returned to *total_len. |
2249 | | */ |
2250 | | static inline JsonParseErrorType |
2251 | | json_lex_number(JsonLexContext *lex, const char *s, |
2252 | | bool *num_err, size_t *total_len) |
2253 | 541 | { |
2254 | 541 | bool error = false; |
2255 | 541 | int len = s - lex->input; |
2256 | | |
2257 | | /* Part (1): leading sign indicator. */ |
2258 | | /* Caller already did this for us; so do nothing. */ |
2259 | | |
2260 | | /* Part (2): parse main digit string. */ |
2261 | 541 | if (len < lex->input_length && *s == '0') |
2262 | 60 | { |
2263 | 60 | s++; |
2264 | 60 | len++; |
2265 | 60 | } |
2266 | 481 | else if (len < lex->input_length && *s >= '1' && *s <= '9') |
2267 | 424 | { |
2268 | 424 | do |
2269 | 60.7k | { |
2270 | 60.7k | s++; |
2271 | 60.7k | len++; |
2272 | 60.7k | } while (len < lex->input_length && *s >= '0' && *s <= '9'); |
2273 | 424 | } |
2274 | 57 | else |
2275 | 57 | error = true; |
2276 | | |
2277 | | /* Part (3): parse optional decimal portion. */ |
2278 | 541 | if (len < lex->input_length && *s == '.') |
2279 | 137 | { |
2280 | 137 | s++; |
2281 | 137 | len++; |
2282 | 137 | if (len == lex->input_length || *s < '0' || *s > '9') |
2283 | 39 | error = true; |
2284 | 98 | else |
2285 | 98 | { |
2286 | 98 | do |
2287 | 16.8k | { |
2288 | 16.8k | s++; |
2289 | 16.8k | len++; |
2290 | 16.8k | } while (len < lex->input_length && *s >= '0' && *s <= '9'); |
2291 | 98 | } |
2292 | 137 | } |
2293 | | |
2294 | | /* Part (4): parse optional exponent. */ |
2295 | 541 | if (len < lex->input_length && (*s == 'e' || *s == 'E')) |
2296 | 166 | { |
2297 | 166 | s++; |
2298 | 166 | len++; |
2299 | 166 | if (len < lex->input_length && (*s == '+' || *s == '-')) |
2300 | 14 | { |
2301 | 14 | s++; |
2302 | 14 | len++; |
2303 | 14 | } |
2304 | 166 | if (len == lex->input_length || *s < '0' || *s > '9') |
2305 | 59 | error = true; |
2306 | 107 | else |
2307 | 107 | { |
2308 | 107 | do |
2309 | 24.3k | { |
2310 | 24.3k | s++; |
2311 | 24.3k | len++; |
2312 | 24.3k | } while (len < lex->input_length && *s >= '0' && *s <= '9'); |
2313 | 107 | } |
2314 | 166 | } |
2315 | | |
2316 | | /* |
2317 | | * Check for trailing garbage. As in json_lex(), any alphanumeric stuff |
2318 | | * here should be considered part of the token for error-reporting |
2319 | | * purposes. |
2320 | | */ |
2321 | 557k | for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++) |
2322 | 556k | error = true; |
2323 | | |
2324 | 541 | if (total_len != NULL) |
2325 | 0 | *total_len = len; |
2326 | | |
2327 | 541 | if (lex->incremental && !lex->inc_state->is_last_chunk && |
2328 | 541 | len >= lex->input_length) |
2329 | 0 | { |
2330 | 0 | jsonapi_appendBinaryStringInfo(&lex->inc_state->partial_token, |
2331 | 0 | lex->token_start, s - lex->token_start); |
2332 | 0 | if (num_err != NULL) |
2333 | 0 | *num_err = error; |
2334 | |
2335 | 0 | return JSON_INCOMPLETE; |
2336 | 0 | } |
2337 | 541 | else if (num_err != NULL) |
2338 | 0 | { |
2339 | | /* let the caller handle any error */ |
2340 | 0 | *num_err = error; |
2341 | 0 | } |
2342 | 541 | else |
2343 | 541 | { |
2344 | | /* return token endpoint */ |
2345 | 541 | lex->prev_token_terminator = lex->token_terminator; |
2346 | 541 | lex->token_terminator = s; |
2347 | | /* handle error if any */ |
2348 | 541 | if (error) |
2349 | 280 | return JSON_INVALID_TOKEN; |
2350 | 541 | } |
2351 | | |
2352 | 261 | return JSON_SUCCESS; |
2353 | 541 | } |
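
For reference, the four-part grammar enforced above is equivalent to a single regular expression; this is purely illustrative, the lexer never builds one:

    /*
     *   -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?
     *
     * Accepted:  0    -0.5    12e3    1.25E-7
     * Rejected:  01   1.      .5      1e+     1x (trailing-junk scan)
     */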
2354 | | |
2355 | | /* |
2356 | | * Report a parse error. |
2357 | | * |
2358 | | * lex->token_start and lex->token_terminator must identify the current token. |
2359 | | */ |
2360 | | static JsonParseErrorType |
2361 | | report_parse_error(JsonParseContext ctx, JsonLexContext *lex) |
2362 | 52 | { |
2363 | | /* Handle case where the input ended prematurely. */ |
2364 | 52 | if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END) |
2365 | 0 | return JSON_EXPECTED_MORE; |
2366 | | |
2367 | | /* Otherwise choose the error type based on the parsing context. */ |
2368 | 52 | switch (ctx) |
2369 | 52 | { |
2370 | 41 | case JSON_PARSE_END: |
2371 | 41 | return JSON_EXPECTED_END; |
2372 | 5 | case JSON_PARSE_VALUE: |
2373 | 5 | return JSON_EXPECTED_JSON; |
2374 | 0 | case JSON_PARSE_STRING: |
2375 | 0 | return JSON_EXPECTED_STRING; |
2376 | 0 | case JSON_PARSE_ARRAY_START: |
2377 | 0 | return JSON_EXPECTED_ARRAY_FIRST; |
2378 | 2 | case JSON_PARSE_ARRAY_NEXT: |
2379 | 2 | return JSON_EXPECTED_ARRAY_NEXT; |
2380 | 2 | case JSON_PARSE_OBJECT_START: |
2381 | 2 | return JSON_EXPECTED_OBJECT_FIRST; |
2382 | 2 | case JSON_PARSE_OBJECT_LABEL: |
2383 | 2 | return JSON_EXPECTED_COLON; |
2384 | 0 | case JSON_PARSE_OBJECT_NEXT: |
2385 | 0 | return JSON_EXPECTED_OBJECT_NEXT; |
2386 | 0 | case JSON_PARSE_OBJECT_COMMA: |
2387 | 0 | return JSON_EXPECTED_STRING; |
2388 | 52 | } |
2389 | | |
2390 | | /* |
2391 | | * We don't use a default: case, so that the compiler will warn about |
2392 | | * unhandled enum values. |
2393 | | */ |
2394 | 0 | Assert(false); |
2395 | 0 | return JSON_SUCCESS; /* silence stupider compilers */ |
2396 | 52 | } |
2397 | | |
2398 | | /* |
2399 | | * Construct an (already translated) detail message for a JSON error. |
2400 | | * |
2401 | | * The returned pointer should not be freed, the allocation is either static |
2402 | | * or owned by the JsonLexContext. |
2403 | | */ |
2404 | | char * |
2405 | | json_errdetail(JsonParseErrorType error, JsonLexContext *lex) |
2406 | 0 | { |
2407 | 0 | if (error == JSON_OUT_OF_MEMORY || lex == &failed_oom) |
2408 | 0 | { |
2409 | | /* Short circuit. Allocating anything for this case is unhelpful. */ |
2410 | 0 | return _("out of memory"); |
2411 | 0 | } |
2412 | | |
2413 | 0 | if (lex->errormsg) |
2414 | 0 | jsonapi_resetStringInfo(lex->errormsg); |
2415 | 0 | else |
2416 | 0 | lex->errormsg = jsonapi_makeStringInfo(); |
2417 | | |
2418 | | /* |
2419 | | * A helper for error messages that should print the current token. The |
2420 | | * format must contain exactly one %.*s specifier. |
2421 | | */ |
2422 | 0 | #define json_token_error(lex, format) \ |
2423 | 0 | jsonapi_appendStringInfo((lex)->errormsg, _(format), \ |
2424 | 0 | (int) ((lex)->token_terminator - (lex)->token_start), \ |
2425 | 0 | (lex)->token_start); |
2426 | |
2427 | 0 | switch (error) |
2428 | 0 | { |
2429 | 0 | case JSON_INCOMPLETE: |
2430 | 0 | case JSON_SUCCESS: |
2431 | | /* fall through to the error code after switch */ |
2432 | 0 | break; |
2433 | 0 | case JSON_INVALID_LEXER_TYPE: |
2434 | 0 | if (lex->incremental) |
2435 | 0 | return _("Recursive descent parser cannot use incremental lexer."); |
2436 | 0 | else |
2437 | 0 | return _("Incremental parser requires incremental lexer."); |
2438 | 0 | case JSON_NESTING_TOO_DEEP: |
2439 | 0 | return (_("JSON nested too deep, maximum permitted depth is 6400.")); |
2440 | 0 | case JSON_ESCAPING_INVALID: |
2441 | 0 | json_token_error(lex, "Escape sequence \"\\%.*s\" is invalid."); |
2442 | 0 | break; |
2443 | 0 | case JSON_ESCAPING_REQUIRED: |
2444 | 0 | jsonapi_appendStringInfo(lex->errormsg, |
2445 | 0 | _("Character with value 0x%02x must be escaped."), |
2446 | 0 | (unsigned char) *(lex->token_terminator)); |
2447 | 0 | break; |
2448 | 0 | case JSON_EXPECTED_END: |
2449 | 0 | json_token_error(lex, "Expected end of input, but found \"%.*s\"."); |
2450 | 0 | break; |
2451 | 0 | case JSON_EXPECTED_ARRAY_FIRST: |
2452 | 0 | json_token_error(lex, "Expected array element or \"]\", but found \"%.*s\"."); |
2453 | 0 | break; |
2454 | 0 | case JSON_EXPECTED_ARRAY_NEXT: |
2455 | 0 | json_token_error(lex, "Expected \",\" or \"]\", but found \"%.*s\"."); |
2456 | 0 | break; |
2457 | 0 | case JSON_EXPECTED_COLON: |
2458 | 0 | json_token_error(lex, "Expected \":\", but found \"%.*s\"."); |
2459 | 0 | break; |
2460 | 0 | case JSON_EXPECTED_JSON: |
2461 | 0 | json_token_error(lex, "Expected JSON value, but found \"%.*s\"."); |
2462 | 0 | break; |
2463 | 0 | case JSON_EXPECTED_MORE: |
2464 | 0 | return _("The input string ended unexpectedly."); |
2465 | 0 | case JSON_EXPECTED_OBJECT_FIRST: |
2466 | 0 | json_token_error(lex, "Expected string or \"}\", but found \"%.*s\"."); |
2467 | 0 | break; |
2468 | 0 | case JSON_EXPECTED_OBJECT_NEXT: |
2469 | 0 | json_token_error(lex, "Expected \",\" or \"}\", but found \"%.*s\"."); |
2470 | 0 | break; |
2471 | 0 | case JSON_EXPECTED_STRING: |
2472 | 0 | json_token_error(lex, "Expected string, but found \"%.*s\"."); |
2473 | 0 | break; |
2474 | 0 | case JSON_INVALID_TOKEN: |
2475 | 0 | json_token_error(lex, "Token \"%.*s\" is invalid."); |
2476 | 0 | break; |
2477 | 0 | case JSON_OUT_OF_MEMORY: |
2478 | | /* should have been handled above; use the error path */ |
2479 | 0 | break; |
2480 | 0 | case JSON_UNICODE_CODE_POINT_ZERO: |
2481 | 0 | return _("\\u0000 cannot be converted to text."); |
2482 | 0 | case JSON_UNICODE_ESCAPE_FORMAT: |
2483 | 0 | return _("\"\\u\" must be followed by four hexadecimal digits."); |
2484 | 0 | case JSON_UNICODE_HIGH_ESCAPE: |
2485 | | /* note: this case is only reachable in frontend not backend */ |
2486 | 0 | return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8."); |
2487 | 0 | case JSON_UNICODE_UNTRANSLATABLE: |
2488 | | |
2489 | | /* |
2490 | | * Note: this case is only reachable in backend and not frontend. |
2491 | | * #ifdef it away so the frontend doesn't try to link against |
2492 | | * backend functionality. |
2493 | | */ |
2494 | 0 | #ifndef FRONTEND |
2495 | 0 | return psprintf(_("Unicode escape value could not be translated to the server's encoding %s."), |
2496 | 0 | GetDatabaseEncodingName()); |
2497 | | #else |
2498 | | Assert(false); |
2499 | | break; |
2500 | | #endif |
2501 | 0 | case JSON_UNICODE_HIGH_SURROGATE: |
2502 | 0 | return _("Unicode high surrogate must not follow a high surrogate."); |
2503 | 0 | case JSON_UNICODE_LOW_SURROGATE: |
2504 | 0 | return _("Unicode low surrogate must follow a high surrogate."); |
2505 | 0 | case JSON_SEM_ACTION_FAILED: |
2506 | | /* fall through to the error code after switch */ |
2507 | 0 | break; |
2508 | 0 | } |
2509 | 0 | #undef json_token_error |
2510 | | |
2511 | | /* Note that lex->errormsg can be NULL in shlib code. */ |
2512 | 0 | if (lex->errormsg && lex->errormsg->len == 0) |
2513 | 0 | { |
2514 | | /* |
2515 | | * We don't use a default: case, so that the compiler will warn about |
2516 | | * unhandled enum values. But this needs to be here anyway to cover |
2517 | | * the possibility of an incorrect input. |
2518 | | */ |
2519 | 0 | jsonapi_appendStringInfo(lex->errormsg, |
2520 | 0 | "unexpected json parse error type: %d", |
2521 | 0 | (int) error); |
2522 | 0 | } |
2523 | |
2524 | | #ifdef JSONAPI_USE_PQEXPBUFFER |
2525 | | if (PQExpBufferBroken(lex->errormsg)) |
2526 | | return _("out of memory while constructing error description"); |
2527 | | #endif |
2528 | |
2529 | 0 | return lex->errormsg->data; |
2530 | 0 | } |
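
A minimal sketch of how json_errdetail is meant to be consumed, assuming a frontend-style caller and the same public API as in the earlier sketches (the demo_ wrapper is hypothetical); per the contract above, the returned message must not be freed by the caller, since it is either static or owned by the JsonLexContext:

    #include <stdio.h>

    static void
    demo_report(const char *json)
    {
        JsonLexContext *lex = makeJsonLexContextCstringLen(NULL, json,
                                                           strlen(json),
                                                           PG_UTF8, true);
        JsonParseErrorType res = pg_parse_json(lex, &nullSemAction);

        if (res != JSON_SUCCESS)
            fprintf(stderr, "invalid JSON: %s\n", json_errdetail(res, lex));
        freeJsonLexContext(lex);    /* also releases the errormsg buffer */
    }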