/src/json-glib/json-glib/json-scanner.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* json-scanner.c: Tokenizer for JSON |
2 | | * Copyright (C) 2008 OpenedHand |
3 | | * |
4 | | * Based on JsonScanner: Flexible lexical scanner for general purpose. |
5 | | * Copyright (C) 1997, 1998 Tim Janik |
6 | | * |
7 | | * Modified by Emmanuele Bassi <ebassi@openedhand.com> |
8 | | * |
9 | | * This library is free software; you can redistribute it and/or |
10 | | * modify it under the terms of the GNU Lesser General Public |
11 | | * License as published by the Free Software Foundation; either |
12 | | * version 2 of the License, or (at your option) any later version. |
13 | | * |
14 | | * This library is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | | * Lesser General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU Lesser General Public |
20 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
21 | | */ |
22 | | |
23 | | #include "config.h" |
24 | | |
25 | | #include <errno.h> |
26 | | #include <stdlib.h> |
27 | | #include <stdarg.h> |
28 | | #include <string.h> |
29 | | #include <stdio.h> |
30 | | #ifdef HAVE_UNISTD_H |
31 | | #include <unistd.h> |
32 | | #endif |
33 | | |
34 | | #include <glib.h> |
35 | | #include <glib/gprintf.h> |
36 | | |
37 | | #include "json-scanner.h" |
38 | | |
39 | | #ifdef G_OS_WIN32 |
40 | | #include <io.h> /* For _read() */ |
41 | | #endif |
42 | | |
43 | | enum { |
44 | | JSON_ERR_MALFORMED_SURROGATE_PAIR = G_TOKEN_LAST + 1, |
45 | | }; |
46 | | |
47 | | struct _JsonScannerConfig |
48 | | { |
49 | | /* Character sets |
50 | | */ |
51 | | gchar *cset_skip_characters; /* default: " \t\n" */ |
52 | | gchar *cset_identifier_first; |
53 | | gchar *cset_identifier_nth; |
54 | | gchar *cpair_comment_single; /* default: "#\n" */ |
55 | | |
56 | | /* Should symbol lookup work case sensitive? */ |
57 | | guint case_sensitive : 1; |
58 | | |
59 | | /* Boolean values to be adjusted "on the fly" |
60 | | * to configure scanning behaviour. |
61 | | */ |
62 | | guint skip_comment_multi : 1; /* C like comment */ |
63 | | guint skip_comment_single : 1; /* single line comment */ |
64 | | guint scan_comment_multi : 1; /* scan multi line comments? */ |
65 | | guint scan_identifier : 1; |
66 | | guint scan_identifier_1char : 1; |
67 | | guint scan_identifier_NULL : 1; |
68 | | guint scan_symbols : 1; |
69 | | guint scan_binary : 1; |
70 | | guint scan_octal : 1; |
71 | | guint scan_float : 1; |
72 | | guint scan_hex : 1; /* `0x0ff0' */ |
73 | | guint scan_hex_dollar : 1; /* `$0ff0' */ |
74 | | guint scan_string_sq : 1; /* string: 'anything' */ |
75 | | guint scan_string_dq : 1; /* string: "\\-escapes!\n" */ |
76 | | guint numbers_2_int : 1; /* bin, octal, hex => int */ |
77 | | guint int_2_float : 1; /* int => G_TOKEN_FLOAT? */ |
78 | | guint identifier_2_string : 1; |
79 | | guint char_2_token : 1; /* return G_TOKEN_CHAR? */ |
80 | | guint symbol_2_token : 1; |
81 | | guint scope_0_fallback : 1; /* try scope 0 on lookups? */ |
82 | | guint store_int64 : 1; /* use value.v_int64 rather than v_int */ |
83 | | guint padding_dummy; |
84 | | }; |
85 | | |
86 | | static JsonScannerConfig json_scanner_config_template = |
87 | | { |
88 | | .cset_skip_characters = ( " \t\r\n" ), |
89 | | .cset_identifier_first = ( |
90 | | "_" |
91 | | G_CSET_a_2_z |
92 | | G_CSET_A_2_Z |
93 | | ), |
94 | | .cset_identifier_nth = ( |
95 | | G_CSET_DIGITS |
96 | | "-_" |
97 | | G_CSET_a_2_z |
98 | | G_CSET_A_2_Z |
99 | | ), |
100 | | .cpair_comment_single = ( "//\n" ), |
101 | | .case_sensitive = TRUE, |
102 | | .skip_comment_multi = TRUE, |
103 | | .skip_comment_single = TRUE, |
104 | | .scan_comment_multi = FALSE, |
105 | | .scan_identifier = TRUE, |
106 | | .scan_identifier_1char = TRUE, |
107 | | .scan_identifier_NULL = FALSE, |
108 | | .scan_symbols = TRUE, |
109 | | .scan_binary = TRUE, |
110 | | .scan_octal = TRUE, |
111 | | .scan_float = TRUE, |
112 | | .scan_hex = TRUE, |
113 | | .scan_hex_dollar = TRUE, |
114 | | .scan_string_sq = TRUE, |
115 | | .scan_string_dq = TRUE, |
116 | | .numbers_2_int = TRUE, |
117 | | .int_2_float = FALSE, |
118 | | .identifier_2_string = FALSE, |
119 | | .char_2_token = TRUE, |
120 | | .symbol_2_token = TRUE, |
121 | | .scope_0_fallback = FALSE, |
122 | | .store_int64 = TRUE, |
123 | | .padding_dummy = 0, |
124 | | }; |
125 | | |
/* --- defines --- */

/* Lower-case a single byte: ORs in the case offset when the byte lies in
 * 'A'..'Z', 192..214 or 216..222, and leaves every other byte unchanged.
 * The argument is evaluated several times, so it must be free of side
 * effects.
 */
#define to_lower(c) ( \
  (guchar) ( \
    ((((guchar) (c)) >= 'A' && ((guchar) (c)) <= 'Z') ? ('a' - 'A') : 0) | \
    ((((guchar) (c)) >= 192 && ((guchar) (c)) <= 214) ? (224 - 192) : 0) | \
    ((((guchar) (c)) >= 216 && ((guchar) (c)) <= 222) ? (248 - 216) : 0) | \
    ((guchar) (c)) \
  ) \
)

#define READ_BUFFER_SIZE (4000)
137 | | |
138 | | /* --- typedefs --- */ |
139 | | typedef struct _JsonScannerKey JsonScannerKey; |
140 | | |
141 | | struct _JsonScannerKey |
142 | | { |
143 | | guint scope_id; |
144 | | gchar *symbol; |
145 | | gpointer value; |
146 | | }; |
147 | | |
148 | | /* --- prototypes --- */ |
149 | | static gboolean json_scanner_key_equal (gconstpointer v1, |
150 | | gconstpointer v2); |
151 | | static guint json_scanner_key_hash (gconstpointer v); |
152 | | |
153 | | static inline |
154 | | JsonScannerKey *json_scanner_lookup_internal (JsonScanner *scanner, |
155 | | guint scope_id, |
156 | | const gchar *symbol); |
157 | | static void json_scanner_get_token_ll (JsonScanner *scanner, |
158 | | GTokenType *token_p, |
159 | | GTokenValue *value_p, |
160 | | guint *line_p, |
161 | | guint *position_p); |
162 | | static void json_scanner_get_token_i (JsonScanner *scanner, |
163 | | GTokenType *token_p, |
164 | | GTokenValue *value_p, |
165 | | guint *line_p, |
166 | | guint *position_p); |
167 | | |
168 | | static guchar json_scanner_peek_next_char (JsonScanner *scanner); |
169 | | static guchar json_scanner_get_char (JsonScanner *scanner, |
170 | | guint *line_p, |
171 | | guint *position_p); |
172 | | static gunichar json_scanner_get_unichar (JsonScanner *scanner, |
173 | | guint *line_p, |
174 | | guint *position_p); |
175 | | |
176 | | /* --- functions --- */ |
177 | | static inline gint |
178 | | json_scanner_char_2_num (guchar c, |
179 | | guchar base) |
180 | 0 | { |
181 | 0 | if (c >= '0' && c <= '9') |
182 | 0 | c -= '0'; |
183 | 0 | else if (c >= 'A' && c <= 'Z') |
184 | 0 | c -= 'A' - 10; |
185 | 0 | else if (c >= 'a' && c <= 'z') |
186 | 0 | c -= 'a' - 10; |
187 | 0 | else |
188 | 0 | return -1; |
189 | | |
190 | 0 | if (c < base) |
191 | 0 | return c; |
192 | | |
193 | 0 | return -1; |
194 | 0 | } |
195 | | |
196 | | JsonScanner * |
197 | | json_scanner_new (void) |
198 | 0 | { |
199 | 0 | JsonScanner *scanner; |
200 | 0 | JsonScannerConfig *config_templ; |
201 | | |
202 | 0 | config_templ = &json_scanner_config_template; |
203 | | |
204 | 0 | scanner = g_new0 (JsonScanner, 1); |
205 | | |
206 | 0 | scanner->user_data = NULL; |
207 | 0 | scanner->max_parse_errors = 1; |
208 | 0 | scanner->parse_errors = 0; |
209 | 0 | scanner->input_name = NULL; |
210 | 0 | g_datalist_init (&scanner->qdata); |
211 | | |
212 | 0 | scanner->config = g_new0 (JsonScannerConfig, 1); |
213 | | |
214 | 0 | scanner->config->case_sensitive = config_templ->case_sensitive; |
215 | 0 | scanner->config->cset_skip_characters = config_templ->cset_skip_characters; |
216 | 0 | if (!scanner->config->cset_skip_characters) |
217 | 0 | scanner->config->cset_skip_characters = ""; |
218 | 0 | scanner->config->cset_identifier_first = config_templ->cset_identifier_first; |
219 | 0 | scanner->config->cset_identifier_nth = config_templ->cset_identifier_nth; |
220 | 0 | scanner->config->cpair_comment_single = config_templ->cpair_comment_single; |
221 | 0 | scanner->config->skip_comment_multi = config_templ->skip_comment_multi; |
222 | 0 | scanner->config->skip_comment_single = config_templ->skip_comment_single; |
223 | 0 | scanner->config->scan_comment_multi = config_templ->scan_comment_multi; |
224 | 0 | scanner->config->scan_identifier = config_templ->scan_identifier; |
225 | 0 | scanner->config->scan_identifier_1char = config_templ->scan_identifier_1char; |
226 | 0 | scanner->config->scan_identifier_NULL = config_templ->scan_identifier_NULL; |
227 | 0 | scanner->config->scan_symbols = config_templ->scan_symbols; |
228 | 0 | scanner->config->scan_binary = config_templ->scan_binary; |
229 | 0 | scanner->config->scan_octal = config_templ->scan_octal; |
230 | 0 | scanner->config->scan_float = config_templ->scan_float; |
231 | 0 | scanner->config->scan_hex = config_templ->scan_hex; |
232 | 0 | scanner->config->scan_hex_dollar = config_templ->scan_hex_dollar; |
233 | 0 | scanner->config->scan_string_sq = config_templ->scan_string_sq; |
234 | 0 | scanner->config->scan_string_dq = config_templ->scan_string_dq; |
235 | 0 | scanner->config->numbers_2_int = config_templ->numbers_2_int; |
236 | 0 | scanner->config->int_2_float = config_templ->int_2_float; |
237 | 0 | scanner->config->identifier_2_string = config_templ->identifier_2_string; |
238 | 0 | scanner->config->char_2_token = config_templ->char_2_token; |
239 | 0 | scanner->config->symbol_2_token = config_templ->symbol_2_token; |
240 | 0 | scanner->config->scope_0_fallback = config_templ->scope_0_fallback; |
241 | 0 | scanner->config->store_int64 = config_templ->store_int64; |
242 | | |
243 | 0 | scanner->token = G_TOKEN_NONE; |
244 | 0 | scanner->value.v_int64 = 0; |
245 | 0 | scanner->line = 1; |
246 | 0 | scanner->position = 0; |
247 | | |
248 | 0 | scanner->next_token = G_TOKEN_NONE; |
249 | 0 | scanner->next_value.v_int64 = 0; |
250 | 0 | scanner->next_line = 1; |
251 | 0 | scanner->next_position = 0; |
252 | | |
253 | 0 | scanner->symbol_table = g_hash_table_new (json_scanner_key_hash, |
254 | 0 | json_scanner_key_equal); |
255 | 0 | scanner->text = NULL; |
256 | 0 | scanner->text_end = NULL; |
257 | 0 | scanner->buffer = NULL; |
258 | 0 | scanner->scope_id = 0; |
259 | | |
260 | 0 | return scanner; |
261 | 0 | } |
262 | | |
263 | | static inline void |
264 | | json_scanner_free_value (GTokenType *token_p, |
265 | | GTokenValue *value_p) |
266 | 0 | { |
267 | 0 | switch (*token_p) |
268 | 0 | { |
269 | 0 | case G_TOKEN_STRING: |
270 | 0 | case G_TOKEN_IDENTIFIER: |
271 | 0 | case G_TOKEN_IDENTIFIER_NULL: |
272 | 0 | case G_TOKEN_COMMENT_SINGLE: |
273 | 0 | case G_TOKEN_COMMENT_MULTI: |
274 | 0 | g_free (value_p->v_string); |
275 | 0 | break; |
276 | | |
277 | 0 | default: |
278 | 0 | break; |
279 | 0 | } |
280 | | |
281 | 0 | *token_p = G_TOKEN_NONE; |
282 | 0 | } |
283 | | |
284 | | static void |
285 | | json_scanner_destroy_symbol_table_entry (gpointer _key, |
286 | | gpointer _value G_GNUC_UNUSED, |
287 | | gpointer _data G_GNUC_UNUSED) |
288 | 0 | { |
289 | 0 | JsonScannerKey *key = _key; |
290 | | |
291 | 0 | g_free (key->symbol); |
292 | 0 | g_slice_free (JsonScannerKey, key); |
293 | 0 | } |
294 | | |
295 | | void |
296 | | json_scanner_destroy (JsonScanner *scanner) |
297 | 0 | { |
298 | 0 | g_return_if_fail (scanner != NULL); |
299 | | |
300 | 0 | g_datalist_clear (&scanner->qdata); |
301 | 0 | g_hash_table_foreach (scanner->symbol_table, |
302 | 0 | json_scanner_destroy_symbol_table_entry, |
303 | 0 | NULL); |
304 | 0 | g_hash_table_destroy (scanner->symbol_table); |
305 | 0 | json_scanner_free_value (&scanner->token, &scanner->value); |
306 | 0 | json_scanner_free_value (&scanner->next_token, &scanner->next_value); |
307 | 0 | g_free (scanner->config); |
308 | 0 | g_free (scanner->buffer); |
309 | 0 | g_free (scanner); |
310 | 0 | } |
311 | | |
312 | | void |
313 | | json_scanner_error (JsonScanner *scanner, |
314 | | const gchar *format, |
315 | | ...) |
316 | 0 | { |
317 | 0 | g_return_if_fail (scanner != NULL); |
318 | 0 | g_return_if_fail (format != NULL); |
319 | | |
320 | 0 | scanner->parse_errors++; |
321 | | |
322 | 0 | if (scanner->msg_handler) |
323 | 0 | { |
324 | 0 | va_list args; |
325 | 0 | gchar *string; |
326 | | |
327 | 0 | va_start (args, format); |
328 | 0 | string = g_strdup_vprintf (format, args); |
329 | 0 | va_end (args); |
330 | | |
331 | 0 | scanner->msg_handler (scanner, string); |
332 | | |
333 | 0 | g_free (string); |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | | static gboolean |
338 | | json_scanner_key_equal (gconstpointer v1, |
339 | | gconstpointer v2) |
340 | 0 | { |
341 | 0 | const JsonScannerKey *key1 = v1; |
342 | 0 | const JsonScannerKey *key2 = v2; |
343 | | |
344 | 0 | return (key1->scope_id == key2->scope_id) && |
345 | 0 | (strcmp (key1->symbol, key2->symbol) == 0); |
346 | 0 | } |
347 | | |
348 | | static guint |
349 | | json_scanner_key_hash (gconstpointer v) |
350 | 0 | { |
351 | 0 | const JsonScannerKey *key = v; |
352 | 0 | gchar *c; |
353 | 0 | guint h; |
354 | | |
355 | 0 | h = key->scope_id; |
356 | 0 | for (c = key->symbol; *c; c++) |
357 | 0 | h = (h << 5) - h + *c; |
358 | | |
359 | 0 | return h; |
360 | 0 | } |
361 | | |
362 | | static inline JsonScannerKey * |
363 | | json_scanner_lookup_internal (JsonScanner *scanner, |
364 | | guint scope_id, |
365 | | const gchar *symbol) |
366 | 0 | { |
367 | 0 | JsonScannerKey *key_p; |
368 | 0 | JsonScannerKey key; |
369 | | |
370 | 0 | key.scope_id = scope_id; |
371 | | |
372 | 0 | if (!scanner->config->case_sensitive) |
373 | 0 | { |
374 | 0 | gchar *d; |
375 | 0 | const gchar *c; |
376 | | |
377 | 0 | key.symbol = g_new (gchar, strlen (symbol) + 1); |
378 | 0 | for (d = key.symbol, c = symbol; *c; c++, d++) |
379 | 0 | *d = to_lower (*c); |
380 | 0 | *d = 0; |
381 | 0 | key_p = g_hash_table_lookup (scanner->symbol_table, &key); |
382 | 0 | g_free (key.symbol); |
383 | 0 | } |
384 | 0 | else |
385 | 0 | { |
386 | 0 | key.symbol = (gchar*) symbol; |
387 | 0 | key_p = g_hash_table_lookup (scanner->symbol_table, &key); |
388 | 0 | } |
389 | | |
390 | 0 | return key_p; |
391 | 0 | } |
392 | | |
393 | | void |
394 | | json_scanner_scope_add_symbol (JsonScanner *scanner, |
395 | | guint scope_id, |
396 | | const gchar *symbol, |
397 | | gpointer value) |
398 | 0 | { |
399 | 0 | JsonScannerKey *key; |
400 | |
|
401 | 0 | g_return_if_fail (scanner != NULL); |
402 | 0 | g_return_if_fail (symbol != NULL); |
403 | | |
404 | 0 | key = json_scanner_lookup_internal (scanner, scope_id, symbol); |
405 | 0 | if (!key) |
406 | 0 | { |
407 | 0 | key = g_slice_new (JsonScannerKey); |
408 | 0 | key->scope_id = scope_id; |
409 | 0 | key->symbol = g_strdup (symbol); |
410 | 0 | key->value = value; |
411 | 0 | if (!scanner->config->case_sensitive) |
412 | 0 | { |
413 | 0 | gchar *c; |
414 | |
|
415 | 0 | c = key->symbol; |
416 | 0 | while (*c != 0) |
417 | 0 | { |
418 | 0 | *c = to_lower (*c); |
419 | 0 | c++; |
420 | 0 | } |
421 | 0 | } |
422 | |
|
423 | 0 | g_hash_table_insert (scanner->symbol_table, key, key); |
424 | 0 | } |
425 | 0 | else |
426 | 0 | key->value = value; |
427 | 0 | } |
428 | | |
429 | | GTokenType |
430 | | json_scanner_peek_next_token (JsonScanner *scanner) |
431 | 0 | { |
432 | 0 | g_return_val_if_fail (scanner != NULL, G_TOKEN_EOF); |
433 | | |
434 | 0 | if (scanner->next_token == G_TOKEN_NONE) |
435 | 0 | { |
436 | 0 | scanner->next_line = scanner->line; |
437 | 0 | scanner->next_position = scanner->position; |
438 | 0 | json_scanner_get_token_i (scanner, |
439 | 0 | &scanner->next_token, |
440 | 0 | &scanner->next_value, |
441 | 0 | &scanner->next_line, |
442 | 0 | &scanner->next_position); |
443 | 0 | } |
444 | |
|
445 | 0 | return scanner->next_token; |
446 | 0 | } |
447 | | |
448 | | GTokenType |
449 | | json_scanner_get_next_token (JsonScanner *scanner) |
450 | 0 | { |
451 | 0 | g_return_val_if_fail (scanner != NULL, G_TOKEN_EOF); |
452 | | |
453 | 0 | if (scanner->next_token != G_TOKEN_NONE) |
454 | 0 | { |
455 | 0 | json_scanner_free_value (&scanner->token, &scanner->value); |
456 | |
|
457 | 0 | scanner->token = scanner->next_token; |
458 | 0 | scanner->value = scanner->next_value; |
459 | 0 | scanner->line = scanner->next_line; |
460 | 0 | scanner->position = scanner->next_position; |
461 | 0 | scanner->next_token = G_TOKEN_NONE; |
462 | 0 | } |
463 | 0 | else |
464 | 0 | json_scanner_get_token_i (scanner, |
465 | 0 | &scanner->token, |
466 | 0 | &scanner->value, |
467 | 0 | &scanner->line, |
468 | 0 | &scanner->position); |
469 | |
|
470 | 0 | return scanner->token; |
471 | 0 | } |
472 | | |
473 | | void |
474 | | json_scanner_input_text (JsonScanner *scanner, |
475 | | const gchar *text, |
476 | | guint text_len) |
477 | 0 | { |
478 | 0 | g_return_if_fail (scanner != NULL); |
479 | 0 | if (text_len) |
480 | 0 | g_return_if_fail (text != NULL); |
481 | 0 | else |
482 | 0 | text = NULL; |
483 | | |
484 | 0 | scanner->token = G_TOKEN_NONE; |
485 | 0 | scanner->value.v_int64 = 0; |
486 | 0 | scanner->line = 1; |
487 | 0 | scanner->position = 0; |
488 | 0 | scanner->next_token = G_TOKEN_NONE; |
489 | |
|
490 | 0 | scanner->text = text; |
491 | 0 | scanner->text_end = text + text_len; |
492 | |
|
493 | 0 | if (scanner->buffer) |
494 | 0 | { |
495 | 0 | g_free (scanner->buffer); |
496 | 0 | scanner->buffer = NULL; |
497 | 0 | } |
498 | 0 | } |
499 | | |
500 | | static guchar |
501 | | json_scanner_peek_next_char (JsonScanner *scanner) |
502 | 0 | { |
503 | 0 | if (scanner->text < scanner->text_end) |
504 | 0 | return *scanner->text; |
505 | 0 | else |
506 | 0 | return 0; |
507 | 0 | } |
508 | | |
509 | | static guchar |
510 | | json_scanner_get_char (JsonScanner *scanner, |
511 | | guint *line_p, |
512 | | guint *position_p) |
513 | 0 | { |
514 | 0 | guchar fchar; |
515 | |
|
516 | 0 | if (scanner->text < scanner->text_end) |
517 | 0 | fchar = *(scanner->text++); |
518 | 0 | else |
519 | 0 | fchar = 0; |
520 | | |
521 | 0 | if (fchar == '\n') |
522 | 0 | { |
523 | 0 | (*position_p) = 0; |
524 | 0 | (*line_p)++; |
525 | 0 | } |
526 | 0 | else if (fchar) |
527 | 0 | { |
528 | 0 | (*position_p)++; |
529 | 0 | } |
530 | | |
531 | 0 | return fchar; |
532 | 0 | } |
533 | | |
/* Non-zero when (c) is one of 0-9, A-F or a-f.  The argument is evaluated
 * more than once.
 */
#define is_hex_digit(c)         (('0' <= (c) && (c) <= '9') || \
                                 ('A' <= (c) && (c) <= 'F') || \
                                 ('a' <= (c) && (c) <= 'f'))

/* Numeric value of a hexadecimal digit character; only meaningful when
 * is_hex_digit (c) holds.  For letters the low three bits are 1..6, so
 * adding 9 gives 10..15 for either case.
 */
#define to_hex_digit(c)         (((c) > '9') ? (((c) & 7) + 9) : ((c) - '0'))
538 | | |
539 | | static gunichar |
540 | | json_scanner_get_unichar (JsonScanner *scanner, |
541 | | guint *line_p, |
542 | | guint *position_p) |
543 | 0 | { |
544 | 0 | gunichar uchar; |
545 | 0 | gchar ch; |
546 | 0 | gint i; |
547 | |
|
548 | 0 | uchar = 0; |
549 | 0 | for (i = 0; i < 4; i++) |
550 | 0 | { |
551 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
552 | |
|
553 | 0 | if (is_hex_digit (ch)) |
554 | 0 | uchar += ((gunichar) to_hex_digit (ch) << ((3 - i) * 4)); |
555 | 0 | else |
556 | 0 | break; |
557 | 0 | } |
558 | |
|
559 | 0 | g_assert (g_unichar_validate (uchar) || g_unichar_type (uchar) == G_UNICODE_SURROGATE); |
560 | | |
561 | 0 | return uchar; |
562 | 0 | } |
563 | | |
564 | | /* |
565 | | * decode_utf16_surrogate_pair: |
566 | | * @units: (array length=2): a pair of UTF-16 code points |
567 | | * |
568 | | * Decodes a surrogate pair of UTF-16 code points into the equivalent |
569 | | * Unicode code point. |
570 | | * |
571 | | * Returns: the Unicode code point equivalent to the surrogate pair |
572 | | */ |
573 | | static inline gunichar |
574 | | decode_utf16_surrogate_pair (const gunichar units[2]) |
575 | 0 | { |
576 | 0 | gunichar ucs; |
577 | |
|
578 | 0 | g_assert (0xd800 <= units[0] && units[0] <= 0xdbff); |
579 | 0 | g_assert (0xdc00 <= units[1] && units[1] <= 0xdfff); |
580 | | |
581 | 0 | ucs = 0x10000; |
582 | 0 | ucs += (units[0] & 0x3ff) << 10; |
583 | 0 | ucs += (units[1] & 0x3ff); |
584 | |
|
585 | 0 | return ucs; |
586 | 0 | } |
587 | | |
588 | | void |
589 | | json_scanner_unexp_token (JsonScanner *scanner, |
590 | | GTokenType expected_token, |
591 | | const gchar *identifier_spec, |
592 | | const gchar *symbol_spec, |
593 | | const gchar *symbol_name, |
594 | | const gchar *message) |
595 | 0 | { |
596 | 0 | gchar *token_string; |
597 | 0 | guint token_string_len; |
598 | 0 | gchar *expected_string; |
599 | 0 | guint expected_string_len; |
600 | 0 | gchar *message_prefix; |
601 | 0 | gboolean print_unexp; |
602 | | |
603 | 0 | g_return_if_fail (scanner != NULL); |
604 | | |
605 | 0 | if (!identifier_spec) |
606 | 0 | identifier_spec = "identifier"; |
607 | 0 | if (!symbol_spec) |
608 | 0 | symbol_spec = "symbol"; |
609 | | |
610 | 0 | token_string_len = 56; |
611 | 0 | token_string = g_new (gchar, token_string_len + 1); |
612 | 0 | expected_string_len = 64; |
613 | 0 | expected_string = g_new (gchar, expected_string_len + 1); |
614 | 0 | print_unexp = TRUE; |
615 | | |
616 | 0 | switch (scanner->token) |
617 | 0 | { |
618 | 0 | case G_TOKEN_EOF: |
619 | 0 | g_snprintf (token_string, token_string_len, "end of file"); |
620 | 0 | break; |
621 | | |
622 | 0 | default: |
623 | 0 | if (scanner->token >= 1 && scanner->token <= 255) |
624 | 0 | { |
625 | 0 | if ((scanner->token >= ' ' && scanner->token <= '~') || |
626 | 0 | strchr (scanner->config->cset_identifier_first, scanner->token) || |
627 | 0 | strchr (scanner->config->cset_identifier_nth, scanner->token)) |
628 | 0 | g_snprintf (token_string, token_string_len, "character `%c'", scanner->token); |
629 | 0 | else |
630 | 0 | g_snprintf (token_string, token_string_len, "character `\\%o'", scanner->token); |
631 | 0 | break; |
632 | 0 | } |
633 | 0 | else if (!scanner->config->symbol_2_token) |
634 | 0 | { |
635 | 0 | g_snprintf (token_string, token_string_len, "(unknown) token <%d>", scanner->token); |
636 | 0 | break; |
637 | 0 | } |
638 | | /* fall through */ |
639 | 0 | case G_TOKEN_SYMBOL: |
640 | 0 | if (expected_token == G_TOKEN_SYMBOL || |
641 | 0 | (scanner->config->symbol_2_token && |
642 | 0 | expected_token > G_TOKEN_LAST)) |
643 | 0 | print_unexp = FALSE; |
644 | 0 | if (symbol_name) |
645 | 0 | g_snprintf (token_string, token_string_len, |
646 | 0 | "%s%s `%s'", |
647 | 0 | print_unexp ? "" : "invalid ", |
648 | 0 | symbol_spec, |
649 | 0 | symbol_name); |
650 | 0 | else |
651 | 0 | g_snprintf (token_string, token_string_len, |
652 | 0 | "%s%s", |
653 | 0 | print_unexp ? "" : "invalid ", |
654 | 0 | symbol_spec); |
655 | 0 | break; |
656 | | |
657 | 0 | case G_TOKEN_ERROR: |
658 | 0 | print_unexp = FALSE; |
659 | 0 | expected_token = G_TOKEN_NONE; |
660 | 0 | switch (scanner->value.v_error) |
661 | 0 | { |
662 | 0 | case G_ERR_UNEXP_EOF: |
663 | 0 | g_snprintf (token_string, token_string_len, "scanner: unexpected end of file"); |
664 | 0 | break; |
665 | | |
666 | 0 | case G_ERR_UNEXP_EOF_IN_STRING: |
667 | 0 | g_snprintf (token_string, token_string_len, "scanner: unterminated string constant"); |
668 | 0 | break; |
669 | | |
670 | 0 | case G_ERR_UNEXP_EOF_IN_COMMENT: |
671 | 0 | g_snprintf (token_string, token_string_len, "scanner: unterminated comment"); |
672 | 0 | break; |
673 | | |
674 | 0 | case G_ERR_NON_DIGIT_IN_CONST: |
675 | 0 | g_snprintf (token_string, token_string_len, "scanner: non digit in constant"); |
676 | 0 | break; |
677 | | |
678 | 0 | case G_ERR_FLOAT_RADIX: |
679 | 0 | g_snprintf (token_string, token_string_len, "scanner: invalid radix for floating constant"); |
680 | 0 | break; |
681 | | |
682 | 0 | case G_ERR_FLOAT_MALFORMED: |
683 | 0 | g_snprintf (token_string, token_string_len, "scanner: malformed floating constant"); |
684 | 0 | break; |
685 | | |
686 | 0 | case G_ERR_DIGIT_RADIX: |
687 | 0 | g_snprintf (token_string, token_string_len, "scanner: digit is beyond radix"); |
688 | 0 | break; |
689 | | |
690 | 0 | case JSON_ERR_MALFORMED_SURROGATE_PAIR: |
691 | 0 | g_snprintf (token_string, token_string_len, "scanner: malformed surrogate pair"); |
692 | 0 | break; |
693 | | |
694 | 0 | case G_ERR_UNKNOWN: |
695 | 0 | default: |
696 | 0 | g_snprintf (token_string, token_string_len, "scanner: unknown error"); |
697 | 0 | break; |
698 | 0 | } |
699 | 0 | break; |
700 | | |
701 | 0 | case G_TOKEN_CHAR: |
702 | 0 | g_snprintf (token_string, token_string_len, "character `%c'", scanner->value.v_char); |
703 | 0 | break; |
704 | | |
705 | 0 | case G_TOKEN_IDENTIFIER: |
706 | 0 | case G_TOKEN_IDENTIFIER_NULL: |
707 | 0 | if (expected_token == G_TOKEN_IDENTIFIER || |
708 | 0 | expected_token == G_TOKEN_IDENTIFIER_NULL) |
709 | 0 | print_unexp = FALSE; |
710 | 0 | g_snprintf (token_string, token_string_len, |
711 | 0 | "%s%s `%s'", |
712 | 0 | print_unexp ? "" : "invalid ", |
713 | 0 | identifier_spec, |
714 | 0 | scanner->token == G_TOKEN_IDENTIFIER ? scanner->value.v_string : "null"); |
715 | 0 | break; |
716 | | |
717 | 0 | case G_TOKEN_BINARY: |
718 | 0 | case G_TOKEN_OCTAL: |
719 | 0 | case G_TOKEN_INT: |
720 | 0 | case G_TOKEN_HEX: |
721 | 0 | if (scanner->config->store_int64) |
722 | 0 | g_snprintf (token_string, token_string_len, "number `%" G_GUINT64_FORMAT "'", scanner->value.v_int64); |
723 | 0 | else |
724 | 0 | g_snprintf (token_string, token_string_len, "number `%lu'", scanner->value.v_int); |
725 | 0 | break; |
726 | | |
727 | 0 | case G_TOKEN_FLOAT: |
728 | 0 | g_snprintf (token_string, token_string_len, "number `%.3f'", scanner->value.v_float); |
729 | 0 | break; |
730 | | |
731 | 0 | case G_TOKEN_STRING: |
732 | 0 | if (expected_token == G_TOKEN_STRING) |
733 | 0 | print_unexp = FALSE; |
734 | 0 | g_snprintf (token_string, token_string_len, |
735 | 0 | "%s%sstring constant \"%s\"", |
736 | 0 | print_unexp ? "" : "invalid ", |
737 | 0 | scanner->value.v_string[0] == 0 ? "empty " : "", |
738 | 0 | scanner->value.v_string); |
739 | 0 | token_string[token_string_len - 2] = '"'; |
740 | 0 | token_string[token_string_len - 1] = 0; |
741 | 0 | break; |
742 | | |
743 | 0 | case G_TOKEN_COMMENT_SINGLE: |
744 | 0 | case G_TOKEN_COMMENT_MULTI: |
745 | 0 | g_snprintf (token_string, token_string_len, "comment"); |
746 | 0 | break; |
747 | | |
748 | 0 | case G_TOKEN_NONE: |
749 | | /* somehow the user's parsing code is screwed, there isn't much |
750 | | * we can do about it. |
751 | | * Note, a common case to trigger this is |
752 | | * json_scanner_peek_next_token(); json_scanner_unexp_token(); |
753 | | * without an intermediate json_scanner_get_next_token(). |
754 | | */ |
755 | 0 | g_assert_not_reached (); |
756 | 0 | break; |
757 | 0 | } |
758 | | |
759 | | |
760 | 0 | switch (expected_token) |
761 | 0 | { |
762 | 0 | gboolean need_valid; |
763 | 0 | gchar *tstring; |
764 | 0 | case G_TOKEN_EOF: |
765 | 0 | g_snprintf (expected_string, expected_string_len, "end of file"); |
766 | 0 | break; |
767 | 0 | default: |
768 | 0 | if (expected_token >= 1 && expected_token <= 255) |
769 | 0 | { |
770 | 0 | if ((expected_token >= ' ' && expected_token <= '~') || |
771 | 0 | strchr (scanner->config->cset_identifier_first, expected_token) || |
772 | 0 | strchr (scanner->config->cset_identifier_nth, expected_token)) |
773 | 0 | g_snprintf (expected_string, expected_string_len, "character `%c'", expected_token); |
774 | 0 | else |
775 | 0 | g_snprintf (expected_string, expected_string_len, "character `\\%o'", expected_token); |
776 | 0 | break; |
777 | 0 | } |
778 | 0 | else if (!scanner->config->symbol_2_token) |
779 | 0 | { |
780 | 0 | g_snprintf (expected_string, expected_string_len, "(unknown) token <%d>", expected_token); |
781 | 0 | break; |
782 | 0 | } |
783 | | /* fall through */ |
784 | 0 | case G_TOKEN_SYMBOL: |
785 | 0 | need_valid = (scanner->token == G_TOKEN_SYMBOL || |
786 | 0 | (scanner->config->symbol_2_token && |
787 | 0 | scanner->token > G_TOKEN_LAST)); |
788 | 0 | g_snprintf (expected_string, expected_string_len, |
789 | 0 | "%s%s", |
790 | 0 | need_valid ? "valid " : "", |
791 | 0 | symbol_spec); |
792 | | /* FIXME: should we attempt to lookup the symbol_name for symbol_2_token? */ |
793 | 0 | break; |
794 | 0 | case G_TOKEN_CHAR: |
795 | 0 | g_snprintf (expected_string, expected_string_len, "%scharacter", |
796 | 0 | scanner->token == G_TOKEN_CHAR ? "valid " : ""); |
797 | 0 | break; |
798 | 0 | case G_TOKEN_BINARY: |
799 | 0 | tstring = "binary"; |
800 | 0 | g_snprintf (expected_string, expected_string_len, "%snumber (%s)", |
801 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
802 | 0 | break; |
803 | 0 | case G_TOKEN_OCTAL: |
804 | 0 | tstring = "octal"; |
805 | 0 | g_snprintf (expected_string, expected_string_len, "%snumber (%s)", |
806 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
807 | 0 | break; |
808 | 0 | case G_TOKEN_INT: |
809 | 0 | tstring = "integer"; |
810 | 0 | g_snprintf (expected_string, expected_string_len, "%snumber (%s)", |
811 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
812 | 0 | break; |
813 | 0 | case G_TOKEN_HEX: |
814 | 0 | tstring = "hexadecimal"; |
815 | 0 | g_snprintf (expected_string, expected_string_len, "%snumber (%s)", |
816 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
817 | 0 | break; |
818 | 0 | case G_TOKEN_FLOAT: |
819 | 0 | tstring = "float"; |
820 | 0 | g_snprintf (expected_string, expected_string_len, "%snumber (%s)", |
821 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
822 | 0 | break; |
823 | 0 | case G_TOKEN_STRING: |
824 | 0 | g_snprintf (expected_string, |
825 | 0 | expected_string_len, |
826 | 0 | "%sstring constant", |
827 | 0 | scanner->token == G_TOKEN_STRING ? "valid " : ""); |
828 | 0 | break; |
829 | 0 | case G_TOKEN_IDENTIFIER: |
830 | 0 | case G_TOKEN_IDENTIFIER_NULL: |
831 | 0 | need_valid = (scanner->token == G_TOKEN_IDENTIFIER_NULL || |
832 | 0 | scanner->token == G_TOKEN_IDENTIFIER); |
833 | 0 | g_snprintf (expected_string, |
834 | 0 | expected_string_len, |
835 | 0 | "%s%s", |
836 | 0 | need_valid ? "valid " : "", |
837 | 0 | identifier_spec); |
838 | 0 | break; |
839 | 0 | case G_TOKEN_COMMENT_SINGLE: |
840 | 0 | tstring = "single-line"; |
841 | 0 | g_snprintf (expected_string, expected_string_len, "%scomment (%s)", |
842 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
843 | 0 | break; |
844 | 0 | case G_TOKEN_COMMENT_MULTI: |
845 | 0 | tstring = "multi-line"; |
846 | 0 | g_snprintf (expected_string, expected_string_len, "%scomment (%s)", |
847 | 0 | scanner->token == expected_token ? "valid " : "", tstring); |
848 | 0 | break; |
849 | 0 | case G_TOKEN_NONE: |
850 | 0 | case G_TOKEN_ERROR: |
851 | | /* this is handled upon printout */ |
852 | 0 | break; |
853 | 0 | } |
854 | | |
855 | 0 | if (message && message[0] != 0) |
856 | 0 | message_prefix = " - "; |
857 | 0 | else |
858 | 0 | { |
859 | 0 | message_prefix = ""; |
860 | 0 | message = ""; |
861 | 0 | } |
862 | 0 | if (expected_token == G_TOKEN_ERROR) |
863 | 0 | { |
864 | 0 | json_scanner_error (scanner, |
865 | 0 | "failure around %s%s%s", |
866 | 0 | token_string, |
867 | 0 | message_prefix, |
868 | 0 | message); |
869 | 0 | } |
870 | 0 | else if (expected_token == G_TOKEN_NONE) |
871 | 0 | { |
872 | 0 | if (print_unexp) |
873 | 0 | json_scanner_error (scanner, |
874 | 0 | "unexpected %s%s%s", |
875 | 0 | token_string, |
876 | 0 | message_prefix, |
877 | 0 | message); |
878 | 0 | else |
879 | 0 | json_scanner_error (scanner, |
880 | 0 | "%s%s%s", |
881 | 0 | token_string, |
882 | 0 | message_prefix, |
883 | 0 | message); |
884 | 0 | } |
885 | 0 | else |
886 | 0 | { |
887 | 0 | if (print_unexp) |
888 | 0 | json_scanner_error (scanner, |
889 | 0 | "unexpected %s, expected %s%s%s", |
890 | 0 | token_string, |
891 | 0 | expected_string, |
892 | 0 | message_prefix, |
893 | 0 | message); |
894 | 0 | else |
895 | 0 | json_scanner_error (scanner, |
896 | 0 | "%s, expected %s%s%s", |
897 | 0 | token_string, |
898 | 0 | expected_string, |
899 | 0 | message_prefix, |
900 | 0 | message); |
901 | 0 | } |
902 | | |
903 | 0 | g_free (token_string); |
904 | 0 | g_free (expected_string); |
905 | 0 | } |
906 | | |
907 | | static void |
908 | | json_scanner_get_token_i (JsonScanner *scanner, |
909 | | GTokenType *token_p, |
910 | | GTokenValue *value_p, |
911 | | guint *line_p, |
912 | | guint *position_p) |
913 | 0 | { |
914 | 0 | do |
915 | 0 | { |
916 | 0 | json_scanner_free_value (token_p, value_p); |
917 | 0 | json_scanner_get_token_ll (scanner, token_p, value_p, line_p, position_p); |
918 | 0 | } |
919 | 0 | while (((*token_p > 0 && *token_p < 256) && |
920 | 0 | strchr (scanner->config->cset_skip_characters, *token_p)) || |
921 | 0 | (*token_p == G_TOKEN_CHAR && |
922 | 0 | strchr (scanner->config->cset_skip_characters, value_p->v_char)) || |
923 | 0 | (*token_p == G_TOKEN_COMMENT_MULTI && |
924 | 0 | scanner->config->skip_comment_multi) || |
925 | 0 | (*token_p == G_TOKEN_COMMENT_SINGLE && |
926 | 0 | scanner->config->skip_comment_single)); |
927 | | |
928 | 0 | switch (*token_p) |
929 | 0 | { |
930 | 0 | case G_TOKEN_IDENTIFIER: |
931 | 0 | if (scanner->config->identifier_2_string) |
932 | 0 | *token_p = G_TOKEN_STRING; |
933 | 0 | break; |
934 | | |
935 | 0 | case G_TOKEN_SYMBOL: |
936 | 0 | if (scanner->config->symbol_2_token) |
937 | 0 | *token_p = GPOINTER_TO_INT (value_p->v_symbol); |
938 | 0 | break; |
939 | | |
940 | 0 | case G_TOKEN_BINARY: |
941 | 0 | case G_TOKEN_OCTAL: |
942 | 0 | case G_TOKEN_HEX: |
943 | 0 | if (scanner->config->numbers_2_int) |
944 | 0 | *token_p = G_TOKEN_INT; |
945 | 0 | break; |
946 | | |
947 | 0 | default: |
948 | 0 | break; |
949 | 0 | } |
950 | | |
951 | 0 | if (*token_p == G_TOKEN_INT && |
952 | 0 | scanner->config->int_2_float) |
953 | 0 | { |
954 | 0 | *token_p = G_TOKEN_FLOAT; |
955 | 0 | if (scanner->config->store_int64) |
956 | 0 | { |
957 | | #ifdef _MSC_VER |
958 | | /* work around error C2520, see gvaluetransform.c */ |
959 | | value_p->v_float = (__int64)value_p->v_int64; |
960 | | #else |
961 | 0 | value_p->v_float = value_p->v_int64; |
962 | 0 | #endif |
963 | 0 | } |
964 | 0 | else |
965 | 0 | value_p->v_float = value_p->v_int; |
966 | 0 | } |
967 | | |
968 | 0 | errno = 0; |
969 | 0 | } |
970 | | |
971 | | static void |
972 | | json_scanner_get_token_ll (JsonScanner *scanner, |
973 | | GTokenType *token_p, |
974 | | GTokenValue *value_p, |
975 | | guint *line_p, |
976 | | guint *position_p) |
977 | 0 | { |
978 | 0 | JsonScannerConfig *config; |
979 | 0 | GTokenType token; |
980 | 0 | gboolean in_comment_multi; |
981 | 0 | gboolean in_comment_single; |
982 | 0 | gboolean in_string_sq; |
983 | 0 | gboolean in_string_dq; |
984 | 0 | GString *gstring; |
985 | 0 | GTokenValue value; |
986 | 0 | guchar ch; |
987 | | |
988 | 0 | config = scanner->config; |
989 | 0 | (*value_p).v_int64 = 0; |
990 | | |
991 | 0 | if (scanner->text >= scanner->text_end || |
992 | 0 | scanner->token == G_TOKEN_EOF) |
993 | 0 | { |
994 | 0 | *token_p = G_TOKEN_EOF; |
995 | 0 | return; |
996 | 0 | } |
997 | | |
998 | 0 | in_comment_multi = FALSE; |
999 | 0 | in_comment_single = FALSE; |
1000 | 0 | in_string_sq = FALSE; |
1001 | 0 | in_string_dq = FALSE; |
1002 | 0 | gstring = NULL; |
1003 | | |
1004 | 0 | do /* while (ch != 0) */ |
1005 | 0 | { |
1006 | 0 | gboolean dotted_float = FALSE; |
1007 | | |
1008 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1009 | | |
1010 | 0 | value.v_int64 = 0; |
1011 | 0 | token = G_TOKEN_NONE; |
1012 | | |
1013 | | /* this is *evil*, but needed ;( |
1014 | | * we first check for identifier first character, because it |
1015 | | * might interfere with other key chars like slashes or numbers |
1016 | | */ |
1017 | 0 | if (config->scan_identifier && |
1018 | 0 | ch && strchr (config->cset_identifier_first, ch)) |
1019 | 0 | goto identifier_precedence; |
1020 | | |
1021 | 0 | switch (ch) |
1022 | 0 | { |
1023 | 0 | case 0: |
1024 | 0 | token = G_TOKEN_EOF; |
1025 | 0 | (*position_p)++; |
1026 | | /* ch = 0; */ |
1027 | 0 | break; |
1028 | | |
1029 | 0 | case '/': |
1030 | 0 | if (!config->scan_comment_multi || |
1031 | 0 | json_scanner_peek_next_char (scanner) != '*') |
1032 | 0 | goto default_case; |
1033 | 0 | json_scanner_get_char (scanner, line_p, position_p); |
1034 | 0 | token = G_TOKEN_COMMENT_MULTI; |
1035 | 0 | in_comment_multi = TRUE; |
1036 | 0 | gstring = g_string_new (NULL); |
1037 | 0 | while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0) |
1038 | 0 | { |
1039 | 0 | if (ch == '*' && json_scanner_peek_next_char (scanner) == '/') |
1040 | 0 | { |
1041 | 0 | json_scanner_get_char (scanner, line_p, position_p); |
1042 | 0 | in_comment_multi = FALSE; |
1043 | 0 | break; |
1044 | 0 | } |
1045 | 0 | else |
1046 | 0 | gstring = g_string_append_c (gstring, ch); |
1047 | 0 | } |
1048 | 0 | ch = 0; |
1049 | 0 | break; |
1050 | | |
1051 | 0 | case '\'': |
1052 | 0 | if (!config->scan_string_sq) |
1053 | 0 | goto default_case; |
1054 | 0 | token = G_TOKEN_STRING; |
1055 | 0 | in_string_sq = TRUE; |
1056 | 0 | gstring = g_string_new (NULL); |
1057 | 0 | while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0) |
1058 | 0 | { |
1059 | 0 | if (ch == '\'') |
1060 | 0 | { |
1061 | 0 | in_string_sq = FALSE; |
1062 | 0 | break; |
1063 | 0 | } |
1064 | 0 | else |
1065 | 0 | gstring = g_string_append_c (gstring, ch); |
1066 | 0 | } |
1067 | 0 | ch = 0; |
1068 | 0 | break; |
1069 | | |
1070 | 0 | case '"': |
1071 | 0 | if (!config->scan_string_dq) |
1072 | 0 | goto default_case; |
1073 | 0 | token = G_TOKEN_STRING; |
1074 | 0 | in_string_dq = TRUE; |
1075 | 0 | gstring = g_string_new (NULL); |
1076 | 0 | while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0) |
1077 | 0 | { |
1078 | 0 | if (ch == '"' || token == G_TOKEN_ERROR) |
1079 | 0 | { |
1080 | 0 | in_string_dq = FALSE; |
1081 | 0 | break; |
1082 | 0 | } |
1083 | 0 | else |
1084 | 0 | { |
1085 | 0 | if (ch == '\\') |
1086 | 0 | { |
1087 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1088 | 0 | switch (ch) |
1089 | 0 | { |
1090 | 0 | guint i; |
1091 | 0 | guint fchar; |
1092 | | |
1093 | 0 | case 0: |
1094 | 0 | break; |
1095 | | |
1096 | 0 | case '\\': |
1097 | 0 | gstring = g_string_append_c (gstring, '\\'); |
1098 | 0 | break; |
1099 | | |
1100 | 0 | case 'n': |
1101 | 0 | gstring = g_string_append_c (gstring, '\n'); |
1102 | 0 | break; |
1103 | | |
1104 | 0 | case 't': |
1105 | 0 | gstring = g_string_append_c (gstring, '\t'); |
1106 | 0 | break; |
1107 | | |
1108 | 0 | case 'r': |
1109 | 0 | gstring = g_string_append_c (gstring, '\r'); |
1110 | 0 | break; |
1111 | | |
1112 | 0 | case 'b': |
1113 | 0 | gstring = g_string_append_c (gstring, '\b'); |
1114 | 0 | break; |
1115 | | |
1116 | 0 | case 'f': |
1117 | 0 | gstring = g_string_append_c (gstring, '\f'); |
1118 | 0 | break; |
1119 | | |
1120 | 0 | case 'u': |
1121 | 0 | fchar = json_scanner_peek_next_char (scanner); |
1122 | 0 | if (is_hex_digit (fchar)) |
1123 | 0 | { |
1124 | 0 | gunichar ucs; |
1125 | |
|
1126 | 0 | ucs = json_scanner_get_unichar (scanner, line_p, position_p); |
1127 | | |
1128 | | /* resolve UTF-16 surrogates for Unicode characters not in the BMP, |
1129 | | * as per ECMA 404, § 9, "String" |
1130 | | */ |
1131 | 0 | if (g_unichar_type (ucs) == G_UNICODE_SURROGATE) |
1132 | 0 | { |
1133 | | /* read next surrogate */ |
1134 | 0 | if ('\\' == json_scanner_get_char (scanner, line_p, position_p) && |
1135 | 0 | 'u' == json_scanner_get_char (scanner, line_p, position_p)) |
1136 | 0 | { |
1137 | 0 | gunichar units[2]; |
1138 | |
|
1139 | 0 | units[0] = ucs; |
1140 | 0 | units[1] = json_scanner_get_unichar (scanner, line_p, position_p); |
1141 | |
|
1142 | 0 | if (0xdc00 <= units[1] && units[1] <= 0xdfff && |
1143 | 0 | 0xd800 <= units[0] && units[0] <= 0xdbff) |
1144 | 0 | { |
1145 | 0 | ucs = decode_utf16_surrogate_pair (units); |
1146 | 0 | g_assert (g_unichar_validate (ucs)); |
1147 | 0 | } |
1148 | 0 | else |
1149 | 0 | { |
1150 | 0 | token = G_TOKEN_ERROR; |
1151 | 0 | value.v_error = JSON_ERR_MALFORMED_SURROGATE_PAIR; |
1152 | 0 | gstring = NULL; |
1153 | 0 | break; |
1154 | 0 | } |
1155 | |
|
1156 | 0 | } |
1157 | 0 | } |
1158 | | |
1159 | 0 | gstring = g_string_append_unichar (gstring, ucs); |
1160 | 0 | } |
1161 | 0 | break; |
1162 | | |
1163 | 0 | case '0': |
1164 | 0 | case '1': |
1165 | 0 | case '2': |
1166 | 0 | case '3': |
1167 | 0 | case '4': |
1168 | 0 | case '5': |
1169 | 0 | case '6': |
1170 | 0 | case '7': |
1171 | 0 | i = ch - '0'; |
1172 | 0 | fchar = json_scanner_peek_next_char (scanner); |
1173 | 0 | if (fchar >= '0' && fchar <= '7') |
1174 | 0 | { |
1175 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1176 | 0 | i = i * 8 + ch - '0'; |
1177 | 0 | fchar = json_scanner_peek_next_char (scanner); |
1178 | 0 | if (fchar >= '0' && fchar <= '7') |
1179 | 0 | { |
1180 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1181 | 0 | i = i * 8 + ch - '0'; |
1182 | 0 | } |
1183 | 0 | } |
1184 | 0 | gstring = g_string_append_c (gstring, i); |
1185 | 0 | break; |
1186 | | |
1187 | 0 | default: |
1188 | 0 | gstring = g_string_append_c (gstring, ch); |
1189 | 0 | break; |
1190 | 0 | } |
1191 | 0 | } |
1192 | 0 | else |
1193 | 0 | gstring = g_string_append_c (gstring, ch); |
1194 | 0 | } |
1195 | 0 | } |
1196 | 0 | ch = 0; |
1197 | 0 | break; |
1198 | | |
1199 | 0 | case '.': |
1200 | 0 | if (!config->scan_float) |
1201 | 0 | goto default_case; |
1202 | 0 | token = G_TOKEN_FLOAT; |
1203 | 0 | dotted_float = TRUE; |
1204 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1205 | 0 | goto number_parsing; |
1206 | | |
1207 | 0 | case '$': |
1208 | 0 | if (!config->scan_hex_dollar) |
1209 | 0 | goto default_case; |
1210 | 0 | token = G_TOKEN_HEX; |
1211 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1212 | 0 | goto number_parsing; |
1213 | | |
1214 | 0 | case '0': |
1215 | 0 | if (config->scan_octal) |
1216 | 0 | token = G_TOKEN_OCTAL; |
1217 | 0 | else |
1218 | 0 | token = G_TOKEN_INT; |
1219 | 0 | ch = json_scanner_peek_next_char (scanner); |
1220 | 0 | if (config->scan_hex && (ch == 'x' || ch == 'X')) |
1221 | 0 | { |
1222 | 0 | token = G_TOKEN_HEX; |
1223 | 0 | json_scanner_get_char (scanner, line_p, position_p); |
1224 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1225 | 0 | if (ch == 0) |
1226 | 0 | { |
1227 | 0 | token = G_TOKEN_ERROR; |
1228 | 0 | value.v_error = G_ERR_UNEXP_EOF; |
1229 | 0 | (*position_p)++; |
1230 | 0 | break; |
1231 | 0 | } |
1232 | 0 | if (json_scanner_char_2_num (ch, 16) < 0) |
1233 | 0 | { |
1234 | 0 | token = G_TOKEN_ERROR; |
1235 | 0 | value.v_error = G_ERR_DIGIT_RADIX; |
1236 | 0 | ch = 0; |
1237 | 0 | break; |
1238 | 0 | } |
1239 | 0 | } |
1240 | 0 | else if (config->scan_binary && (ch == 'b' || ch == 'B')) |
1241 | 0 | { |
1242 | 0 | token = G_TOKEN_BINARY; |
1243 | 0 | json_scanner_get_char (scanner, line_p, position_p); |
1244 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1245 | 0 | if (ch == 0) |
1246 | 0 | { |
1247 | 0 | token = G_TOKEN_ERROR; |
1248 | 0 | value.v_error = G_ERR_UNEXP_EOF; |
1249 | 0 | (*position_p)++; |
1250 | 0 | break; |
1251 | 0 | } |
1252 | 0 | if (json_scanner_char_2_num (ch, 10) < 0) |
1253 | 0 | { |
1254 | 0 | token = G_TOKEN_ERROR; |
1255 | 0 | value.v_error = G_ERR_NON_DIGIT_IN_CONST; |
1256 | 0 | ch = 0; |
1257 | 0 | break; |
1258 | 0 | } |
1259 | 0 | } |
1260 | 0 | else |
1261 | 0 | ch = '0'; |
1262 | | /* fall through */ |
1263 | 0 | case '1': |
1264 | 0 | case '2': |
1265 | 0 | case '3': |
1266 | 0 | case '4': |
1267 | 0 | case '5': |
1268 | 0 | case '6': |
1269 | 0 | case '7': |
1270 | 0 | case '8': |
1271 | 0 | case '9': |
1272 | 0 | number_parsing: |
1273 | 0 | { |
1274 | 0 | gboolean in_number = TRUE; |
1275 | 0 | gchar *endptr; |
1276 | | |
1277 | 0 | if (token == G_TOKEN_NONE) |
1278 | 0 | token = G_TOKEN_INT; |
1279 | | |
1280 | 0 | gstring = g_string_new (dotted_float ? "0." : ""); |
1281 | 0 | gstring = g_string_append_c (gstring, ch); |
1282 | | |
1283 | 0 | do /* while (in_number) */ |
1284 | 0 | { |
1285 | 0 | gboolean is_E; |
1286 | | |
1287 | 0 | is_E = token == G_TOKEN_FLOAT && (ch == 'e' || ch == 'E'); |
1288 | | |
1289 | 0 | ch = json_scanner_peek_next_char (scanner); |
1290 | | |
1291 | 0 | if (json_scanner_char_2_num (ch, 36) >= 0 || |
1292 | 0 | (config->scan_float && ch == '.') || |
1293 | 0 | (is_E && (ch == '+' || ch == '-'))) |
1294 | 0 | { |
1295 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1296 | | |
1297 | 0 | switch (ch) |
1298 | 0 | { |
1299 | 0 | case '.': |
1300 | 0 | if (token != G_TOKEN_INT && token != G_TOKEN_OCTAL) |
1301 | 0 | { |
1302 | 0 | value.v_error = token == G_TOKEN_FLOAT ? G_ERR_FLOAT_MALFORMED : G_ERR_FLOAT_RADIX; |
1303 | 0 | token = G_TOKEN_ERROR; |
1304 | 0 | in_number = FALSE; |
1305 | 0 | } |
1306 | 0 | else |
1307 | 0 | { |
1308 | 0 | token = G_TOKEN_FLOAT; |
1309 | 0 | gstring = g_string_append_c (gstring, ch); |
1310 | 0 | } |
1311 | 0 | break; |
1312 | | |
1313 | 0 | case '0': |
1314 | 0 | case '1': |
1315 | 0 | case '2': |
1316 | 0 | case '3': |
1317 | 0 | case '4': |
1318 | 0 | case '5': |
1319 | 0 | case '6': |
1320 | 0 | case '7': |
1321 | 0 | case '8': |
1322 | 0 | case '9': |
1323 | 0 | gstring = g_string_append_c (gstring, ch); |
1324 | 0 | break; |
1325 | | |
1326 | 0 | case '-': |
1327 | 0 | case '+': |
1328 | 0 | if (token != G_TOKEN_FLOAT) |
1329 | 0 | { |
1330 | 0 | token = G_TOKEN_ERROR; |
1331 | 0 | value.v_error = G_ERR_NON_DIGIT_IN_CONST; |
1332 | 0 | in_number = FALSE; |
1333 | 0 | } |
1334 | 0 | else |
1335 | 0 | gstring = g_string_append_c (gstring, ch); |
1336 | 0 | break; |
1337 | | |
1338 | 0 | case 'e': |
1339 | 0 | case 'E': |
1340 | 0 | if ((token != G_TOKEN_HEX && !config->scan_float) || |
1341 | 0 | (token != G_TOKEN_HEX && |
1342 | 0 | token != G_TOKEN_OCTAL && |
1343 | 0 | token != G_TOKEN_FLOAT && |
1344 | 0 | token != G_TOKEN_INT)) |
1345 | 0 | { |
1346 | 0 | token = G_TOKEN_ERROR; |
1347 | 0 | value.v_error = G_ERR_NON_DIGIT_IN_CONST; |
1348 | 0 | in_number = FALSE; |
1349 | 0 | } |
1350 | 0 | else |
1351 | 0 | { |
1352 | 0 | if (token != G_TOKEN_HEX) |
1353 | 0 | token = G_TOKEN_FLOAT; |
1354 | 0 | gstring = g_string_append_c (gstring, ch); |
1355 | 0 | } |
1356 | 0 | break; |
1357 | | |
1358 | 0 | default: |
1359 | 0 | if (token != G_TOKEN_HEX) |
1360 | 0 | { |
1361 | 0 | token = G_TOKEN_ERROR; |
1362 | 0 | value.v_error = G_ERR_NON_DIGIT_IN_CONST; |
1363 | 0 | in_number = FALSE; |
1364 | 0 | } |
1365 | 0 | else |
1366 | 0 | gstring = g_string_append_c (gstring, ch); |
1367 | 0 | break; |
1368 | 0 | } |
1369 | 0 | } |
1370 | 0 | else |
1371 | 0 | in_number = FALSE; |
1372 | 0 | } |
1373 | 0 | while (in_number); |
1374 | | |
1375 | 0 | endptr = NULL; |
1376 | 0 | if (token == G_TOKEN_FLOAT) |
1377 | 0 | value.v_float = g_strtod (gstring->str, &endptr); |
1378 | 0 | else |
1379 | 0 | { |
1380 | 0 | guint64 ui64 = 0; |
1381 | 0 | switch (token) |
1382 | 0 | { |
1383 | 0 | case G_TOKEN_BINARY: |
1384 | 0 | ui64 = g_ascii_strtoull (gstring->str, &endptr, 2); |
1385 | 0 | break; |
1386 | 0 | case G_TOKEN_OCTAL: |
1387 | 0 | ui64 = g_ascii_strtoull (gstring->str, &endptr, 8); |
1388 | 0 | break; |
1389 | 0 | case G_TOKEN_INT: |
1390 | 0 | ui64 = g_ascii_strtoull (gstring->str, &endptr, 10); |
1391 | 0 | break; |
1392 | 0 | case G_TOKEN_HEX: |
1393 | 0 | ui64 = g_ascii_strtoull (gstring->str, &endptr, 16); |
1394 | 0 | break; |
1395 | 0 | default: ; |
1396 | 0 | } |
1397 | 0 | if (scanner->config->store_int64) |
1398 | 0 | value.v_int64 = ui64; |
1399 | 0 | else |
1400 | 0 | value.v_int = ui64; |
1401 | 0 | } |
1402 | 0 | if (endptr && *endptr) |
1403 | 0 | { |
1404 | 0 | token = G_TOKEN_ERROR; |
1405 | 0 | if (*endptr == 'e' || *endptr == 'E') |
1406 | 0 | value.v_error = G_ERR_NON_DIGIT_IN_CONST; |
1407 | 0 | else |
1408 | 0 | value.v_error = G_ERR_DIGIT_RADIX; |
1409 | 0 | } |
1410 | 0 | g_string_free (gstring, TRUE); |
1411 | 0 | gstring = NULL; |
1412 | 0 | ch = 0; |
1413 | 0 | } /* number_parsing:... */ |
1414 | 0 | break; |
1415 | | |
1416 | 0 | default: |
1417 | 0 | default_case: |
1418 | 0 | { |
1419 | 0 | if (config->cpair_comment_single && |
1420 | 0 | ch == config->cpair_comment_single[0]) |
1421 | 0 | { |
1422 | 0 | token = G_TOKEN_COMMENT_SINGLE; |
1423 | 0 | in_comment_single = TRUE; |
1424 | 0 | gstring = g_string_new (NULL); |
1425 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1426 | 0 | while (ch != 0) |
1427 | 0 | { |
1428 | 0 | if (ch == config->cpair_comment_single[1]) |
1429 | 0 | { |
1430 | 0 | in_comment_single = FALSE; |
1431 | 0 | ch = 0; |
1432 | 0 | break; |
1433 | 0 | } |
1434 | | |
1435 | 0 | gstring = g_string_append_c (gstring, ch); |
1436 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1437 | 0 | } |
1438 | | /* ignore a missing newline at EOF for single line comments */ |
1439 | 0 | if (in_comment_single && |
1440 | 0 | config->cpair_comment_single[1] == '\n') |
1441 | 0 | in_comment_single = FALSE; |
1442 | 0 | } |
1443 | 0 | else if (config->scan_identifier && ch && |
1444 | 0 | strchr (config->cset_identifier_first, ch)) |
1445 | 0 | { |
1446 | 0 | identifier_precedence: |
1447 | | |
1448 | 0 | if (config->cset_identifier_nth && ch && |
1449 | 0 | strchr (config->cset_identifier_nth, |
1450 | 0 | json_scanner_peek_next_char (scanner))) |
1451 | 0 | { |
1452 | 0 | token = G_TOKEN_IDENTIFIER; |
1453 | 0 | gstring = g_string_new (NULL); |
1454 | 0 | gstring = g_string_append_c (gstring, ch); |
1455 | 0 | do |
1456 | 0 | { |
1457 | 0 | ch = json_scanner_get_char (scanner, line_p, position_p); |
1458 | 0 | gstring = g_string_append_c (gstring, ch); |
1459 | 0 | ch = json_scanner_peek_next_char (scanner); |
1460 | 0 | } |
1461 | 0 | while (ch && strchr (config->cset_identifier_nth, ch)); |
1462 | 0 | ch = 0; |
1463 | 0 | } |
1464 | 0 | else if (config->scan_identifier_1char) |
1465 | 0 | { |
1466 | 0 | token = G_TOKEN_IDENTIFIER; |
1467 | 0 | value.v_identifier = g_new0 (gchar, 2); |
1468 | 0 | value.v_identifier[0] = ch; |
1469 | 0 | ch = 0; |
1470 | 0 | } |
1471 | 0 | } |
1472 | 0 | if (ch) |
1473 | 0 | { |
1474 | 0 | if (config->char_2_token) |
1475 | 0 | token = ch; |
1476 | 0 | else |
1477 | 0 | { |
1478 | 0 | token = G_TOKEN_CHAR; |
1479 | 0 | value.v_char = ch; |
1480 | 0 | } |
1481 | 0 | ch = 0; |
1482 | 0 | } |
1483 | 0 | } /* default_case:... */ |
1484 | 0 | break; |
1485 | 0 | } |
1486 | 0 | g_assert (ch == 0 && token != G_TOKEN_NONE); /* paranoid */ |
1487 | 0 | } |
1488 | 0 | while (ch != 0); |
1489 | | |
1490 | 0 | if (in_comment_multi || in_comment_single || |
1491 | 0 | in_string_sq || in_string_dq) |
1492 | 0 | { |
1493 | 0 | token = G_TOKEN_ERROR; |
1494 | 0 | if (gstring) |
1495 | 0 | { |
1496 | 0 | g_string_free (gstring, TRUE); |
1497 | 0 | gstring = NULL; |
1498 | 0 | } |
1499 | 0 | (*position_p)++; |
1500 | 0 | if (in_comment_multi || in_comment_single) |
1501 | 0 | value.v_error = G_ERR_UNEXP_EOF_IN_COMMENT; |
1502 | 0 | else /* (in_string_sq || in_string_dq) */ |
1503 | 0 | value.v_error = G_ERR_UNEXP_EOF_IN_STRING; |
1504 | 0 | } |
1505 | | |
1506 | 0 | if (gstring) |
1507 | 0 | { |
1508 | 0 | value.v_string = g_string_free (gstring, FALSE); |
1509 | 0 | gstring = NULL; |
1510 | 0 | } |
1511 | | |
1512 | 0 | if (token == G_TOKEN_IDENTIFIER) |
1513 | 0 | { |
1514 | 0 | if (config->scan_symbols) |
1515 | 0 | { |
1516 | 0 | JsonScannerKey *key; |
1517 | 0 | guint scope_id; |
1518 | | |
1519 | 0 | scope_id = scanner->scope_id; |
1520 | 0 | key = json_scanner_lookup_internal (scanner, scope_id, value.v_identifier); |
1521 | 0 | if (!key && scope_id && scanner->config->scope_0_fallback) |
1522 | 0 | key = json_scanner_lookup_internal (scanner, 0, value.v_identifier); |
1523 | | |
1524 | 0 | if (key) |
1525 | 0 | { |
1526 | 0 | g_free (value.v_identifier); |
1527 | 0 | token = G_TOKEN_SYMBOL; |
1528 | 0 | value.v_symbol = key->value; |
1529 | 0 | } |
1530 | 0 | } |
1531 | | |
1532 | 0 | if (token == G_TOKEN_IDENTIFIER && |
1533 | 0 | config->scan_identifier_NULL && |
1534 | 0 | strlen (value.v_identifier) == 4) |
1535 | 0 | { |
1536 | 0 | gchar *null_upper = "NULL"; |
1537 | 0 | gchar *null_lower = "null"; |
1538 | | |
1539 | 0 | if (scanner->config->case_sensitive) |
1540 | 0 | { |
1541 | 0 | if (value.v_identifier[0] == null_upper[0] && |
1542 | 0 | value.v_identifier[1] == null_upper[1] && |
1543 | 0 | value.v_identifier[2] == null_upper[2] && |
1544 | 0 | value.v_identifier[3] == null_upper[3]) |
1545 | 0 | token = G_TOKEN_IDENTIFIER_NULL; |
1546 | 0 | } |
1547 | 0 | else |
1548 | 0 | { |
1549 | 0 | if ((value.v_identifier[0] == null_upper[0] || |
1550 | 0 | value.v_identifier[0] == null_lower[0]) && |
1551 | 0 | (value.v_identifier[1] == null_upper[1] || |
1552 | 0 | value.v_identifier[1] == null_lower[1]) && |
1553 | 0 | (value.v_identifier[2] == null_upper[2] || |
1554 | 0 | value.v_identifier[2] == null_lower[2]) && |
1555 | 0 | (value.v_identifier[3] == null_upper[3] || |
1556 | 0 | value.v_identifier[3] == null_lower[3])) |
1557 | 0 | token = G_TOKEN_IDENTIFIER_NULL; |
1558 | 0 | } |
1559 | 0 | } |
1560 | 0 | } |
1561 | | |
1562 | 0 | *token_p = token; |
1563 | 0 | *value_p = value; |
1564 | 0 | } |