/src/serenity/AK/JsonParser.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/CharacterTypes.h> |
8 | | #include <AK/FloatingPointStringConversions.h> |
9 | | #include <AK/JsonArray.h> |
10 | | #include <AK/JsonObject.h> |
11 | | #include <AK/JsonParser.h> |
12 | | #include <math.h> |
13 | | |
14 | | namespace AK { |
15 | | |
// Whitespace predicate used with ignore_while(): true only for horizontal tab,
// line feed, carriage return and space.
constexpr bool is_space(int ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
20 | | |
21 | | // ECMA-404 9 String |
22 | | // Boils down to |
23 | | // STRING = "\"" *("[^\"\\]" | "\\" ("[\"\\bfnrt]" | "u[0-9A-Za-z]{4}")) "\"" |
24 | | // │├── " ──╮───────────────────────────────────────────────╭── " ──┤│ |
25 | | // │ │ |
26 | | // │ ╭───────────────────<─────────────────────╮ │ |
27 | | // │ │ │ │ |
28 | | // ╰──╰──╮───────────── [^"\\] ──────────────╭──╯──╯ |
29 | | // │ │ |
30 | | // ╰── \ ───╮──── ["\\bfnrt] ───────╭──╯ |
31 | | // │ │ |
32 | | // ╰─── u[0-9A-Za-z]{4} ──╯ |
33 | | // |
34 | | ErrorOr<ByteString> JsonParser::consume_and_unescape_string() |
35 | 4.07M | { |
36 | 4.07M | if (!consume_specific('"')) |
37 | 175 | return Error::from_string_literal("JsonParser: Expected '\"'"); |
38 | 4.07M | StringBuilder final_sb; |
39 | | |
40 | 5.28M | for (;;) { |
41 | | // OPTIMIZATION: We try to append as many literal characters as possible at a time |
42 | | // This also pre-checks some error conditions |
43 | | // Note: All utf8 characters are either plain ascii, or have their most signifiant bit set, |
44 | | // which puts the, above plain ascii in value, so they will always consist |
45 | | // of a set of "legal" non-special bytes, |
46 | | // hence we don't need to bother with a code-point iterator, |
47 | | // as a simple byte iterator suffices, which GenericLexer provides by default |
48 | 5.28M | size_t literal_characters = 0; |
49 | 80.4M | for (;;) { |
50 | 80.4M | char ch = peek(literal_characters); |
51 | | // Note: We get a 0 byte when we hit EOF |
52 | 80.4M | if (ch == 0) |
53 | 739 | return Error::from_string_literal("JsonParser: EOF while parsing String"); |
54 | | // Spec: All code points may be placed within the quotation marks except |
55 | | // for the code points that must be escaped: quotation mark (U+0022), |
56 | | // reverse solidus (U+005C), and the control characters U+0000 to U+001F. |
57 | | // There are two-character escape sequence representations of some characters. |
58 | 80.4M | if (is_ascii_c0_control(ch)) |
59 | 34 | return Error::from_string_literal("JsonParser: ASCII control sequence encountered"); |
60 | 80.4M | if (ch == '"' || ch == '\\') |
61 | 5.28M | break; |
62 | 75.2M | ++literal_characters; |
63 | 75.2M | } |
64 | 5.28M | final_sb.append(consume(literal_characters)); |
65 | | |
66 | | // We have checked all cases except end-of-string and escaped characters in the loop above, |
67 | | // so we now only have to handle those two cases |
68 | 5.28M | char ch = peek(); |
69 | | |
70 | 5.28M | if (ch == '"') { |
71 | 4.07M | consume(); |
72 | 4.07M | break; |
73 | 4.07M | } |
74 | | |
75 | 1.20M | ignore(); // '\' |
76 | | |
77 | 1.20M | switch (peek()) { |
78 | 76 | case '\0': |
79 | 76 | return Error::from_string_literal("JsonParser: EOF while parsing String"); |
80 | 743 | case '"': |
81 | 21.9k | case '\\': |
82 | 25.4k | case '/': |
83 | 25.4k | final_sb.append(consume()); |
84 | 25.4k | break; |
85 | 289k | case 'b': |
86 | 289k | ignore(); |
87 | 289k | final_sb.append('\b'); |
88 | 289k | break; |
89 | 213k | case 'f': |
90 | 213k | ignore(); |
91 | 213k | final_sb.append('\f'); |
92 | 213k | break; |
93 | 2.70k | case 'n': |
94 | 2.70k | ignore(); |
95 | 2.70k | final_sb.append('\n'); |
96 | 2.70k | break; |
97 | 696 | case 'r': |
98 | 696 | ignore(); |
99 | 696 | final_sb.append('\r'); |
100 | 696 | break; |
101 | 830 | case 't': |
102 | 830 | ignore(); |
103 | 830 | final_sb.append('\t'); |
104 | 830 | break; |
105 | 675k | case 'u': { |
106 | 675k | ignore(); // 'u' |
107 | | |
108 | | // https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf |
109 | | // |
110 | | // To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a |
111 | | // twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for |
112 | | // example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". |
113 | | // However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an |
114 | | // explicit surrogate pair is a semantic decision that is determined by the specific processor. |
115 | 675k | auto code_point = decode_single_or_paired_surrogate(); |
116 | | |
117 | 675k | if (code_point.is_error()) |
118 | 314 | return Error::from_string_literal("JsonParser: Error while parsing Unicode escape"); |
119 | | |
120 | 675k | final_sb.append_code_point(code_point.value()); |
121 | 675k | break; |
122 | 675k | } |
123 | 48 | default: |
124 | 48 | dbgln("JsonParser: Invalid escaped character '{}' ({:#x}) ", peek(), peek()); |
125 | 48 | return Error::from_string_literal("JsonParser: Invalid escaped character"); |
126 | 1.20M | } |
127 | 1.20M | } |
128 | | |
129 | 4.07M | return final_sb.to_byte_string(); |
130 | 4.07M | } |
131 | | |
132 | | ErrorOr<JsonValue> JsonParser::parse_object() |
133 | 74.5k | { |
134 | 74.5k | JsonObject object; |
135 | 74.5k | if (!consume_specific('{')) |
136 | 0 | return Error::from_string_literal("JsonParser: Expected '{'"); |
137 | 1.25M | for (;;) { |
138 | 1.25M | ignore_while(is_space); |
139 | 1.25M | if (peek() == '}') |
140 | 9.88k | break; |
141 | 1.24M | ignore_while(is_space); |
142 | 1.24M | auto name = TRY(consume_and_unescape_string()); |
143 | 1.24M | ignore_while(is_space); |
144 | 1.24M | if (!consume_specific(':')) |
145 | 113 | return Error::from_string_literal("JsonParser: Expected ':'"); |
146 | 1.24M | ignore_while(is_space); |
147 | 1.24M | auto value = TRY(parse_helper()); |
148 | 1.18M | object.set(name, move(value)); |
149 | 1.18M | ignore_while(is_space); |
150 | 1.18M | if (peek() == '}') |
151 | 5.01k | break; |
152 | 1.18M | if (!consume_specific(',')) |
153 | 495 | return Error::from_string_literal("JsonParser: Expected ','"); |
154 | 1.18M | ignore_while(is_space); |
155 | 1.18M | if (peek() == '}') |
156 | 1 | return Error::from_string_literal("JsonParser: Unexpected '}'"); |
157 | 1.18M | } |
158 | 14.8k | if (!consume_specific('}')) |
159 | 0 | return Error::from_string_literal("JsonParser: Expected '}'"); |
160 | 14.8k | return JsonValue { move(object) }; |
161 | 14.8k | } |
162 | | |
163 | | ErrorOr<JsonValue> JsonParser::parse_array() |
164 | 288k | { |
165 | 288k | JsonArray array; |
166 | 288k | if (!consume_specific('[')) |
167 | 0 | return Error::from_string_literal("JsonParser: Expected '['"); |
168 | 9.27M | for (;;) { |
169 | 9.27M | ignore_while(is_space); |
170 | 9.27M | if (peek() == ']') |
171 | 835 | break; |
172 | 9.27M | auto element = TRY(parse_helper()); |
173 | 9.08M | TRY(array.append(move(element))); |
174 | 9.08M | ignore_while(is_space); |
175 | 9.08M | if (peek() == ']') |
176 | 92.7k | break; |
177 | 8.98M | if (!consume_specific(',')) |
178 | 919 | return Error::from_string_literal("JsonParser: Expected ','"); |
179 | 8.98M | ignore_while(is_space); |
180 | 8.98M | if (peek() == ']') |
181 | 1 | return Error::from_string_literal("JsonParser: Unexpected ']'"); |
182 | 8.98M | } |
183 | 93.6k | ignore_while(is_space); |
184 | 93.6k | if (!consume_specific(']')) |
185 | 0 | return Error::from_string_literal("JsonParser: Expected ']'"); |
186 | 93.6k | return JsonValue { move(array) }; |
187 | 93.6k | } |
188 | | |
189 | | ErrorOr<JsonValue> JsonParser::parse_string() |
190 | 2.82M | { |
191 | 2.82M | auto string = TRY(consume_and_unescape_string()); |
192 | 2.82M | return JsonValue(move(string)); |
193 | 2.82M | } |
194 | | |
195 | | ErrorOr<JsonValue> JsonParser::parse_number() |
196 | 7.31M | { |
197 | 7.31M | Vector<char, 32> number_buffer; |
198 | | |
199 | 7.31M | auto start_index = tell(); |
200 | | |
201 | 7.31M | bool negative = false; |
202 | 7.31M | if (peek() == '-') { |
203 | 135k | number_buffer.append('-'); |
204 | 135k | ++m_index; |
205 | 135k | negative = true; |
206 | | |
207 | 135k | if (!is_ascii_digit(peek())) |
208 | 25 | return Error::from_string_literal("JsonParser: Unexpected '-' without further digits"); |
209 | 135k | } |
210 | | |
211 | 7.31M | auto fallback_to_double_parse = [&]() -> ErrorOr<JsonValue> { |
212 | | #ifdef KERNEL |
213 | | # error JSONParser is currently not available for the Kernel because it disallows floating point. \ |
214 | | If you want to make this KERNEL compatible you can just make this fallback_to_double \ |
215 | | function fail with an error in KERNEL mode. |
216 | | #endif |
217 | | // FIXME: Since we know all the characters so far are ascii digits (and one . or e) we could |
218 | | // use that in the floating point parser. |
219 | | |
220 | | // The first part should be just ascii digits |
221 | 288k | StringView view = m_input.substring_view(start_index); |
222 | | |
223 | 288k | char const* start = view.characters_without_null_termination(); |
224 | 288k | auto parse_result = parse_first_floating_point(start, start + view.length()); |
225 | | |
226 | 288k | if (parse_result.parsed_value()) { |
227 | 288k | auto characters_parsed = parse_result.end_ptr - start; |
228 | 288k | m_index = start_index + characters_parsed; |
229 | | |
230 | 288k | return JsonValue(parse_result.value); |
231 | 288k | } |
232 | 0 | return Error::from_string_literal("JsonParser: Invalid floating point"); |
233 | 288k | }; |
234 | | |
235 | 7.31M | if (peek() == '0') { |
236 | 1.89M | if (is_ascii_digit(peek(1))) |
237 | 10 | return Error::from_string_literal("JsonParser: Cannot have leading zeros"); |
238 | | |
239 | | // Leading zeros are not allowed, however we can have a '.' or 'e' with |
240 | | // valid digits after just a zero. These cases will be detected by having the next element |
241 | | // start with a '.' or 'e'. |
242 | 1.89M | } |
243 | | |
244 | 7.31M | bool all_zero = true; |
245 | 32.4M | for (;;) { |
246 | 32.4M | char ch = peek(); |
247 | 32.4M | if (ch == '.') { |
248 | 13.7k | if (!is_ascii_digit(peek(1))) |
249 | 18 | return Error::from_string_literal("JsonParser: Must have digits after decimal point"); |
250 | | |
251 | 13.7k | return fallback_to_double_parse(); |
252 | 13.7k | } |
253 | 32.4M | if (ch == 'e' || ch == 'E') { |
254 | 269k | char next = peek(1); |
255 | 269k | if (!is_ascii_digit(next) && ((next != '+' && next != '-') || !is_ascii_digit(peek(2)))) |
256 | 65 | return Error::from_string_literal("JsonParser: Must have digits after exponent with an optional sign inbetween"); |
257 | | |
258 | 269k | return fallback_to_double_parse(); |
259 | 269k | } |
260 | | |
261 | 32.2M | if (is_ascii_digit(ch)) { |
262 | 25.1M | if (ch != '0') |
263 | 7.60M | all_zero = false; |
264 | | |
265 | 25.1M | number_buffer.append(ch); |
266 | 25.1M | ++m_index; |
267 | 25.1M | continue; |
268 | 25.1M | } |
269 | | |
270 | 7.03M | break; |
271 | 32.2M | } |
272 | | |
273 | | // Negative zero is always a double |
274 | 7.03M | if (negative && all_zero) |
275 | 1.82k | return JsonValue(-0.0); |
276 | | |
277 | 7.03M | StringView number_string(number_buffer.data(), number_buffer.size()); |
278 | | |
279 | 7.03M | if (auto number = number_string.to_number<u64>(); number.has_value()) |
280 | 6.99M | return JsonValue(*number); |
281 | 35.0k | if (auto number = number_string.to_number<i64>(); number.has_value()) |
282 | 29.9k | return JsonValue(*number); |
283 | | |
284 | | // It's possible the unsigned value is bigger than u64 max |
285 | 5.11k | return fallback_to_double_parse(); |
286 | 35.0k | } |
287 | | |
288 | | ErrorOr<JsonValue> JsonParser::parse_true() |
289 | 2.62k | { |
290 | 2.62k | if (!consume_specific("true"sv)) |
291 | 49 | return Error::from_string_literal("JsonParser: Expected 'true'"); |
292 | 2.57k | return JsonValue(true); |
293 | 2.62k | } |
294 | | |
295 | | ErrorOr<JsonValue> JsonParser::parse_false() |
296 | 9.34k | { |
297 | 9.34k | if (!consume_specific("false"sv)) |
298 | 70 | return Error::from_string_literal("JsonParser: Expected 'false'"); |
299 | 9.27k | return JsonValue(false); |
300 | 9.34k | } |
301 | | |
302 | | ErrorOr<JsonValue> JsonParser::parse_null() |
303 | 6.55k | { |
304 | 6.55k | if (!consume_specific("null"sv)) |
305 | 59 | return Error::from_string_literal("JsonParser: Expected 'null'"); |
306 | 6.49k | return JsonValue {}; |
307 | 6.55k | } |
308 | | |
309 | | ErrorOr<JsonValue> JsonParser::parse_helper() |
310 | 10.5M | { |
311 | 10.5M | ignore_while(is_space); |
312 | 10.5M | auto type_hint = peek(); |
313 | 10.5M | switch (type_hint) { |
314 | 74.5k | case '{': |
315 | 74.5k | return parse_object(); |
316 | 288k | case '[': |
317 | 288k | return parse_array(); |
318 | 2.82M | case '"': |
319 | 2.82M | return parse_string(); |
320 | 135k | case '-': |
321 | 2.02M | case '0': |
322 | 2.37M | case '1': |
323 | 2.52M | case '2': |
324 | 2.68M | case '3': |
325 | 3.40M | case '4': |
326 | 3.54M | case '5': |
327 | 6.35M | case '6': |
328 | 6.87M | case '7': |
329 | 7.09M | case '8': |
330 | 7.31M | case '9': |
331 | 7.31M | return parse_number(); |
332 | 9.34k | case 'f': |
333 | 9.34k | return parse_false(); |
334 | 2.62k | case 't': |
335 | 2.62k | return parse_true(); |
336 | 6.55k | case 'n': |
337 | 6.55k | return parse_null(); |
338 | 10.5M | } |
339 | | |
340 | 364 | return Error::from_string_literal("JsonParser: Unexpected character"); |
341 | 10.5M | } |
342 | | |
343 | | ErrorOr<JsonValue> JsonParser::parse() |
344 | 6.25k | { |
345 | 6.25k | auto result = TRY(parse_helper()); |
346 | 2.68k | ignore_while(is_space); |
347 | 2.68k | if (!is_eof()) |
348 | 226 | return Error::from_string_literal("JsonParser: Didn't consume all input"); |
349 | 2.45k | return result; |
350 | 2.68k | } |
351 | | |
352 | | } |