/src/simdjson/include/simdjson/dom/serialization-inl.h
Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #ifndef SIMDJSON_SERIALIZATION_INL_H |
3 | | #define SIMDJSON_SERIALIZATION_INL_H |
4 | | |
5 | | #include "simdjson/dom/serialization.h" |
6 | | |
7 | | #include <cinttypes> |
8 | | #include <type_traits> |
9 | | |
10 | | namespace simdjson { |
11 | | namespace dom { |
12 | 0 | inline bool parser::print_json(std::ostream &os) const noexcept { |
13 | 0 | if (!valid) { return false; } |
14 | 0 | simdjson::internal::string_builder<> sb; |
15 | 0 | sb.append(doc.root()); |
16 | 0 | std::string_view answer = sb.str(); |
17 | 0 | os << answer; |
18 | 0 | return true; |
19 | 0 | } |
20 | | } |
21 | | /*** |
22 | | * Number utility functions |
23 | | **/ |
24 | | |
25 | | |
26 | | namespace { |
/**@private
 * One pre-rendered JSON escape sequence such as \b or \u0001.
 *
 * `length` is the number of meaningful characters in `string`. The longest
 * sequence (\uXXXX) is six characters; the seventh slot holds the
 * terminating NUL of the string literal used as initializer. We expect
 * most compilers to lay this structure out in 8 bytes.
 **/
struct escape_sequence {
  uint8_t length;       // count of characters to emit from `string`
  const char string[7]; // up to 6 characters plus the NUL terminator
};
/**@private
 * Converts an unsigned 64-bit integer into its decimal character sequence.
 *
 * The digits are written starting at `output`; no NUL terminator is added.
 * The caller is responsible for providing enough memory: at least
 * 20 characters, since UINT64_MAX (18446744073709551615) has 20 digits.
 *
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output destination buffer
 * @param value  number to convert
 * @return pointer one past the last character written
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // Digits are produced least-significant first, so they are staged
  // right-aligned in a scratch buffer and copied out in one go.
  char buffer[20];
  const char *const end_buffer = buffer + sizeof(buffer);
  char *write_pointer = buffer + sizeof(buffer) - 1;
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value >= 10) {
    *write_pointer-- = char('0' + (value % 10));
    value /= 10;
  }
  *write_pointer = char('0' + value);
  const size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}

/**@private
 * Converts a signed 64-bit integer into its decimal character sequence.
 *
 * A leading '-' is written for negative values; no NUL terminator is added.
 * The caller is responsible for providing enough memory: at least
 * 20 characters ('-' followed by the 19 digits of INT64_MIN's magnitude).
 *
 * @param output destination buffer
 * @param value  number to convert
 * @return pointer one past the last character written
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  if (value < 0) {
    *output++ = '-';
    // Negating a signed integer directly is undefined for INT64_MIN.
    // Converting to unsigned first is well defined (modulo 2^64), and
    // subtracting from zero then yields the magnitude, INT64_MIN included.
    return fast_itoa(output, uint64_t(0) - uint64_t(value));
  }
  return fast_itoa(output, uint64_t(value));
}
103 | | } // anonymous namespace |
104 | | namespace internal { |
105 | | |
106 | | /*** |
107 | | * Minifier/formatter code. |
108 | | **/ |
109 | | |
110 | 0 | simdjson_inline void mini_formatter::number(uint64_t x) { |
111 | 0 | char number_buffer[24]; |
112 | 0 | char *newp = fast_itoa(number_buffer, x); |
113 | 0 | buffer.insert(buffer.end(), number_buffer, newp); |
114 | 0 | } |
115 | | |
116 | 0 | simdjson_inline void mini_formatter::number(int64_t x) { |
117 | 0 | char number_buffer[24]; |
118 | 0 | char *newp = fast_itoa(number_buffer, x); |
119 | 0 | buffer.insert(buffer.end(), number_buffer, newp); |
120 | 0 | } |
121 | | |
122 | 0 | simdjson_inline void mini_formatter::number(double x) { |
123 | 0 | char number_buffer[24]; |
124 | 0 | // Currently, passing the nullptr to the second argument is |
125 | 0 | // safe because our implementation does not check the second |
126 | 0 | // argument. |
127 | 0 | char *newp = internal::to_chars(number_buffer, nullptr, x); |
128 | 0 | buffer.insert(buffer.end(), number_buffer, newp); |
129 | 0 | } |
130 | | |
131 | 0 | simdjson_inline void mini_formatter::start_array() { one_char('['); } |
132 | 0 | simdjson_inline void mini_formatter::end_array() { one_char(']'); } |
133 | 0 | simdjson_inline void mini_formatter::start_object() { one_char('{'); } |
134 | 0 | simdjson_inline void mini_formatter::end_object() { one_char('}'); } |
135 | 0 | simdjson_inline void mini_formatter::comma() { one_char(','); } |
136 | | |
137 | | |
138 | 0 | simdjson_inline void mini_formatter::true_atom() { |
139 | 0 | const char * s = "true"; |
140 | 0 | buffer.insert(buffer.end(), s, s + 4); |
141 | 0 | } |
142 | 0 | simdjson_inline void mini_formatter::false_atom() { |
143 | 0 | const char * s = "false"; |
144 | 0 | buffer.insert(buffer.end(), s, s + 5); |
145 | 0 | } |
146 | 0 | simdjson_inline void mini_formatter::null_atom() { |
147 | 0 | const char * s = "null"; |
148 | 0 | buffer.insert(buffer.end(), s, s + 4); |
149 | 0 | } |
150 | 0 | simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(c); } |
151 | 0 | simdjson_inline void mini_formatter::key(std::string_view unescaped) { |
152 | 0 | string(unescaped); |
153 | 0 | one_char(':'); |
154 | 0 | } |
155 | 0 | simdjson_inline void mini_formatter::string(std::string_view unescaped) { |
156 | 0 | one_char('\"'); |
157 | 0 | size_t i = 0; |
158 | 0 | // Fast path for the case where we have no control character, no ", and no backslash. |
159 | 0 | // This should include most keys. |
160 | 0 | // |
161 | 0 | // We would like to use 'bool' but some compilers take offense to bitwise operation |
162 | 0 | // with bool types. |
163 | 0 | constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
164 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
165 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
166 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, |
167 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
168 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
169 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
170 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
171 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
172 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
173 | 0 | for(;i + 8 <= unescaped.length(); i += 8) { |
174 | 0 | // Poor's man vectorization. This could get much faster if we used SIMD. |
175 | 0 | // |
176 | 0 | // It is not the case that replacing '|' with '||' would be neutral performance-wise. |
177 | 0 | if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])] |
178 | 0 | | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])] |
179 | 0 | | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])] |
180 | 0 | | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])] |
181 | 0 | ) { break; } |
182 | 0 | } |
183 | 0 | for(;i < unescaped.length(); i++) { |
184 | 0 | if(needs_escaping[uint8_t(unescaped[i])]) { break; } |
185 | 0 | } |
186 | 0 | // The following is also possible and omits a 256-byte table, but it is slower: |
187 | 0 | // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) |
188 | 0 | // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} |
189 | 0 |
|
190 | 0 | // At least for long strings, the following should be fast. We could |
191 | 0 | // do better by integrating the checks and the insertion. |
192 | 0 | buffer.insert(buffer.end(), unescaped.data(), unescaped.data() + i); |
193 | 0 | // We caught a control character if we enter this loop (slow). |
194 | 0 | // Note that we are do not restart from the beginning, but rather we continue |
195 | 0 | // from the point where we encountered something that requires escaping. |
196 | 0 | for (; i < unescaped.length(); i++) { |
197 | 0 | switch (unescaped[i]) { |
198 | 0 | case '\"': |
199 | 0 | { |
200 | 0 | const char * s = "\\\""; |
201 | 0 | buffer.insert(buffer.end(), s, s + 2); |
202 | 0 | } |
203 | 0 | break; |
204 | 0 | case '\\': |
205 | 0 | { |
206 | 0 | const char * s = "\\\\"; |
207 | 0 | buffer.insert(buffer.end(), s, s + 2); |
208 | 0 | } |
209 | 0 | break; |
210 | 0 | default: |
211 | 0 | if (uint8_t(unescaped[i]) <= 0x1F) { |
212 | 0 | // If packed, this uses 8 * 32 bytes. |
213 | 0 | // Note that we expect most compilers to embed this code in the data |
214 | 0 | // section. |
215 | 0 | constexpr static escape_sequence escaped[32] = { |
216 | 0 | {6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"}, |
217 | 0 | {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"}, |
218 | 0 | {2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"}, |
219 | 0 | {2, "\\f"}, {2, "\\r"}, {6, "\\u000e"}, {6, "\\u000f"}, |
220 | 0 | {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"}, |
221 | 0 | {6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"}, |
222 | 0 | {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"}, |
223 | 0 | {6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}}; |
224 | 0 | auto u = escaped[uint8_t(unescaped[i])]; |
225 | 0 | buffer.insert(buffer.end(), u.string, u.string + u.length); |
226 | 0 | } else { |
227 | 0 | one_char(unescaped[i]); |
228 | 0 | } |
229 | 0 | } // switch |
230 | 0 | } // for |
231 | 0 | one_char('\"'); |
232 | 0 | } |
233 | | |
234 | 0 | inline void mini_formatter::clear() { |
235 | 0 | buffer.clear(); |
236 | 0 | } |
237 | | |
238 | 0 | simdjson_inline std::string_view mini_formatter::str() const { |
239 | 0 | return std::string_view(buffer.data(), buffer.size()); |
240 | 0 | } |
241 | | |
242 | | |
243 | | /*** |
244 | | * String building code. |
245 | | **/ |
246 | | |
247 | | template <class serializer> |
248 | 0 | inline void string_builder<serializer>::append(simdjson::dom::element value) { |
249 | 0 | // using tape_type = simdjson::internal::tape_type; |
250 | 0 | size_t depth = 0; |
251 | 0 | constexpr size_t MAX_DEPTH = 16; |
252 | 0 | bool is_object[MAX_DEPTH]; |
253 | 0 | is_object[0] = false; |
254 | 0 | bool after_value = false; |
255 | 0 |
|
256 | 0 | internal::tape_ref iter(value.tape); |
257 | 0 | do { |
258 | 0 | // print commas after each value |
259 | 0 | if (after_value) { |
260 | 0 | format.comma(); |
261 | 0 | } |
262 | 0 | // If we are in an object, print the next key and :, and skip to the next |
263 | 0 | // value. |
264 | 0 | if (is_object[depth]) { |
265 | 0 | format.key(iter.get_string_view()); |
266 | 0 | iter.json_index++; |
267 | 0 | } |
268 | 0 | switch (iter.tape_ref_type()) { |
269 | 0 |
|
270 | 0 | // Arrays |
271 | 0 | case tape_type::START_ARRAY: { |
272 | 0 | // If we're too deep, we need to recurse to go deeper. |
273 | 0 | depth++; |
274 | 0 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
275 | 0 | append(simdjson::dom::array(iter)); |
276 | 0 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the ] |
277 | 0 | depth--; |
278 | 0 | break; |
279 | 0 | } |
280 | 0 |
|
281 | 0 | // Output start [ |
282 | 0 | format.start_array(); |
283 | 0 | iter.json_index++; |
284 | 0 |
|
285 | 0 | // Handle empty [] (we don't want to come back around and print commas) |
286 | 0 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
287 | 0 | format.end_array(); |
288 | 0 | depth--; |
289 | 0 | break; |
290 | 0 | } |
291 | 0 |
|
292 | 0 | is_object[depth] = false; |
293 | 0 | after_value = false; |
294 | 0 | continue; |
295 | 0 | } |
296 | 0 |
|
297 | 0 | // Objects |
298 | 0 | case tape_type::START_OBJECT: { |
299 | 0 | // If we're too deep, we need to recurse to go deeper. |
300 | 0 | depth++; |
301 | 0 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
302 | 0 | append(simdjson::dom::object(iter)); |
303 | 0 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the } |
304 | 0 | depth--; |
305 | 0 | break; |
306 | 0 | } |
307 | 0 |
|
308 | 0 | // Output start { |
309 | 0 | format.start_object(); |
310 | 0 | iter.json_index++; |
311 | 0 |
|
312 | 0 | // Handle empty {} (we don't want to come back around and print commas) |
313 | 0 | if (iter.tape_ref_type() == tape_type::END_OBJECT) { |
314 | 0 | format.end_object(); |
315 | 0 | depth--; |
316 | 0 | break; |
317 | 0 | } |
318 | 0 |
|
319 | 0 | is_object[depth] = true; |
320 | 0 | after_value = false; |
321 | 0 | continue; |
322 | 0 | } |
323 | 0 |
|
324 | 0 | // Scalars |
325 | 0 | case tape_type::STRING: |
326 | 0 | format.string(iter.get_string_view()); |
327 | 0 | break; |
328 | 0 | case tape_type::INT64: |
329 | 0 | format.number(iter.next_tape_value<int64_t>()); |
330 | 0 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
331 | 0 | // extra |
332 | 0 | break; |
333 | 0 | case tape_type::UINT64: |
334 | 0 | format.number(iter.next_tape_value<uint64_t>()); |
335 | 0 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
336 | 0 | // extra |
337 | 0 | break; |
338 | 0 | case tape_type::DOUBLE: |
339 | 0 | format.number(iter.next_tape_value<double>()); |
340 | 0 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
341 | 0 | // extra |
342 | 0 | break; |
343 | 0 | case tape_type::TRUE_VALUE: |
344 | 0 | format.true_atom(); |
345 | 0 | break; |
346 | 0 | case tape_type::FALSE_VALUE: |
347 | 0 | format.false_atom(); |
348 | 0 | break; |
349 | 0 | case tape_type::NULL_VALUE: |
350 | 0 | format.null_atom(); |
351 | 0 | break; |
352 | 0 |
|
353 | 0 | // These are impossible |
354 | 0 | case tape_type::END_ARRAY: |
355 | 0 | case tape_type::END_OBJECT: |
356 | 0 | case tape_type::ROOT: |
357 | 0 | SIMDJSON_UNREACHABLE(); |
358 | 0 | } |
359 | 0 | iter.json_index++; |
360 | 0 | after_value = true; |
361 | 0 |
|
362 | 0 | // Handle multiple ends in a row |
363 | 0 | while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || |
364 | 0 | iter.tape_ref_type() == tape_type::END_OBJECT)) { |
365 | 0 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
366 | 0 | format.end_array(); |
367 | 0 | } else { |
368 | 0 | format.end_object(); |
369 | 0 | } |
370 | 0 | depth--; |
371 | 0 | iter.json_index++; |
372 | 0 | } |
373 | 0 |
|
374 | 0 | // Stop when we're at depth 0 |
375 | 0 | } while (depth != 0); |
376 | 0 | } |
377 | | |
378 | | template <class serializer> |
379 | 0 | inline void string_builder<serializer>::append(simdjson::dom::object value) { |
380 | 0 | format.start_object(); |
381 | 0 | auto pair = value.begin(); |
382 | 0 | auto end = value.end(); |
383 | 0 | if (pair != end) { |
384 | 0 | append(*pair); |
385 | 0 | for (++pair; pair != end; ++pair) { |
386 | 0 | format.comma(); |
387 | 0 | append(*pair); |
388 | 0 | } |
389 | 0 | } |
390 | 0 | format.end_object(); |
391 | 0 | } |
392 | | |
393 | | template <class serializer> |
394 | 0 | inline void string_builder<serializer>::append(simdjson::dom::array value) { |
395 | 0 | format.start_array(); |
396 | 0 | auto iter = value.begin(); |
397 | 0 | auto end = value.end(); |
398 | 0 | if (iter != end) { |
399 | 0 | append(*iter); |
400 | 0 | for (++iter; iter != end; ++iter) { |
401 | 0 | format.comma(); |
402 | 0 | append(*iter); |
403 | 0 | } |
404 | 0 | } |
405 | 0 | format.end_array(); |
406 | 0 | } |
407 | | |
408 | | template <class serializer> |
409 | 0 | simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) { |
410 | 0 | format.key(kv.key); |
411 | 0 | append(kv.value); |
412 | 0 | } |
413 | | |
414 | | template <class serializer> |
415 | | simdjson_inline void string_builder<serializer>::clear() { |
416 | | format.clear(); |
417 | | } |
418 | | |
419 | | template <class serializer> |
420 | 0 | simdjson_inline std::string_view string_builder<serializer>::str() const { |
421 | 0 | return format.str(); |
422 | 0 | } |
423 | | |
424 | | |
425 | | } // namespace internal |
426 | | } // namespace simdjson |
427 | | |
428 | | #endif |