Coverage Report

Created: 2023-03-26 06:03

/src/simdjson/include/simdjson/dom/serialization-inl.h
Line
Count
Source (jump to first uncovered line)
1
2
#ifndef SIMDJSON_SERIALIZATION_INL_H
3
#define SIMDJSON_SERIALIZATION_INL_H
4
5
#include "simdjson/dom/serialization.h"
6
7
#include <cinttypes>
8
#include <type_traits>
9
10
namespace simdjson {
11
namespace dom {
12
0
inline bool parser::print_json(std::ostream &os) const noexcept {
13
0
  if (!valid) { return false; }
14
0
  simdjson::internal::string_builder<> sb;
15
0
  sb.append(doc.root());
16
0
  std::string_view answer = sb.str();
17
0
  os << answer;
18
0
  return true;
19
0
}
20
}
21
/***
22
 * Number utility functions
23
 **/
24
25
26
namespace {
27
/**@private
 * One pre-computed JSON escape sequence, such as \b or \u0001.
 *
 * `length` is the number of meaningful characters at the start of `string`.
 * The longest sequence (\uXXXX) takes 6 characters; the array is sized 7 so
 * that, together with the one-byte length, the structure is expected to
 * occupy 8 bytes with most compilers.
 **/
struct escape_sequence {
    uint8_t length;        // number of characters of `string` to emit
    const char string[7];  // escape text; only the first `length` chars are used
};
35
/**@private
 * Converts a signed 64-bit integer into its decimal character sequence.
 *
 * The characters (a leading '-' for negative values, then the digits) are
 * written starting at `output`; no null terminator is appended. The caller
 * is responsible for providing enough memory: at least 20 characters, which
 * is the length of "-9223372036854775808".
 *
 * Though various runtime libraries provide itoa functions, it is not part
 * of the C++ standard. The C++17 standard adds the to_chars functions which
 * would do as well, but we want to support C++11.
 *
 * @param output destination buffer with room for at least 20 characters
 * @param value  the integer to format
 * @return pointer one past the last character written
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  // Work on the magnitude as an unsigned value. Negating a signed integer is
  // unsafe in general (-INT64_MIN overflows), whereas conversion to unsigned
  // and unsigned subtraction are modular and therefore always well defined.
  uint64_t magnitude = static_cast<uint64_t>(value);
  if (value < 0) {
    *output++ = '-';
    magnitude = uint64_t(0) - magnitude;
  }
  // Digits come out least-significant first, so we fill a scratch buffer
  // from its last slot backwards and copy the used tail afterwards.
  // A magnitude of at most 2^63 needs no more than 19 digits.
  char scratch[20];
  const char *const scratch_end = scratch + sizeof(scratch);
  char *cursor = scratch + sizeof(scratch) - 1;
  // A faster approach is possible if we expect large integers: unroll the
  // loop (work in 100s, 1000s) and use some kind of memoization.
  while (magnitude >= 10) {
    *cursor-- = char('0' + (magnitude % 10));
    magnitude /= 10;
  }
  *cursor = char('0' + magnitude);
  const size_t len = size_t(scratch_end - cursor);
  std::memcpy(output, cursor, len);
  return output + len;
}
Unexecuted instantiation: fuzz_parser.cpp:simdjson::(anonymous namespace)::fast_itoa(char*, long)
Unexecuted instantiation: simdjson.cpp:simdjson::(anonymous namespace)::fast_itoa(char*, long)
77
/**@private
 * Converts an unsigned 64-bit integer into its decimal character sequence.
 *
 * The digits are written starting at `output`; no null terminator is
 * appended. The caller is responsible for providing enough memory: at least
 * 20 characters, which is the length of "18446744073709551615" (the largest
 * uint64_t value).
 *
 * Though various runtime libraries provide itoa functions, it is not part
 * of the C++ standard. The C++17 standard adds the to_chars functions which
 * would do as well, but we want to support C++11.
 *
 * @param output destination buffer with room for at least 20 characters
 * @param value  the integer to format
 * @return pointer one past the last character written
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // Digits come out least-significant first, so we fill a scratch buffer
  // from its last slot backwards and copy the used tail afterwards.
  char scratch[20];
  const char *const scratch_end = scratch + sizeof(scratch);
  char *cursor = scratch + sizeof(scratch) - 1;
  // A faster approach is possible if we expect large integers: unroll the
  // loop (work in 100s, 1000s) and use some kind of memoization.
  while (value >= 10) {
    *cursor-- = char('0' + (value % 10));
    value /= 10;
  }
  *cursor = char('0' + value);
  const size_t len = size_t(scratch_end - cursor);
  std::memcpy(output, cursor, len);
  return output + len;
}
Unexecuted instantiation: fuzz_parser.cpp:simdjson::(anonymous namespace)::fast_itoa(char*, unsigned long)
Unexecuted instantiation: simdjson.cpp:simdjson::(anonymous namespace)::fast_itoa(char*, unsigned long)
103
} // anonymous namespace
104
namespace internal {
105
106
/***
107
 * Minifier/formatter code.
108
 **/
109
110
0
/** Append the decimal text of the unsigned integer `x` to the buffer. */
simdjson_inline void mini_formatter::number(uint64_t x) {
  // Format into a small stack buffer first, then append the written range.
  char digits[24];
  char *const digits_end = fast_itoa(digits, x);
  buffer.insert(buffer.end(), digits, digits_end);
}
115
116
0
/** Append the decimal text of the signed integer `x` to the buffer. */
simdjson_inline void mini_formatter::number(int64_t x) {
  // Format into a small stack buffer first, then append the written range.
  char digits[24];
  char *const digits_end = fast_itoa(digits, x);
  buffer.insert(buffer.end(), digits, digits_end);
}
121
122
0
/** Append the text form of the floating-point value `x` to the buffer. */
simdjson_inline void mini_formatter::number(double x) {
  char text[24];
  // Currently, passing nullptr as the second argument is safe because our
  // to_chars implementation does not check that argument. The returned
  // pointer is used as the end of the range that was written.
  char *const text_end = internal::to_chars(text, nullptr, x);
  buffer.insert(buffer.end(), text, text_end);
}
130
131
0
// Structural characters: each of the following appends a single character.

/** Append '[' (start of an array). */
simdjson_inline void mini_formatter::start_array() {
  one_char('[');
}

/** Append ']' (end of an array). */
simdjson_inline void mini_formatter::end_array() {
  one_char(']');
}

/** Append '{' (start of an object). */
simdjson_inline void mini_formatter::start_object() {
  one_char('{');
}

/** Append '}' (end of an object). */
simdjson_inline void mini_formatter::end_object() {
  one_char('}');
}

/** Append ',' (separator between values or members). */
simdjson_inline void mini_formatter::comma() {
  one_char(',');
}
136
137
138
0
/** Append the four characters of the JSON literal true to the buffer. */
simdjson_inline void mini_formatter::true_atom() {
  const char literal[] = "true";
  // sizeof counts the terminating null, which must not be emitted.
  buffer.insert(buffer.end(), literal, literal + (sizeof(literal) - 1));
}
142
0
/** Append the five characters of the JSON literal false to the buffer. */
simdjson_inline void mini_formatter::false_atom() {
  const char literal[] = "false";
  // sizeof counts the terminating null, which must not be emitted.
  buffer.insert(buffer.end(), literal, literal + (sizeof(literal) - 1));
}
146
0
/** Append the four characters of the JSON literal null to the buffer. */
simdjson_inline void mini_formatter::null_atom() {
  const char literal[] = "null";
  // sizeof counts the terminating null, which must not be emitted.
  buffer.insert(buffer.end(), literal, literal + (sizeof(literal) - 1));
}
150
0
/** Append the single character `c` to the buffer. */
simdjson_inline void mini_formatter::one_char(char c) {
  buffer.push_back(c);
}
151
0
/**
 * Append an object key: the quoted, escaped form of `unescaped` (as
 * produced by string()) immediately followed by ':'.
 */
simdjson_inline void mini_formatter::key(std::string_view unescaped) {
  string(unescaped);
  one_char(':');
}
155
0
/**
 * Append `unescaped` to the buffer as a quoted JSON string.
 *
 * The double quote and the backslash are emitted as \" and \\, and bytes
 * 0x00-0x1F are emitted using the short escapes \b \t \n \f \r or a
 * \u00XX sequence. Every other byte is copied unchanged.
 */
simdjson_inline void mini_formatter::string(std::string_view unescaped) {
  one_char('\"');
  // Lookup table indexed by byte value: non-zero when the byte cannot be
  // copied verbatim (control characters, '"' and '\\').
  //
  // We would like to use 'bool' but some compilers take offense to bitwise
  // operations with bool types.
  constexpr static char needs_escaping[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F ('"' is 0x22)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x50-0x5F ('\\' is 0x5C)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60-0x6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70-0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90-0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0-0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0-0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0-0xCF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0-0xDF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0-0xEF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0-0xFF
  };
  const size_t length = unescaped.length();
  size_t i = 0;
  // Fast path for the case where we have no control character, no ", and no
  // backslash. This should include most keys.
  //
  // Poor man's vectorization: examine 8 bytes per iteration. This could get
  // much faster if we used SIMD.
  while (i + 8 <= length) {
    // It is not the case that replacing '|' with '||' would be neutral
    // performance-wise.
    if (needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])]
      | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])]
      | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])]
      | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])]
      ) { break; }
    i += 8;
  }
  // Finish byte by byte: pins down the exact offending position inside the
  // 8-byte group found above, or scans the tail shorter than 8 bytes.
  while (i < length && !needs_escaping[uint8_t(unescaped[i])]) { i++; }
  // The following is also possible and omits a 256-byte table, but it is slower:
  // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F)
  //      && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {}

  // At least for long strings, the following should be fast. We could
  // do better by integrating the checks and the insertion.
  buffer.insert(buffer.end(), unescaped.data(), unescaped.data() + i);

  // Slow path, entered only if something requiring an escape was found.
  // Note that we do not restart from the beginning, but rather continue
  // from the point where we encountered something that requires escaping.
  for (; i < length; i++) {
    const char c = unescaped[i];
    if (c == '\"') {
      const char * s = "\\\"";
      buffer.insert(buffer.end(), s, s + 2);
    } else if (c == '\\') {
      const char * s = "\\\\";
      buffer.insert(buffer.end(), s, s + 2);
    } else if (uint8_t(c) <= 0x1F) {
      // If packed, this uses 8 * 32 bytes.
      // Note that we expect most compilers to embed this table in the data
      // section.
      constexpr static escape_sequence escaped[32] = {
        {6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"},
        {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"},
        {2, "\\b"},     {2, "\\t"},     {2, "\\n"},     {6, "\\u000b"},
        {2, "\\f"},     {2, "\\r"},     {6, "\\u000e"}, {6, "\\u000f"},
        {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"},
        {6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"},
        {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"},
        {6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}};
      const escape_sequence &sequence = escaped[uint8_t(c)];
      buffer.insert(buffer.end(), sequence.string, sequence.string + sequence.length);
    } else {
      one_char(c);
    }
  }
  one_char('\"');
}
233
234
0
inline void mini_formatter::clear() {
235
0
  buffer.clear();
236
0
}
237
238
0
/**
 * View of the text accumulated so far.
 *
 * The view points directly at the formatter's buffer (data() / size()), so
 * it does not own the characters.
 */
simdjson_inline std::string_view mini_formatter::str() const {
  const char *const first = buffer.data();
  return std::string_view(first, buffer.size());
}
241
242
243
/***
244
 * String building code.
245
 **/
246
247
template <class serializer>
248
0
inline void string_builder<serializer>::append(simdjson::dom::element value) {
249
0
  // using tape_type = simdjson::internal::tape_type;
250
0
  size_t depth = 0;
251
0
  constexpr size_t MAX_DEPTH = 16;
252
0
  bool is_object[MAX_DEPTH];
253
0
  is_object[0] = false;
254
0
  bool after_value = false;
255
0
256
0
  internal::tape_ref iter(value.tape);
257
0
  do {
258
0
    // print commas after each value
259
0
    if (after_value) {
260
0
      format.comma();
261
0
    }
262
0
    // If we are in an object, print the next key and :, and skip to the next
263
0
    // value.
264
0
    if (is_object[depth]) {
265
0
      format.key(iter.get_string_view());
266
0
      iter.json_index++;
267
0
    }
268
0
    switch (iter.tape_ref_type()) {
269
0
270
0
    // Arrays
271
0
    case tape_type::START_ARRAY: {
272
0
      // If we're too deep, we need to recurse to go deeper.
273
0
      depth++;
274
0
      if (simdjson_unlikely(depth >= MAX_DEPTH)) {
275
0
        append(simdjson::dom::array(iter));
276
0
        iter.json_index = iter.matching_brace_index() - 1; // Jump to the ]
277
0
        depth--;
278
0
        break;
279
0
      }
280
0
281
0
      // Output start [
282
0
      format.start_array();
283
0
      iter.json_index++;
284
0
285
0
      // Handle empty [] (we don't want to come back around and print commas)
286
0
      if (iter.tape_ref_type() == tape_type::END_ARRAY) {
287
0
        format.end_array();
288
0
        depth--;
289
0
        break;
290
0
      }
291
0
292
0
      is_object[depth] = false;
293
0
      after_value = false;
294
0
      continue;
295
0
    }
296
0
297
0
    // Objects
298
0
    case tape_type::START_OBJECT: {
299
0
      // If we're too deep, we need to recurse to go deeper.
300
0
      depth++;
301
0
      if (simdjson_unlikely(depth >= MAX_DEPTH)) {
302
0
        append(simdjson::dom::object(iter));
303
0
        iter.json_index = iter.matching_brace_index() - 1; // Jump to the }
304
0
        depth--;
305
0
        break;
306
0
      }
307
0
308
0
      // Output start {
309
0
      format.start_object();
310
0
      iter.json_index++;
311
0
312
0
      // Handle empty {} (we don't want to come back around and print commas)
313
0
      if (iter.tape_ref_type() == tape_type::END_OBJECT) {
314
0
        format.end_object();
315
0
        depth--;
316
0
        break;
317
0
      }
318
0
319
0
      is_object[depth] = true;
320
0
      after_value = false;
321
0
      continue;
322
0
    }
323
0
324
0
    // Scalars
325
0
    case tape_type::STRING:
326
0
      format.string(iter.get_string_view());
327
0
      break;
328
0
    case tape_type::INT64:
329
0
      format.number(iter.next_tape_value<int64_t>());
330
0
      iter.json_index++; // numbers take up 2 spots, so we need to increment
331
0
                         // extra
332
0
      break;
333
0
    case tape_type::UINT64:
334
0
      format.number(iter.next_tape_value<uint64_t>());
335
0
      iter.json_index++; // numbers take up 2 spots, so we need to increment
336
0
                         // extra
337
0
      break;
338
0
    case tape_type::DOUBLE:
339
0
      format.number(iter.next_tape_value<double>());
340
0
      iter.json_index++; // numbers take up 2 spots, so we need to increment
341
0
                         // extra
342
0
      break;
343
0
    case tape_type::TRUE_VALUE:
344
0
      format.true_atom();
345
0
      break;
346
0
    case tape_type::FALSE_VALUE:
347
0
      format.false_atom();
348
0
      break;
349
0
    case tape_type::NULL_VALUE:
350
0
      format.null_atom();
351
0
      break;
352
0
353
0
    // These are impossible
354
0
    case tape_type::END_ARRAY:
355
0
    case tape_type::END_OBJECT:
356
0
    case tape_type::ROOT:
357
0
      SIMDJSON_UNREACHABLE();
358
0
    }
359
0
    iter.json_index++;
360
0
    after_value = true;
361
0
362
0
    // Handle multiple ends in a row
363
0
    while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY ||
364
0
                          iter.tape_ref_type() == tape_type::END_OBJECT)) {
365
0
      if (iter.tape_ref_type() == tape_type::END_ARRAY) {
366
0
        format.end_array();
367
0
      } else {
368
0
        format.end_object();
369
0
      }
370
0
      depth--;
371
0
      iter.json_index++;
372
0
    }
373
0
374
0
    // Stop when we're at depth 0
375
0
  } while (depth != 0);
376
0
}
377
378
template <class serializer>
379
0
inline void string_builder<serializer>::append(simdjson::dom::object value) {
380
0
  format.start_object();
381
0
  auto pair = value.begin();
382
0
  auto end = value.end();
383
0
  if (pair != end) {
384
0
    append(*pair);
385
0
    for (++pair; pair != end; ++pair) {
386
0
      format.comma();
387
0
      append(*pair);
388
0
    }
389
0
  }
390
0
  format.end_object();
391
0
}
392
393
template <class serializer>
394
0
inline void string_builder<serializer>::append(simdjson::dom::array value) {
395
0
  format.start_array();
396
0
  auto iter = value.begin();
397
0
  auto end = value.end();
398
0
  if (iter != end) {
399
0
    append(*iter);
400
0
    for (++iter; iter != end; ++iter) {
401
0
      format.comma();
402
0
      append(*iter);
403
0
    }
404
0
  }
405
0
  format.end_array();
406
0
}
407
408
template <class serializer>
409
0
simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) {
410
0
  format.key(kv.key);
411
0
  append(kv.value);
412
0
}
413
414
template <class serializer>
415
simdjson_inline void string_builder<serializer>::clear() {
416
  format.clear();
417
}
418
419
template <class serializer>
420
0
simdjson_inline std::string_view string_builder<serializer>::str() const {
421
0
  return format.str();
422
0
}
423
424
425
} // namespace internal
426
} // namespace simdjson
427
428
#endif