Coverage Report

Created: 2024-02-11 06:37

/src/simdjson/src/fallback.cpp
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDJSON_SRC_FALLBACK_CPP
2
#define SIMDJSON_SRC_FALLBACK_CPP
3
4
#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5
#include <base.h>
6
#endif // SIMDJSON_CONDITIONAL_INCLUDE
7
8
#include <simdjson/fallback.h>
9
#include <simdjson/fallback/implementation.h>
10
11
#include <simdjson/fallback/begin.h>
12
#include <generic/stage1/find_next_document_index.h>
13
#include <generic/stage2/stringparsing.h>
14
#include <generic/stage2/logger.h>
15
#include <generic/stage2/json_iterator.h>
16
#include <generic/stage2/tape_writer.h>
17
#include <generic/stage2/tape_builder.h>
18
19
//
20
// Stage 1
21
//
22
23
namespace simdjson {
24
namespace fallback {
25
26
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
27
  size_t capacity,
28
  size_t max_depth,
29
  std::unique_ptr<internal::dom_parser_implementation>& dst
30
11.4k
) const noexcept {
31
11.4k
  dst.reset( new (std::nothrow) SIMDJSON_IMPLEMENTATION::dom_parser_implementation() );
32
11.4k
  if (!dst) { return MEMALLOC; }
33
11.4k
  if (auto err = dst->set_capacity(capacity))
34
0
    return err;
35
11.4k
  if (auto err = dst->set_max_depth(max_depth))
36
0
    return err;
37
11.4k
  return SUCCESS;
38
11.4k
}
39
40
namespace {
41
namespace stage1 {
42
43
class structural_scanner {
44
public:
45
46
simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial)
47
  : buf{_parser.buf},
48
    next_structural_index{_parser.structural_indexes.get()},
49
    parser{_parser},
50
    len{static_cast<uint32_t>(_parser.len)},
51
11.4k
    partial{_partial} {
52
11.4k
}
53
54
11.2M
simdjson_inline void add_structural() {
55
11.2M
  *next_structural_index = idx;
56
11.2M
  next_structural_index++;
57
11.2M
}
58
59
83.3k
simdjson_inline bool is_continuation(uint8_t c) {
60
83.3k
  return (c & 0xc0) == 0x80;
61
83.3k
}
62
63
95.9k
simdjson_inline void validate_utf8_character() {
64
  // Continuation
65
95.9k
  if (simdjson_unlikely((buf[idx] & 0x40) == 0)) {
66
    // extra continuation
67
20.8k
    error = UTF8_ERROR;
68
20.8k
    idx++;
69
20.8k
    return;
70
20.8k
  }
71
72
  // 2-byte
73
75.1k
  if ((buf[idx] & 0x20) == 0) {
74
    // missing continuation
75
19.5k
    if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
76
17.5k
      if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
77
17.5k
      error = UTF8_ERROR;
78
17.5k
      idx++;
79
17.5k
      return;
80
17.5k
    }
81
    // overlong: 1100000_ 10______
82
2.00k
    if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
83
2.00k
    idx += 2;
84
2.00k
    return;
85
19.5k
  }
86
87
  // 3-byte
88
55.6k
  if ((buf[idx] & 0x10) == 0) {
89
    // missing continuation
90
6.95k
    if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
91
4.28k
      if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
92
4.28k
      error = UTF8_ERROR;
93
4.28k
      idx++;
94
4.28k
      return;
95
4.28k
    }
96
    // overlong: 11100000 100_____ ________
97
2.66k
    if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
98
    // surrogates: U+D800-U+DFFF 11101101 101_____
99
2.66k
    if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
100
2.66k
    idx += 3;
101
2.66k
    return;
102
6.95k
  }
103
104
  // 4-byte
105
  // missing continuation
106
48.6k
  if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
107
47.2k
    if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
108
47.2k
    error = UTF8_ERROR;
109
47.2k
    idx++;
110
47.2k
    return;
111
47.2k
  }
112
  // overlong: 11110000 1000____ ________ ________
113
1.42k
  if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
114
  // too large: > U+10FFFF:
115
  // 11110100 (1001|101_)____
116
  // 1111(1___|011_|0101) 10______
117
  // also includes 5, 6, 7 and 8 byte characters:
118
  // 11111___
119
1.42k
  if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; }
120
1.42k
  if (buf[idx] >= 0xf5) { error = UTF8_ERROR; }
121
1.42k
  idx += 4;
122
1.42k
}
123
124
// Returns true if the string is unclosed.
125
101k
simdjson_inline bool validate_string() {
126
101k
  idx++; // skip first quote
127
17.4M
  while (idx < len && buf[idx] != '"') {
128
17.3M
    if (buf[idx] == '\\') {
129
141k
      idx += 2;
130
17.2M
    } else if (simdjson_unlikely(buf[idx] & 0x80)) {
131
95.9k
      validate_utf8_character();
132
17.1M
    } else {
133
17.1M
      if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
134
17.1M
      idx++;
135
17.1M
    }
136
17.3M
  }
137
101k
  if (idx >= len) { return true; }
138
100k
  return false;
139
101k
}
140
141
53.5M
simdjson_inline bool is_whitespace_or_operator(uint8_t c) {
142
53.5M
  switch (c) {
143
3.34M
    case '{': case '}': case '[': case ']': case ',': case ':':
144
3.35M
    case ' ': case '\r': case '\n': case '\t':
145
3.35M
      return true;
146
50.1M
    default:
147
50.1M
      return false;
148
53.5M
  }
149
53.5M
}
150
151
//
152
// Parse the entire input in STEP_SIZE-byte chunks.
153
//
154
11.4k
simdjson_inline error_code scan() {
155
11.4k
  bool unclosed_string = false;
156
11.2M
  for (;idx<len;idx++) {
157
11.2M
    switch (buf[idx]) {
158
      // String
159
101k
      case '"':
160
101k
        add_structural();
161
101k
        unclosed_string |= validate_string();
162
101k
        break;
163
      // Operator
164
7.77M
      case '{': case '}': case '[': case ']': case ',': case ':':
165
7.77M
        add_structural();
166
7.77M
        break;
167
      // Whitespace
168
39.2k
      case ' ': case '\r': case '\n': case '\t':
169
39.2k
        break;
170
      // Primitive or invalid character (invalid characters will be checked in stage 2)
171
3.35M
      default:
172
        // Anything else, add the structural and go until we find the next one
173
3.35M
        add_structural();
174
53.5M
        while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
175
50.1M
          idx++;
176
50.1M
        };
177
3.35M
        break;
178
11.2M
    }
179
11.2M
  }
180
  // We pad beyond.
181
  // https://github.com/simdjson/simdjson/issues/906
182
  // See json_structural_indexer.h for an explanation.
183
11.4k
  *next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
184
11.4k
  next_structural_index[1] = len;
185
11.4k
  next_structural_index[2] = 0;
186
11.4k
  parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
187
11.4k
  if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
188
11.4k
  parser.next_structural_index = 0;
189
11.4k
  if (partial == stage1_mode::streaming_partial) {
190
0
    if(unclosed_string) {
191
0
      parser.n_structural_indexes--;
192
0
      if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
193
0
    }
194
    // We truncate the input to the end of the last complete document (or zero).
195
0
    auto new_structural_indexes = find_next_document_index(parser);
196
0
    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
197
0
      if(parser.structural_indexes[0] == 0) {
198
        // If the buffer is partial and we started at index 0 but the document is
199
        // incomplete, it's too big to parse.
200
0
        return CAPACITY;
201
0
      } else {
202
        // It is possible that the document could be parsed, we just had a lot
203
        // of white space.
204
0
        parser.n_structural_indexes = 0;
205
0
        return EMPTY;
206
0
      }
207
0
    }
208
0
    parser.n_structural_indexes = new_structural_indexes;
209
11.4k
  } else if(partial == stage1_mode::streaming_final) {
210
0
    if(unclosed_string) { parser.n_structural_indexes--; }
211
    // We truncate the input to the end of the last complete document (or zero).
212
    // Because partial == stage1_mode::streaming_final, it means that we may
213
    // silently ignore trailing garbage. Though it sounds bad, we do it
214
    // deliberately because many people who have streams of JSON documents
215
    // will truncate them for processing. E.g., imagine that you are uncompressing
216
    // the data from a size file or receiving it in chunks from the network. You
217
    // may not know where exactly the last document will be. Meanwhile the
218
    // document_stream instances allow people to know the JSON documents they are
219
    // parsing (see the iterator.source() method).
220
0
    parser.n_structural_indexes = find_next_document_index(parser);
221
    // We store the initial n_structural_indexes so that the client can see
222
    // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
223
    // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
224
    // otherwise, it will copy some prior index.
225
0
    parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
226
    // This next line is critical, do not change it unless you understand what you are
227
    // doing.
228
0
    parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
229
0
    if (parser.n_structural_indexes == 0) { return EMPTY; }
230
11.4k
  } else if(unclosed_string) { error = UNCLOSED_STRING; }
231
11.4k
  return error;
232
11.4k
}
233
234
private:
235
  const uint8_t *buf;
236
  uint32_t *next_structural_index;
237
  dom_parser_implementation &parser;
238
  uint32_t len;
239
  uint32_t idx{0};
240
  error_code error{SUCCESS};
241
  stage1_mode partial;
242
}; // structural_scanner
243
244
} // namespace stage1
245
} // unnamed namespace
246
247
11.4k
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept {
248
11.4k
  this->buf = _buf;
249
11.4k
  this->len = _len;
250
11.4k
  stage1::structural_scanner scanner(*this, partial);
251
11.4k
  return scanner.scan();
252
11.4k
}
253
254
// big table for the minifier
255
static uint8_t jump_table[256 * 3] = {
256
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
257
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
258
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
259
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
260
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
261
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
262
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
263
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
264
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
265
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
266
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
267
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
268
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
269
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
270
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
271
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
272
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
273
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
274
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
275
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
276
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
277
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
278
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
279
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
280
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
281
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
282
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
283
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
284
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
285
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
286
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
287
};
288
289
688
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
290
688
  size_t i = 0, pos = 0;
291
688
  uint8_t quote = 0;
292
688
  uint8_t nonescape = 1;
293
294
21.8M
  while (i < len) {
295
21.8M
    unsigned char c = buf[i];
296
21.8M
    uint8_t *meta = jump_table + 3 * c;
297
298
21.8M
    quote = quote ^ (meta[0] & nonescape);
299
21.8M
    dst[pos] = c;
300
21.8M
    pos += meta[2] | quote;
301
302
21.8M
    i += 1;
303
21.8M
    nonescape = uint8_t(~nonescape) | (meta[1]);
304
21.8M
  }
305
688
  dst_len = pos; // we intentionally do not work with a reference
306
  // for fear of aliasing
307
688
  return quote ? UNCLOSED_STRING : SUCCESS;
308
688
}
309
310
// credit: based on code from Google Fuchsia (Apache Licensed)
311
471
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
312
471
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
313
471
  uint64_t pos = 0;
314
471
  uint32_t code_point = 0;
315
1.54M
  while (pos < len) {
316
    // check of the next 8 bytes are ascii.
317
1.54M
    uint64_t next_pos = pos + 16;
318
1.54M
    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
319
1.54M
      uint64_t v1;
320
1.54M
      memcpy(&v1, data + pos, sizeof(uint64_t));
321
1.54M
      uint64_t v2;
322
1.54M
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
323
1.54M
      uint64_t v{v1 | v2};
324
1.54M
      if ((v & 0x8080808080808080) == 0) {
325
689k
        pos = next_pos;
326
689k
        continue;
327
689k
      }
328
1.54M
    }
329
855k
    unsigned char byte = data[pos];
330
855k
    if (byte < 0x80) {
331
721k
      pos++;
332
721k
      continue;
333
721k
    } else if ((byte & 0xe0) == 0xc0) {
334
63.2k
      next_pos = pos + 2;
335
63.2k
      if (next_pos > len) { return false; }
336
63.1k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
337
      // range check
338
63.1k
      code_point = (byte & 0x1f) << 6 | (data[pos + 1] & 0x3f);
339
63.1k
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
340
71.0k
    } else if ((byte & 0xf0) == 0xe0) {
341
34.8k
      next_pos = pos + 3;
342
34.8k
      if (next_pos > len) { return false; }
343
34.8k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
344
34.7k
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
345
      // range check
346
34.7k
      code_point = (byte & 0x0f) << 12 |
347
34.7k
                   (data[pos + 1] & 0x3f) << 6 |
348
34.7k
                   (data[pos + 2] & 0x3f);
349
34.7k
      if (code_point < 0x800 || 0xffff < code_point ||
350
34.7k
          (0xd7ff < code_point && code_point < 0xe000)) {
351
26
        return false;
352
26
      }
353
36.1k
    } else if ((byte & 0xf8) == 0xf0) { // 0b11110000
354
36.0k
      next_pos = pos + 4;
355
36.0k
      if (next_pos > len) { return false; }
356
36.0k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
357
36.0k
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
358
36.0k
      if ((data[pos + 3] & 0xc0) != 0x80) { return false; }
359
      // range check
360
36.0k
      code_point =
361
36.0k
          (byte & 0x07) << 18 | (data[pos + 1] & 0x3f) << 12 |
362
36.0k
          (data[pos + 2] & 0x3f) << 6 | (data[pos + 3] & 0x3f);
363
36.0k
      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
364
36.0k
    } else {
365
      // we may have a continuation
366
97
      return false;
367
97
    }
368
133k
    pos = next_pos;
369
133k
  }
370
142
  return true;
371
471
}
372
373
} // namespace fallback
374
} // namespace simdjson
375
376
//
377
// Stage 2
378
//
379
380
namespace simdjson {
381
namespace fallback {
382
383
10.7k
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
384
10.7k
  return stage2::tape_builder::parse_document<false>(*this, _doc);
385
10.7k
}
386
387
0
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
388
0
  return stage2::tape_builder::parse_document<true>(*this, _doc);
389
0
}
390
391
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
392
0
  return fallback::stringparsing::parse_string(src, dst, replacement_char);
393
0
}
394
395
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
396
0
  return fallback::stringparsing::parse_wobbly_string(src, dst);
397
0
}
398
399
11.4k
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
400
11.4k
  auto error = stage1(_buf, _len, stage1_mode::regular);
401
11.4k
  if (error) { return error; }
402
10.7k
  return stage2(_doc);
403
11.4k
}
404
405
} // namespace fallback
406
} // namespace simdjson
407
408
#include <simdjson/fallback/end.h>
409
410
#endif // SIMDJSON_SRC_FALLBACK_CPP