Coverage Report

Created: 2023-11-19 06:24

/src/simdjson/src/fallback.cpp
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDJSON_SRC_FALLBACK_CPP
2
#define SIMDJSON_SRC_FALLBACK_CPP
3
4
#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5
#include <base.h>
6
#endif // SIMDJSON_CONDITIONAL_INCLUDE
7
8
#include <simdjson/fallback.h>
9
#include <simdjson/fallback/implementation.h>
10
11
#include <simdjson/fallback/begin.h>
12
#include <generic/stage1/find_next_document_index.h>
13
#include <generic/stage2/stringparsing.h>
14
#include <generic/stage2/logger.h>
15
#include <generic/stage2/json_iterator.h>
16
#include <generic/stage2/tape_writer.h>
17
#include <generic/stage2/tape_builder.h>
18
19
//
20
// Stage 1
21
//
22
23
namespace simdjson {
24
namespace fallback {
25
26
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
27
  size_t capacity,
28
  size_t max_depth,
29
  std::unique_ptr<internal::dom_parser_implementation>& dst
30
11.4k
) const noexcept {
31
11.4k
  dst.reset( new (std::nothrow) SIMDJSON_IMPLEMENTATION::dom_parser_implementation() );
32
11.4k
  if (!dst) { return MEMALLOC; }
33
11.4k
  if (auto err = dst->set_capacity(capacity))
34
0
    return err;
35
11.4k
  if (auto err = dst->set_max_depth(max_depth))
36
0
    return err;
37
11.4k
  return SUCCESS;
38
11.4k
}
39
40
namespace {
41
namespace stage1 {
42
43
class structural_scanner {
44
public:
45
46
simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial)
47
  : buf{_parser.buf},
48
    next_structural_index{_parser.structural_indexes.get()},
49
    parser{_parser},
50
    len{static_cast<uint32_t>(_parser.len)},
51
11.4k
    partial{_partial} {
52
11.4k
}
53
54
13.7M
simdjson_inline void add_structural() {
55
13.7M
  *next_structural_index = idx;
56
13.7M
  next_structural_index++;
57
13.7M
}
58
59
50.7k
simdjson_inline bool is_continuation(uint8_t c) {
60
50.7k
  return (c & 0xc0) == 0x80;
61
50.7k
}
62
63
78.3k
simdjson_inline void validate_utf8_character() {
64
  // Continuation
65
78.3k
  if (simdjson_unlikely((buf[idx] & 0x40) == 0)) {
66
    // extra continuation
67
35.0k
    error = UTF8_ERROR;
68
35.0k
    idx++;
69
35.0k
    return;
70
35.0k
  }
71
72
  // 2-byte
73
43.3k
  if ((buf[idx] & 0x20) == 0) {
74
    // missing continuation
75
7.24k
    if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
76
6.10k
      if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
77
6.10k
      error = UTF8_ERROR;
78
6.10k
      idx++;
79
6.10k
      return;
80
6.10k
    }
81
    // overlong: 1100000_ 10______
82
1.13k
    if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
83
1.13k
    idx += 2;
84
1.13k
    return;
85
7.24k
  }
86
87
  // 3-byte
88
36.1k
  if ((buf[idx] & 0x10) == 0) {
89
    // missing continuation
90
9.47k
    if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
91
7.14k
      if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
92
7.14k
      error = UTF8_ERROR;
93
7.14k
      idx++;
94
7.14k
      return;
95
7.14k
    }
96
    // overlong: 11100000 100_____ ________
97
2.33k
    if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
98
    // surrogates: U+D800-U+DFFF 11101101 101_____
99
2.33k
    if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
100
2.33k
    idx += 3;
101
2.33k
    return;
102
9.47k
  }
103
104
  // 4-byte
105
  // missing continuation
106
26.6k
  if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
107
24.7k
    if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
108
24.7k
    error = UTF8_ERROR;
109
24.7k
    idx++;
110
24.7k
    return;
111
24.7k
  }
112
  // overlong: 11110000 1000____ ________ ________
113
1.92k
  if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
114
  // too large: > U+10FFFF:
115
  // 11110100 (1001|101_)____
116
  // 1111(1___|011_|0101) 10______
117
  // also includes 5, 6, 7 and 8 byte characters:
118
  // 11111___
119
1.92k
  if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; }
120
1.92k
  if (buf[idx] >= 0xf5) { error = UTF8_ERROR; }
121
1.92k
  idx += 4;
122
1.92k
}
123
124
// Returns true if the string is unclosed.
125
223k
simdjson_inline bool validate_string() {
126
223k
  idx++; // skip first quote
127
32.4M
  while (idx < len && buf[idx] != '"') {
128
32.2M
    if (buf[idx] == '\\') {
129
98.7k
      idx += 2;
130
32.1M
    } else if (simdjson_unlikely(buf[idx] & 0x80)) {
131
78.3k
      validate_utf8_character();
132
32.0M
    } else {
133
32.0M
      if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
134
32.0M
      idx++;
135
32.0M
    }
136
32.2M
  }
137
223k
  if (idx >= len) { return true; }
138
222k
  return false;
139
223k
}
140
141
76.7M
simdjson_inline bool is_whitespace_or_operator(uint8_t c) {
142
76.7M
  switch (c) {
143
4.35M
    case '{': case '}': case '[': case ']': case ',': case ':':
144
4.35M
    case ' ': case '\r': case '\n': case '\t':
145
4.35M
      return true;
146
72.4M
    default:
147
72.4M
      return false;
148
76.7M
  }
149
76.7M
}
150
151
//
152
// Parse the entire input in STEP_SIZE-byte chunks.
153
//
154
11.4k
simdjson_inline error_code scan() {
155
11.4k
  bool unclosed_string = false;
156
13.8M
  for (;idx<len;idx++) {
157
13.8M
    switch (buf[idx]) {
158
      // String
159
223k
      case '"':
160
223k
        add_structural();
161
223k
        unclosed_string |= validate_string();
162
223k
        break;
163
      // Operator
164
9.20M
      case '{': case '}': case '[': case ']': case ',': case ':':
165
9.20M
        add_structural();
166
9.20M
        break;
167
      // Whitespace
168
12.4k
      case ' ': case '\r': case '\n': case '\t':
169
12.4k
        break;
170
      // Primitive or invalid character (invalid characters will be checked in stage 2)
171
4.35M
      default:
172
        // Anything else, add the structural and go until we find the next one
173
4.35M
        add_structural();
174
76.8M
        while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
175
72.4M
          idx++;
176
72.4M
        };
177
4.35M
        break;
178
13.8M
    }
179
13.8M
  }
180
  // We pad beyond.
181
  // https://github.com/simdjson/simdjson/issues/906
182
  // See json_structural_indexer.h for an explanation.
183
11.4k
  *next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
184
11.4k
  next_structural_index[1] = len;
185
11.4k
  next_structural_index[2] = 0;
186
11.4k
  parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
187
11.4k
  if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
188
11.4k
  parser.next_structural_index = 0;
189
11.4k
  if (partial == stage1_mode::streaming_partial) {
190
0
    if(unclosed_string) {
191
0
      parser.n_structural_indexes--;
192
0
      if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
193
0
    }
194
    // We truncate the input to the end of the last complete document (or zero).
195
0
    auto new_structural_indexes = find_next_document_index(parser);
196
0
    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
197
0
      if(parser.structural_indexes[0] == 0) {
198
        // If the buffer is partial and we started at index 0 but the document is
199
        // incomplete, it's too big to parse.
200
0
        return CAPACITY;
201
0
      } else {
202
        // It is possible that the document could be parsed, we just had a lot
203
        // of white space.
204
0
        parser.n_structural_indexes = 0;
205
0
        return EMPTY;
206
0
      }
207
0
    }
208
0
    parser.n_structural_indexes = new_structural_indexes;
209
11.4k
  } else if(partial == stage1_mode::streaming_final) {
210
0
    if(unclosed_string) { parser.n_structural_indexes--; }
211
    // We truncate the input to the end of the last complete document (or zero).
212
    // Because partial == stage1_mode::streaming_final, it means that we may
213
    // silently ignore trailing garbage. Though it sounds bad, we do it
214
    // deliberately because many people who have streams of JSON documents
215
    // will truncate them for processing. E.g., imagine that you are uncompressing
216
    // the data from a size file or receiving it in chunks from the network. You
217
    // may not know where exactly the last document will be. Meanwhile the
218
    // document_stream instances allow people to know the JSON documents they are
219
    // parsing (see the iterator.source() method).
220
0
    parser.n_structural_indexes = find_next_document_index(parser);
221
    // We store the initial n_structural_indexes so that the client can see
222
    // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
223
    // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
224
    // otherwise, it will copy some prior index.
225
0
    parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
226
    // This next line is critical, do not change it unless you understand what you are
227
    // doing.
228
0
    parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
229
0
    if (parser.n_structural_indexes == 0) { return EMPTY; }
230
11.4k
  } else if(unclosed_string) { error = UNCLOSED_STRING; }
231
11.4k
  return error;
232
11.4k
}
233
234
private:
235
  const uint8_t *buf;
236
  uint32_t *next_structural_index;
237
  dom_parser_implementation &parser;
238
  uint32_t len;
239
  uint32_t idx{0};
240
  error_code error{SUCCESS};
241
  stage1_mode partial;
242
}; // structural_scanner
243
244
} // namespace stage1
245
} // unnamed namespace
246
247
11.4k
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept {
248
11.4k
  this->buf = _buf;
249
11.4k
  this->len = _len;
250
11.4k
  stage1::structural_scanner scanner(*this, partial);
251
11.4k
  return scanner.scan();
252
11.4k
}
253
254
// big table for the minifier
255
static uint8_t jump_table[256 * 3] = {
256
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
257
    1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
258
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
259
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
260
    1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
261
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
262
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
263
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
264
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
265
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
266
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
267
    1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
268
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
269
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
270
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
271
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
272
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
273
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
274
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
275
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
276
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
277
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
278
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
279
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
280
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
281
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
282
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
283
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
284
    1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
285
    1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
286
    0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
287
};
288
289
644
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
290
644
  size_t i = 0, pos = 0;
291
644
  uint8_t quote = 0;
292
644
  uint8_t nonescape = 1;
293
294
21.0M
  while (i < len) {
295
21.0M
    unsigned char c = buf[i];
296
21.0M
    uint8_t *meta = jump_table + 3 * c;
297
298
21.0M
    quote = quote ^ (meta[0] & nonescape);
299
21.0M
    dst[pos] = c;
300
21.0M
    pos += meta[2] | quote;
301
302
21.0M
    i += 1;
303
21.0M
    nonescape = uint8_t(~nonescape) | (meta[1]);
304
21.0M
  }
305
644
  dst_len = pos; // we intentionally do not work with a reference
306
  // for fear of aliasing
307
644
  return quote ? UNCLOSED_STRING : SUCCESS;
308
644
}
309
310
// credit: based on code from Google Fuchsia (Apache Licensed)
311
481
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
312
481
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
313
481
  uint64_t pos = 0;
314
481
  uint32_t code_point = 0;
315
1.49M
  while (pos < len) {
316
    // check of the next 8 bytes are ascii.
317
1.49M
    uint64_t next_pos = pos + 16;
318
1.49M
    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
319
1.49M
      uint64_t v1;
320
1.49M
      memcpy(&v1, data + pos, sizeof(uint64_t));
321
1.49M
      uint64_t v2;
322
1.49M
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
323
1.49M
      uint64_t v{v1 | v2};
324
1.49M
      if ((v & 0x8080808080808080) == 0) {
325
740k
        pos = next_pos;
326
740k
        continue;
327
740k
      }
328
1.49M
    }
329
752k
    unsigned char byte = data[pos];
330
752k
    if (byte < 0x80) {
331
636k
      pos++;
332
636k
      continue;
333
636k
    } else if ((byte & 0xe0) == 0xc0) {
334
58.9k
      next_pos = pos + 2;
335
58.9k
      if (next_pos > len) { return false; }
336
58.9k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
337
      // range check
338
58.8k
      code_point = (byte & 0x1f) << 6 | (data[pos + 1] & 0x3f);
339
58.8k
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
340
58.8k
    } else if ((byte & 0xf0) == 0xe0) {
341
24.8k
      next_pos = pos + 3;
342
24.8k
      if (next_pos > len) { return false; }
343
24.8k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
344
24.7k
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
345
      // range check
346
24.7k
      code_point = (byte & 0x0f) << 12 |
347
24.7k
                   (data[pos + 1] & 0x3f) << 6 |
348
24.7k
                   (data[pos + 2] & 0x3f);
349
24.7k
      if (code_point < 0x800 || 0xffff < code_point ||
350
24.7k
          (0xd7ff < code_point && code_point < 0xe000)) {
351
32
        return false;
352
32
      }
353
32.2k
    } else if ((byte & 0xf8) == 0xf0) { // 0b11110000
354
32.1k
      next_pos = pos + 4;
355
32.1k
      if (next_pos > len) { return false; }
356
32.1k
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
357
32.1k
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
358
32.1k
      if ((data[pos + 3] & 0xc0) != 0x80) { return false; }
359
      // range check
360
32.1k
      code_point =
361
32.1k
          (byte & 0x07) << 18 | (data[pos + 1] & 0x3f) << 12 |
362
32.1k
          (data[pos + 2] & 0x3f) << 6 | (data[pos + 3] & 0x3f);
363
32.1k
      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
364
32.1k
    } else {
365
      // we may have a continuation
366
80
      return false;
367
80
    }
368
115k
    pos = next_pos;
369
115k
  }
370
150
  return true;
371
481
}
372
373
} // namespace fallback
374
} // namespace simdjson
375
376
//
377
// Stage 2
378
//
379
380
namespace simdjson {
381
namespace fallback {
382
383
10.7k
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
384
10.7k
  return stage2::tape_builder::parse_document<false>(*this, _doc);
385
10.7k
}
386
387
0
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
388
0
  return stage2::tape_builder::parse_document<true>(*this, _doc);
389
0
}
390
391
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
392
0
  return fallback::stringparsing::parse_string(src, dst, replacement_char);
393
0
}
394
395
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
396
0
  return fallback::stringparsing::parse_wobbly_string(src, dst);
397
0
}
398
399
11.4k
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
400
11.4k
  auto error = stage1(_buf, _len, stage1_mode::regular);
401
11.4k
  if (error) { return error; }
402
10.7k
  return stage2(_doc);
403
11.4k
}
404
405
} // namespace fallback
406
} // namespace simdjson
407
408
#include <simdjson/fallback/end.h>
409
410
#endif // SIMDJSON_SRC_FALLBACK_CPP