Coverage Report

Created: 2025-06-13 06:21

/src/simdjson/include/simdjson/dom/parser.h
Line
Count
Source
1
#ifndef SIMDJSON_DOM_PARSER_H
2
#define SIMDJSON_DOM_PARSER_H
3
4
#include "simdjson/dom/base.h"
5
#include "simdjson/dom/document.h"
6
7
namespace simdjson {
8
9
namespace dom {
10
11
/**
12
 * A persistent document parser.
13
 *
14
 * The parser is designed to be reused, holding the internal buffers necessary to do parsing,
15
 * as well as memory for a single document. The parsed document is overwritten on each parse.
16
 *
17
 * This class cannot be copied, only moved, to avoid unintended allocations.
18
 *
19
 * @note Moving a parser instance may invalidate "dom::element" instances. If you need to
20
 * preserve both the "dom::element" instances and the parser, consider wrapping the parser
21
 * instance in a std::unique_ptr instance:
22
 *
23
 *   std::unique_ptr<dom::parser> parser(new dom::parser{});
24
 *   auto error = parser->load(f).get(root);
25
 *
26
 * You can then move std::unique_ptr safely.
27
 *
28
 * @note This is not thread safe: one parser cannot produce two documents at the same time!
29
 */
30
class parser {
31
public:
32
  /**
33
   * Create a JSON parser.
34
   *
35
   * The new parser will have zero capacity.
36
   *
37
   * @param max_capacity The maximum document length the parser can automatically handle. The parser
38
   *    will allocate more capacity on an as needed basis (when it sees documents too big to handle)
39
   *    up to this amount. The parser still starts with zero capacity no matter what this number is:
40
   *    to allocate an initial capacity, call allocate() after constructing the parser.
41
   *    Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process).
42
   */
43
  simdjson_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept;
44
  /**
45
   * Take another parser's buffers and state.
46
   *
47
   * @param other The parser to take. Its capacity is zeroed.
48
   */
49
  simdjson_inline parser(parser &&other) noexcept;
50
  parser(const parser &) = delete; ///< @private Disallow copying
51
  /**
52
   * Take another parser's buffers and state.
53
   *
54
   * @param other The parser to take. Its capacity is zeroed.
55
   */
56
  simdjson_inline parser &operator=(parser &&other) noexcept;
57
  parser &operator=(const parser &) = delete; ///< @private Disallow copying
58
59
  /** Deallocate the JSON parser. */
60
46.0k
  ~parser()=default;
61
62
  /**
63
   * Load a JSON document from a file and return a reference to it.
64
   *
65
   *   dom::parser parser;
66
   *   const element doc = parser.load("jsonexamples/twitter.json");
67
   *
68
   * The function is eager: the file's content is loaded in memory inside the parser instance
69
   * and immediately parsed. The file can be deleted after the `parser.load` call.
70
   *
71
   * ### IMPORTANT: Document Lifetime
72
   *
73
   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
74
   * documents because it reuses the same buffers, but you *must* use the document before you
75
   * destroy the parser or call parse() again.
76
   *
77
   * Moving the parser instance is safe, but it invalidates the element instances. You may store
78
   * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like
79
   * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`.
80
   *
81
   * ### Parser Capacity
82
   *
83
   * If the parser's current capacity is less than the file length, it will allocate enough capacity
84
   * to handle it (up to max_capacity).
85
   *
86
   * ## Windows and Unicode
87
   *
88
   * Windows users who need to read files with non-ANSI characters in the
89
   * name should set their code page to UTF-8 (65001) before calling this
90
   * function. This should be the default with Windows 11 and better.
91
   * Further, they may use the AreFileApisANSI function to determine whether
92
   * the filename is interpreted using the ANSI or the system default OEM
93
   * codepage, and they may call SetFileApisToOEM accordingly.
94
   *
95
   * @param path The path to load.
96
   * @return The document, or an error:
97
   *         - IO_ERROR if there was an error opening or reading the file.
98
   *           Be mindful that on some 32-bit systems,
99
   *           the file size might be limited to 2 GB.
100
   *         - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
101
   *         - CAPACITY if the parser does not have enough capacity and len > max_capacity.
102
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
103
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
104
   */
105
  inline simdjson_result<element> load(const std::string &path) & noexcept;
106
  inline simdjson_result<element> load(const std::string &path) &&  = delete ;
107
108
  /**
109
   * Load a JSON document from a file into a provided document instance and return a temporary reference to it.
110
   * It is similar to the function `load` except that instead of parsing into the internal
111
   * `document` instance associated with the parser, it allows the user to provide a document
112
   * instance.
113
   *
114
   *   dom::parser parser;
115
   *   dom::document doc;
116
   *   element doc_root = parser.load_into_document(doc, "jsonexamples/twitter.json");
117
   *
118
   * The function is eager: the file's content is loaded in memory inside the parser instance
119
   * and immediately parsed. The file can be deleted after the `parser.load_into_document` call.
120
   *
121
   * ### IMPORTANT: Document Lifetime
122
   *
123
   * After the call to load_into_document, the parser is no longer needed.
124
   *
125
   * The JSON document lives in the document instance: you must keep the document
126
   * instance alive while you navigate through it (i.e., use the returned value from
127
   * load_into_document). You are encouraged to reuse the document instance
128
   * many times with new data to avoid reallocations:
129
   *
130
   *   dom::document doc;
131
   *   element doc_root1 = parser.load_into_document(doc, "jsonexamples/twitter.json");
132
   *   //... doc_root1 is a pointer inside doc
133
   *   element doc_root2 = parser.load_into_document(doc, "jsonexamples/twitter.json");
134
   *   //... doc_root2 is a pointer inside doc
135
   *   // at this point doc_root1 is no longer safe
136
   *
137
   * Moving the document instance is safe, but it invalidates the element instances. After
138
   * moving a document, you can recover safe access to the document root with its `root()` method.
139
   *
140
   * @param doc The document instance where the parsed data will be stored (on success).
141
   * @param path The path to load.
142
   * @return The document, or an error:
143
   *         - IO_ERROR if there was an error opening or reading the file.
144
   *           Be mindful that on some 32-bit systems,
145
   *           the file size might be limited to 2 GB.
146
   *         - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
147
   *         - CAPACITY if the parser does not have enough capacity and len > max_capacity.
148
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
149
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
150
   */
151
  inline simdjson_result<element> load_into_document(document& doc, const std::string &path) & noexcept;
152
  inline simdjson_result<element> load_into_document(document& doc, const std::string &path) && =delete;
153
154
  /**
155
   * Parse a JSON document and return a temporary reference to it.
156
   *
157
   *   dom::parser parser;
158
   *   element doc_root = parser.parse(buf, len);
159
   *
160
   * The function eagerly parses the input: the input can be modified and discarded after
161
   * the `parser.parse(buf, len)` call has completed.
162
   *
163
   * ### IMPORTANT: Document Lifetime
164
   *
165
   * The JSON document still lives in the parser: this is the most efficient way to parse JSON
166
   * documents because it reuses the same buffers, but you *must* use the document before you
167
   * destroy the parser or call parse() again.
168
   *
169
   * Moving the parser instance is safe, but it invalidates the element instances. You may store
170
   * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like
171
   * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`.
172
   *
173
   * ### REQUIRED: Buffer Padding
174
   *
175
   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
176
   * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
177
   * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
178
   * SIMDJSON_PADDING bytes to avoid runtime warnings.
179
   *
180
   * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding,
181
   * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe:
182
   *
183
   *   const char *json      = R"({"key":"value"})";
184
   *   const size_t json_len = std::strlen(json);
185
   *   simdjson::dom::parser parser;
186
   *   simdjson::dom::element element = parser.parse(json, json_len);
187
   *
188
   * If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)),
189
   * you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end.
190
   * The benefit of setting realloc_if_needed to false is that you avoid a temporary
191
   * memory allocation and a copy.
192
   *
193
   * The padded bytes may be read. It is not important how you initialize
194
   * these bytes though we recommend a sensible default like null character values or spaces.
195
   * For example, the following low-level code is safe:
196
   *
197
   *   const char *json      = R"({"key":"value"})";
198
   *   const size_t json_len = std::strlen(json);
199
   *   std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]};
200
   *   std::memcpy(padded_json_copy.get(), json, json_len);
201
   *   std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING);
202
   *   simdjson::dom::parser parser;
203
   *   simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false);
204
   *
205
   * ### std::string references
206
   *
207
   * If you pass a mutable std::string reference (std::string&), the parser will seek to extend
208
   * its capacity to SIMDJSON_PADDING bytes beyond the end of the string.
209
   *
210
   * Whenever you pass an std::string reference, the parser will access the bytes beyond the end of
211
   * the string but before the end of the allocated memory (std::string::capacity()).
212
   * If you are using a sanitizer that checks for reading uninitialized bytes or std::string's
213
   * container-overflow checks, you may encounter sanitizer warnings.
214
   * You can safely ignore these warnings. Or you can call simdjson::pad(std::string&) to pad the
215
   * string with SIMDJSON_PADDING spaces: this function returns a simdjson::padded_string_view
216
   * which can be passed to the parser's parse function:
217
   *
218
   *    std::string json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )";
219
   *    element doc = parser.parse(simdjson::pad(json));
220
   *
221
   * ### Parser Capacity
222
   *
223
   * If the parser's current capacity is less than len, it will allocate enough capacity
224
   * to handle it (up to max_capacity).
225
   *
226
   * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
227
   *            realloc_if_needed is true.
228
   * @param len The length of the JSON.
229
   * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
230
   * @return An element pointing at the root of the document, or an error:
231
   *         - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,
232
   *           and memory allocation fails.
233
   *         - CAPACITY if the parser does not have enough capacity and len > max_capacity.
234
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
235
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
236
   */
237
  inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept;
238
  inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete;
239
  /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
240
  simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept;
241
  simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete;
242
  /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
243
  simdjson_inline simdjson_result<element> parse(const std::string &s) & noexcept;
244
  simdjson_inline simdjson_result<element> parse(const std::string &s) && =delete;
245
  /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
246
  simdjson_inline simdjson_result<element> parse(const padded_string &s) & noexcept;
247
  simdjson_inline simdjson_result<element> parse(const padded_string &s) && =delete;
248
  /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
249
  simdjson_inline simdjson_result<element> parse(const padded_string_view &v) & noexcept;
250
  simdjson_inline simdjson_result<element> parse(const padded_string_view &v) && =delete;
251
252
  /** @private We do not want to allow implicit conversion from C string to std::string. */
253
  simdjson_inline simdjson_result<element> parse(const char *buf) noexcept = delete;
254
255
  /**
256
   * Parse a JSON document into a provided document instance and return a temporary reference to it.
257
   * It is similar to the function `parse` except that instead of parsing into the internal
258
   * `document` instance associated with the parser, it allows the user to provide a document
259
   * instance.
260
   *
261
   *   dom::parser parser;
262
   *   dom::document doc;
263
   *   element doc_root = parser.parse_into_document(doc, buf, len);
264
   *
265
   * The function eagerly parses the input: the input can be modified and discarded after
266
   * the `parser.parse_into_document(doc, buf, len)` call has completed.
267
   *
268
   * ### IMPORTANT: Document Lifetime
269
   *
270
   * After the call to parse_into_document, the parser is no longer needed.
271
   *
272
   * The JSON document lives in the document instance: you must keep the document
273
   * instance alive while you navigate through it (i.e., use the returned value from
274
   * parse_into_document). You are encouraged to reuse the document instance
275
   * many times with new data to avoid reallocations:
276
   *
277
   *   dom::document doc;
278
   *   element doc_root1 = parser.parse_into_document(doc, buf1, len);
279
   *   //... doc_root1 is a pointer inside doc
280
   *   element doc_root2 = parser.parse_into_document(doc, buf1, len);
281
   *   //... doc_root2 is a pointer inside doc
282
   *   // at this point doc_root1 is no longer safe
283
   *
284
   * Moving the document instance is safe, but it invalidates the element instances. After
285
   * moving a document, you can recover safe access to the document root with its `root()` method.
286
   *
287
   * @param doc The document instance where the parsed data will be stored (on success).
288
   * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
289
   *            realloc_if_needed is true.
290
   * @param len The length of the JSON.
291
   * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
292
   * @return An element pointing at the root of the document, or an error:
293
   *         - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,
294
   *           and memory allocation fails.
295
   *         - CAPACITY if the parser does not have enough capacity and len > max_capacity.
296
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
297
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
298
   */
299
  inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept;
300
  inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete;
301
  /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
302
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept;
303
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete;
304
  /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
305
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) & noexcept;
306
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) && =delete;
307
  /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */
308
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) & noexcept;
309
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) && =delete;
310
311
  /** @private We do not want to allow implicit conversion from C string to std::string. */
312
  simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf) noexcept = delete;
313
314
  /**
315
   * Load a file containing many JSON documents.
316
   *
317
   *   dom::parser parser;
318
   *   for (const element doc : parser.load_many(path)) {
319
   *     cout << std::string(doc["title"]) << endl;
320
   *   }
321
   *
322
   * The file is loaded in memory and can be safely deleted after the `parser.load_many(path)`
323
   * function has returned. The memory is held by the `parser` instance.
324
   *
325
   * The function is lazy: it may be that no more than one JSON document at a time is parsed.
326
   * And, possibly, no document may have been parsed when the `parser.load_many(path)` function
327
   * returned.
328
   *
329
   * If there is a UTF-8 BOM, the parser skips it.
330
   *
331
   * ### Format
332
   *
333
   * The file must contain a series of one or more JSON documents, concatenated into a single
334
   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
335
   * then starts parsing the next document at that point. (It does this with more parallelism and
336
   * lookahead than you might think, though.)
337
   *
338
   * Documents that consist of an object or array may omit the whitespace between them, concatenating
339
   * with no separator. Documents that consist of a single primitive (i.e. documents that are not
340
   * arrays or objects) MUST be separated with whitespace.
341
   *
342
   * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
343
   * Setting batch_size to excessively large or excessively small values may impact negatively the
344
   * performance.
345
   *
346
   * ### Error Handling
347
   *
348
   * All errors are returned during iteration: if there is a global error such as memory allocation,
349
   * it will be yielded as the first result. Iteration always stops after the first error.
350
   *
351
   * As with all other simdjson methods, non-exception error handling is readily available through
352
   * the same interface, requiring you to check the error before using the document:
353
   *
354
   *   dom::parser parser;
355
   *   dom::document_stream docs;
356
   *   auto error = parser.load_many(path).get(docs);
357
   *   if (error) { cerr << error << endl; exit(1); }
358
   *   for (auto doc : docs) {
359
   *     std::string_view title;
360
   *     if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); }
361
   *     cout << title << endl;
362
   *   }
363
   *
364
   * ### Threads
365
   *
366
   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
367
   * hood to do some lookahead.
368
   *
369
   * ### Parser Capacity
370
   *
371
   * If the parser's current capacity is less than batch_size, it will allocate enough capacity
372
   * to handle it (up to max_capacity).
373
   *
374
   * @param path File name pointing at the concatenated JSON to parse.
375
   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
376
   *                   spot is cache-related: small enough to fit in cache, yet big enough to
377
   *                   parse as many documents as possible in one tight loop.
378
   *                   Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet
379
   *                   spot in our tests.
380
   *                   If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE
381
   *                   (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE.
382
   * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
383
   *         - IO_ERROR if there was an error opening or reading the file.
384
   *         - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
385
   *         - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
386
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
387
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
388
   */
389
  inline simdjson_result<document_stream> load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept;
390
391
  /**
392
   * Parse a buffer containing many JSON documents.
393
   *
394
   *   dom::parser parser;
395
   *   for (element doc : parser.parse_many(buf, len)) {
396
   *     cout << std::string(doc["title"]) << endl;
397
   *   }
398
   *
399
   * No copy of the input buffer is made.
400
   *
401
   * The function is lazy: it may be that no more than one JSON document at a time is parsed.
402
   * And, possibly, no document may have been parsed when the `parser.parse_many(buf, len)` function
403
   * returned.
404
   *
405
   * The caller is responsible for ensuring that the input string data remains unchanged and is
406
   * not deleted during the loop. In particular, the following is unsafe and will not compile:
407
   *
408
   *   auto docs = parser.parse_many("[\"temporary data\"]"_padded);
409
   *   // here the string "[\"temporary data\"]" may no longer exist in memory
410
   *   // the parser instance may not have even accessed the input yet
411
   *   for (element doc : docs) {
412
   *     cout << std::string(doc["title"]) << endl;
413
   *   }
414
   *
415
   * The following is safe:
416
   *
417
   *   auto json = "[\"temporary data\"]"_padded;
418
   *   auto docs = parser.parse_many(json);
419
   *   for (element doc : docs) {
420
   *     cout << std::string(doc["title"]) << endl;
421
   *   }
422
   *
423
   * If there is a UTF-8 BOM, the parser skips it.
424
   *
425
   * ### Format
426
   *
427
   * The buffer must contain a series of one or more JSON documents, concatenated into a single
428
   * buffer, separated by whitespace. It effectively parses until it has a fully valid document,
429
   * then starts parsing the next document at that point. (It does this with more parallelism and
430
   * lookahead than you might think, though.)
431
   *
432
   * Documents that consist of an object or array may omit the whitespace between them, concatenating
433
   * with no separator. Documents that consist of a single primitive (i.e. documents that are not
434
   * arrays or objects) MUST be separated with whitespace.
435
   *
436
   * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
437
   * Setting batch_size to excessively large or excessively small values may impact negatively the
438
   * performance.
439
   *
440
   * ### Error Handling
441
   *
442
   * All errors are returned during iteration: if there is a global error such as memory allocation,
443
   * it will be yielded as the first result. Iteration always stops after the first error.
444
   *
445
   * As with all other simdjson methods, non-exception error handling is readily available through
446
   * the same interface, requiring you to check the error before using the document:
447
   *
448
   *   dom::parser parser;
449
   *   dom::document_stream docs;
450
   *   auto error = parser.parse_many(buf, len).get(docs);
451
   *   if (error) { cerr << error << endl; exit(1); }
452
   *   for (auto doc : docs) {
453
   *     std::string_view title;
454
   *     if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); }
455
   *     cout << title << endl;
456
   *   }
457
   *
458
   * ### REQUIRED: Buffer Padding
459
   *
460
   * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
461
   * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
462
   * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
463
   * SIMDJSON_PADDING bytes to avoid runtime warnings.
464
   *
465
   * ### Threads
466
   *
467
   * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
468
   * hood to do some lookahead.
469
   *
470
   * ### Parser Capacity
471
   *
472
   * If the parser's current capacity is less than batch_size, it will allocate enough capacity
473
   * to handle it (up to max_capacity).
474
   *
475
   * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
476
   * @param len The length of the concatenated JSON.
477
   * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
478
   *                   spot is cache-related: small enough to fit in cache, yet big enough to
479
   *                   parse as many documents as possible in one tight loop.
480
   *                   Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests.
481
   * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
482
   *         - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
483
   *         - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
484
   *         - other json errors if parsing fails. You should not rely on these errors to always be the same for the
485
   *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
486
   */
487
  inline simdjson_result<document_stream> parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept;
488
  /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
489
  inline simdjson_result<document_stream> parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept;
490
  /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
491
  inline simdjson_result<document_stream> parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept;
492
  inline simdjson_result<document_stream> parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe
493
  /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
494
  inline simdjson_result<document_stream> parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept;
495
  inline simdjson_result<document_stream> parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe
496
497
  /** @private We do not want to allow implicit conversion from C string to std::string. */
498
  simdjson_result<document_stream> parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete;
499
500
  /**
501
   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
502
   * and `max_depth` depth.
503
   *
504
   * @param capacity The new capacity.
505
   * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
506
   * @return The error, if there is one.
507
   */
508
  simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;
509
510
#ifndef SIMDJSON_DISABLE_DEPRECATED_API
511
  /**
512
   * @private deprecated because it returns bool instead of error_code, which is our standard for
513
   * failures. Use allocate() instead.
514
   *
515
   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
516
   * and `max_depth` depth.
517
   *
518
   * @param capacity The new capacity.
519
   * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
520
   * @return true if successful, false if allocation failed.
521
   */
522
  [[deprecated("Use allocate() instead.")]]
523
  simdjson_warn_unused inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;
524
#endif // SIMDJSON_DISABLE_DEPRECATED_API
525
  /**
526
   * The largest document this parser can support without reallocating.
527
   *
528
   * @return Current capacity, in bytes.
529
   */
530
  simdjson_inline size_t capacity() const noexcept;
531
532
  /**
533
   * The largest document this parser can automatically support.
534
   *
535
   * The parser may reallocate internal buffers as needed up to this amount.
536
   *
537
   * @return Maximum capacity, in bytes.
538
   */
539
  simdjson_inline size_t max_capacity() const noexcept;
540
541
  /**
542
   * The maximum level of nested objects and arrays supported by this parser.
543
   *
544
   * @return Maximum depth, in levels of nesting.
545
   */
546
  simdjson_pure simdjson_inline size_t max_depth() const noexcept;
547
548
  /**
549
   * Set max_capacity. This is the largest document this parser can automatically support.
550
   *
551
   * The parser may reallocate internal buffers as needed up to this amount as documents are passed
552
   * to it.
553
   *
554
   * Note: To avoid limiting the memory to an absurd value, such as zero or two bytes,
555
   * iff you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY,
556
   * then the maximal capacity is set to MINIMAL_DOCUMENT_CAPACITY.
557
   *
558
   * This call will not allocate or deallocate, even if capacity is currently above max_capacity.
559
   *
560
   * @param max_capacity The new maximum capacity, in bytes.
561
   */
562
  simdjson_inline void set_max_capacity(size_t max_capacity) noexcept;
563
564
#ifdef SIMDJSON_THREADS_ENABLED
565
  /**
566
   * Whether the parser instance may use threads, when they are available, to speed
567
   * up some operations. Changing this attribute will change the behavior of the
568
   * parser for future operations. Set to true by default.
569
   */
570
  bool threaded{true};
571
#else
572
  /**
573
   * When SIMDJSON_THREADS_ENABLED is not defined, the parser instance cannot use threads (defaults to false).
574
   */
575
  bool threaded{false};
576
#endif
577
  /** @private Forward declaration of the nested Iterator class. Use the new DOM API instead */
578
  class Iterator;
579
  /** @private Deprecated alias for simdjson_error. Use simdjson_error instead */
580
  using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error;
581
582
  /** @private [for benchmarking access] The implementation to use (initially null) */
583
  std::unique_ptr<internal::dom_parser_implementation> implementation{};
584
585
  /** @private Validity flag, initially false. Use `if (parser.parse(...).error())` instead */
586
  bool valid{false};
587
  /** @private Error code, initially UNINITIALIZED. Use `parser.parse(...).error()` instead */
588
  error_code error{UNINITIALIZED};
589
590
  /** @private The document held by this parser. Use `parser.parse(...).value()` instead */
591
  document doc{};
592
593
  /** @private @return true if the document parsed was valid. Deprecated: use the result of parser.parse() instead. */
594
  [[deprecated("Use the result of parser.parse() instead")]]
595
  inline bool is_valid() const noexcept;
596
597
  /**
598
   * @private Returns an error code corresponding to the last parsing attempt (see
599
   * simdjson.h). Will return UNINITIALIZED if no parsing was attempted.
600
   */
601
  [[deprecated("Use the result of parser.parse() instead")]]
602
  inline int get_error_code() const noexcept;
603
604
  /** @private Returns the string equivalent of get_error_code(). */
605
  [[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]]
606
  inline std::string get_error_message() const noexcept;
607
608
  /** @private Deprecated: use `cout <<` on the result of parser.parse() instead. @param os The output stream. */
609
  [[deprecated("Use cout << on the result of parser.parse() instead")]]
610
  inline bool print_json(std::ostream &os) const noexcept;
611
612
  /** @private Deprecated by documentation only (no [[deprecated]] attribute is applied): use `parser.parse(...).doc.dump_raw_tape()` instead */
613
  inline bool dump_raw_tape(std::ostream &os) const noexcept;
614
615
616
private:
617
  /**
618
   * The maximum document length this parser will automatically support.
619
   *
620
   * The parser will not automatically allocate beyond this amount. No in-class initializer.
621
   */
622
  size_t _max_capacity;
623
624
  /**
625
   * Buffer that files are read into (reused each time load() is called).
626
   */
627
  std::unique_ptr<char[]> loaded_bytes;
628
629
  /** Capacity of the loaded_bytes buffer; starts at 0. */
630
  size_t _loaded_bytes_capacity{0};
631
632
  // all nodes are stored on the doc.tape using a 64-bit word.
633
  //
634
  // strings, double and ints are stored as
635
  //  a 64-bit word with a pointer to the actual value
636
  //
637
  //
638
  //
639
  // for objects or arrays, store [ or { at the beginning and ] or } at the
640
  // end. For the openings ([ or {), we annotate them with a reference to the
641
  // location on the doc.tape of the end, and for the closings (] or }), we
642
  // annotate them with a reference to the location of the opening.
643
  //
644
  //
645
646
  /**
647
   * Ensure we have enough capacity to handle at least desired_capacity bytes,
648
   * and auto-allocate if not. This also allocates memory, if needed, in the
649
   * internal document. Returns an error_code.
650
   */
651
  inline error_code ensure_capacity(size_t desired_capacity) noexcept;
652
  /**
653
   * Ensure we have enough capacity to handle at least desired_capacity bytes,
654
   * and auto-allocate if not. This also allocates memory, if needed, in the
655
   * provided document `doc`. Returns an error_code.
656
   */
657
  inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept;
658
659
  /** Read the file at `path` into loaded_bytes. NOTE(review): the size_t result is presumably the number of bytes read; confirm. */
660
  inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
661
662
  friend class parser::Iterator; // gives the nested Iterator access to private members
663
  friend class document_stream; // gives document_stream access to private members
664
665
666
}; // class parser
667
668
} // namespace dom
669
} // namespace simdjson
670
671
#endif // SIMDJSON_DOM_PARSER_H