/src/simdjson/include/simdjson/dom/parser.h
Line | Count | Source |
1 | | #ifndef SIMDJSON_DOM_PARSER_H |
2 | | #define SIMDJSON_DOM_PARSER_H |
3 | | |
4 | | #include "simdjson/dom/base.h" |
5 | | #include "simdjson/dom/document.h" |
6 | | |
7 | | namespace simdjson { |
8 | | |
9 | | namespace dom { |
10 | | |
11 | | /** |
12 | | * A persistent document parser. |
13 | | * |
14 | | * The parser is designed to be reused, holding the internal buffers necessary to do parsing, |
15 | | * as well as memory for a single document. The parsed document is overwritten on each parse. |
16 | | * |
17 | | * This class cannot be copied, only moved, to avoid unintended allocations. |
18 | | * |
19 | | * @note Moving a parser instance may invalidate "dom::element" instances. If you need to |
20 | | * preserve both the "dom::element" instances and the parser, consider wrapping the parser |
21 | | * instance in a std::unique_ptr instance: |
22 | | * |
23 | | * std::unique_ptr<dom::parser> parser(new dom::parser{}); |
24 | | * auto error = parser->load(f).get(root); |
25 | | * |
26 | | * You can then move std::unique_ptr safely. |
27 | | * |
28 | | * @note This is not thread safe: one parser cannot produce two documents at the same time! |
29 | | */ |
30 | | class parser { |
31 | | public: |
32 | | /** |
33 | | * Create a JSON parser. |
34 | | * |
35 | | * The new parser will have zero capacity. |
36 | | * |
37 | | * @param max_capacity The maximum document length the parser can automatically handle. The parser |
38 | | * will allocate more capacity on an as needed basis (when it sees documents too big to handle) |
39 | | * up to this amount. The parser still starts with zero capacity no matter what this number is: |
40 | | * to allocate an initial capacity, call allocate() after constructing the parser. |
41 | | * Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process). |
42 | | */ |
43 | | simdjson_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; |
44 | | /** |
45 | | * Take another parser's buffers and state. |
46 | | * |
47 | | * @param other The parser to take. Its capacity is zeroed. |
48 | | */ |
49 | | simdjson_inline parser(parser &&other) noexcept; |
50 | | parser(const parser &) = delete; ///< @private Disallow copying |
51 | | /** |
52 | | * Take another parser's buffers and state. |
53 | | * |
54 | | * @param other The parser to take. Its capacity is zeroed. |
55 | | */ |
56 | | simdjson_inline parser &operator=(parser &&other) noexcept; |
57 | | parser &operator=(const parser &) = delete; ///< @private Disallow copying |
58 | | |
59 | | /** Deallocate the JSON parser. */ |
60 | 46.0k | ~parser()=default; |
61 | | |
62 | | /** |
63 | | * Load a JSON document from a file and return a reference to it. |
64 | | * |
65 | | * dom::parser parser; |
66 | | * const element doc = parser.load("jsonexamples/twitter.json"); |
67 | | * |
68 | | * The function is eager: the file's content is loaded in memory inside the parser instance |
69 | | * and immediately parsed. The file can be deleted after the `parser.load` call. |
70 | | * |
71 | | * ### IMPORTANT: Document Lifetime |
72 | | * |
73 | | * The JSON document still lives in the parser: this is the most efficient way to parse JSON |
74 | | * documents because it reuses the same buffers, but you *must* use the document before you |
75 | | * destroy the parser or call parse() again. |
76 | | * |
77 | | * Moving the parser instance is safe, but it invalidates the element instances. You may store |
78 | | * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like |
79 | | * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`. |
80 | | * |
81 | | * ### Parser Capacity |
82 | | * |
83 | | * If the parser's current capacity is less than the file length, it will allocate enough capacity |
84 | | * to handle it (up to max_capacity). |
85 | | * |
86 | | * ## Windows and Unicode |
87 | | * |
88 | | * Windows users who need to read files with non-ANSI characters in the |
89 | | * name should set their code page to UTF-8 (65001) before calling this |
90 | | * function. This should be the default with Windows 11 and better. |
91 | | * Further, they may use the AreFileApisANSI function to determine whether |
92 | | * the filename is interpreted using the ANSI or the system default OEM |
93 | | * codepage, and they may call SetFileApisToOEM accordingly. |
94 | | * |
95 | | * @param path The path to load. |
96 | | * @return The document, or an error: |
97 | | * - IO_ERROR if there was an error opening or reading the file. |
98 | | * Be mindful that on some 32-bit systems, |
99 | | * the file size might be limited to 2 GB. |
100 | | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
101 | | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
102 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
103 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
104 | | */ |
105 | | inline simdjson_result<element> load(const std::string &path) & noexcept; |
106 | | inline simdjson_result<element> load(const std::string &path) && = delete ; |
107 | | |
108 | | /** |
109 | | * Load a JSON document from a file into a provided document instance and return a temporary reference to it. |
110 | | * It is similar to the function `load` except that instead of parsing into the internal |
111 | | * `document` instance associated with the parser, it allows the user to provide a document |
112 | | * instance. |
113 | | * |
114 | | * dom::parser parser; |
115 | | * dom::document doc; |
116 | | * element doc_root = parser.load_into_document(doc, "jsonexamples/twitter.json"); |
117 | | * |
118 | | * The function is eager: the file's content is loaded in memory inside the parser instance |
119 | | * and immediately parsed. The file can be deleted after the `parser.load_into_document` call. |
120 | | * |
121 | | * ### IMPORTANT: Document Lifetime |
122 | | * |
123 | | * After the call to load_into_document, the parser is no longer needed. |
124 | | * |
125 | | * The JSON document lives in the document instance: you must keep the document |
126 | | * instance alive while you navigate through it (i.e., use the returned value from |
127 | | * load_into_document). You are encouraged to reuse the document instance |
128 | | * many times with new data to avoid reallocations: |
129 | | * |
130 | | * dom::document doc; |
131 | | * element doc_root1 = parser.load_into_document(doc, "jsonexamples/twitter.json"); |
132 | | * //... doc_root1 is a pointer inside doc |
133 | | * element doc_root2 = parser.load_into_document(doc, "jsonexamples/twitter.json"); |
134 | | * //... doc_root2 is a pointer inside doc |
135 | | * // at this point doc_root1 is no longer safe |
136 | | * |
137 | | * Moving the document instance is safe, but it invalidates the element instances. After |
138 | | * moving a document, you can recover safe access to the document root with its `root()` method. |
139 | | * |
140 | | * @param doc The document instance where the parsed data will be stored (on success). |
141 | | * @param path The path to load. |
142 | | * @return The document, or an error: |
143 | | * - IO_ERROR if there was an error opening or reading the file. |
144 | | * Be mindful that on some 32-bit systems, |
145 | | * the file size might be limited to 2 GB. |
146 | | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
147 | | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
148 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
149 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
150 | | */ |
151 | | inline simdjson_result<element> load_into_document(document& doc, const std::string &path) & noexcept; |
152 | | inline simdjson_result<element> load_into_document(document& doc, const std::string &path) && =delete; |
153 | | |
154 | | /** |
155 | | * Parse a JSON document and return a temporary reference to it. |
156 | | * |
157 | | * dom::parser parser; |
158 | | * element doc_root = parser.parse(buf, len); |
159 | | * |
160 | | * The function eagerly parses the input: the input can be modified and discarded after |
161 | | * the `parser.parse(buf, len)` call has completed. |
162 | | * |
163 | | * ### IMPORTANT: Document Lifetime |
164 | | * |
165 | | * The JSON document still lives in the parser: this is the most efficient way to parse JSON |
166 | | * documents because it reuses the same buffers, but you *must* use the document before you |
167 | | * destroy the parser or call parse() again. |
168 | | * |
169 | | * Moving the parser instance is safe, but it invalidates the element instances. You may store |
170 | | * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like |
171 | | * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`. |
172 | | * |
173 | | * ### REQUIRED: Buffer Padding |
174 | | * |
175 | | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
176 | | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
177 | | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
178 | | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
179 | | * |
180 | | * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding, |
181 | | * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe: |
182 | | * |
183 | | * const char *json = R"({"key":"value"})"; |
184 | | * const size_t json_len = std::strlen(json); |
185 | | * simdjson::dom::parser parser; |
186 | | * simdjson::dom::element element = parser.parse(json, json_len); |
187 | | * |
188 | | * If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)), |
189 | | * you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end. |
190 | | * The benefit of setting realloc_if_needed to false is that you avoid a temporary |
191 | | * memory allocation and a copy. |
192 | | * |
193 | | * The padded bytes may be read. It is not important how you initialize |
194 | | * these bytes though we recommend a sensible default like null character values or spaces. |
195 | | * For example, the following low-level code is safe: |
196 | | * |
197 | | * const char *json = R"({"key":"value"})"; |
198 | | * const size_t json_len = std::strlen(json); |
199 | | * std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]}; |
200 | | * std::memcpy(padded_json_copy.get(), json, json_len); |
201 | | * std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING); |
202 | | * simdjson::dom::parser parser; |
203 | | * simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false); |
204 | | * |
205 | | * ### std::string references |
206 | | * |
207 | | * If you pass a mutable std::string reference (std::string&), the parser will seek to extend |
208 | | * its capacity to SIMDJSON_PADDING bytes beyond the end of the string. |
209 | | * |
210 | | * Whenever you pass an std::string reference, the parser will access the bytes beyond the end of |
211 | | * the string but before the end of the allocated memory (std::string::capacity()). |
212 | | * If you are using a sanitizer that checks for reading uninitialized bytes or std::string's |
213 | | * container-overflow checks, you may encounter sanitizer warnings. |
214 | | * You can safely ignore these warnings. Or you can call simdjson::pad(std::string&) to pad the |
215 | | * string with SIMDJSON_PADDING spaces: this function returns a simdjson::padded_string_view |
216 | | * which can be passed to the parser's parse function: |
217 | | * |
218 | | * std::string json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"; |
219 | | * element doc = parser.parse(simdjson::pad(json)); |
220 | | * |
221 | | * ### Parser Capacity |
222 | | * |
223 | | * If the parser's current capacity is less than len, it will allocate enough capacity |
224 | | * to handle it (up to max_capacity). |
225 | | * |
226 | | * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless |
227 | | * realloc_if_needed is true. |
228 | | * @param len The length of the JSON. |
229 | | * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. |
230 | | * @return An element pointing at the root of the document, or an error: |
231 | | * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, |
232 | | * and memory allocation fails. |
233 | | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
234 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
235 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
236 | | */ |
237 | | inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
238 | | inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; |
239 | | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
240 | | simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
241 | | simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete; |
242 | | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
243 | | simdjson_inline simdjson_result<element> parse(const std::string &s) & noexcept; |
244 | | simdjson_inline simdjson_result<element> parse(const std::string &s) && =delete; |
245 | | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
246 | | simdjson_inline simdjson_result<element> parse(const padded_string &s) & noexcept; |
247 | | simdjson_inline simdjson_result<element> parse(const padded_string &s) && =delete; |
248 | | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
249 | | simdjson_inline simdjson_result<element> parse(const padded_string_view &v) & noexcept; |
250 | | simdjson_inline simdjson_result<element> parse(const padded_string_view &v) && =delete; |
251 | | |
252 | | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
253 | | simdjson_inline simdjson_result<element> parse(const char *buf) noexcept = delete; |
254 | | |
255 | | /** |
256 | | * Parse a JSON document into a provided document instance and return a temporary reference to it. |
257 | | * It is similar to the function `parse` except that instead of parsing into the internal |
258 | | * `document` instance associated with the parser, it allows the user to provide a document |
259 | | * instance. |
260 | | * |
261 | | * dom::parser parser; |
262 | | * dom::document doc; |
263 | | * element doc_root = parser.parse_into_document(doc, buf, len); |
264 | | * |
265 | | * The function eagerly parses the input: the input can be modified and discarded after |
266 | | * the `parser.parse_into_document(doc, buf, len)` call has completed. |
267 | | * |
268 | | * ### IMPORTANT: Document Lifetime |
269 | | * |
270 | | * After the call to parse_into_document, the parser is no longer needed. |
271 | | * |
272 | | * The JSON document lives in the document instance: you must keep the document |
273 | | * instance alive while you navigate through it (i.e., use the returned value from |
274 | | * parse_into_document). You are encouraged to reuse the document instance |
275 | | * many times with new data to avoid reallocations: |
276 | | * |
277 | | * dom::document doc; |
278 | | * element doc_root1 = parser.parse_into_document(doc, buf1, len); |
279 | | * //... doc_root1 is a pointer inside doc |
280 | | * element doc_root2 = parser.parse_into_document(doc, buf1, len); |
281 | | * //... doc_root2 is a pointer inside doc |
282 | | * // at this point doc_root1 is no longer safe |
283 | | * |
284 | | * Moving the document instance is safe, but it invalidates the element instances. After |
285 | | * moving a document, you can recover safe access to the document root with its `root()` method. |
286 | | * |
287 | | * @param doc The document instance where the parsed data will be stored (on success). |
288 | | * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless |
289 | | * realloc_if_needed is true. |
290 | | * @param len The length of the JSON. |
291 | | * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. |
292 | | * @return An element pointing at the root of document, or an error: |
293 | | * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, |
294 | | * and memory allocation fails. |
295 | | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
296 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
297 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
298 | | */ |
299 | | inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
300 | | inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; |
301 | | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
302 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
303 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete; |
304 | | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
305 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) & noexcept; |
306 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) && =delete; |
307 | | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
308 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) & noexcept; |
309 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) && =delete; |
310 | | |
311 | | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
312 | | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf) noexcept = delete; |
313 | | |
314 | | /** |
315 | | * Load a file containing many JSON documents. |
316 | | * |
317 | | * dom::parser parser; |
318 | | * for (const element doc : parser.load_many(path)) { |
319 | | * cout << std::string(doc["title"]) << endl; |
320 | | * } |
321 | | * |
322 | | * The file is loaded in memory and can be safely deleted after the `parser.load_many(path)` |
323 | | * function has returned. The memory is held by the `parser` instance. |
324 | | * |
325 | | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
326 | | * And, possibly, no document may have been parsed when the `parser.load_many(path)` function |
327 | | * returned. |
328 | | * |
329 | | * If there is a UTF-8 BOM, the parser skips it. |
330 | | * |
331 | | * ### Format |
332 | | * |
333 | | * The file must contain a series of one or more JSON documents, concatenated into a single |
334 | | * buffer, separated by whitespace. It effectively parses until it has a fully valid document, |
335 | | * then starts parsing the next document at that point. (It does this with more parallelism and |
336 | | * lookahead than you might think, though.) |
337 | | * |
338 | | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
339 | | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
340 | | * arrays or objects) MUST be separated with whitespace. |
341 | | * |
342 | | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
343 | | * Setting batch_size to excessively large or excessively small values may impact negatively the |
344 | | * performance. |
345 | | * |
346 | | * ### Error Handling |
347 | | * |
348 | | * All errors are returned during iteration: if there is a global error such as memory allocation, |
349 | | * it will be yielded as the first result. Iteration always stops after the first error. |
350 | | * |
351 | | * As with all other simdjson methods, non-exception error handling is readily available through |
352 | | * the same interface, requiring you to check the error before using the document: |
353 | | * |
354 | | * dom::parser parser; |
355 | | * dom::document_stream docs; |
356 | | * auto error = parser.load_many(path).get(docs); |
357 | | * if (error) { cerr << error << endl; exit(1); } |
358 | | * for (auto doc : docs) { |
359 | | * std::string_view title; |
360 | | * if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); } |
361 | | * cout << title << endl; |
362 | | * } |
363 | | * |
364 | | * ### Threads |
365 | | * |
366 | | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
367 | | * hood to do some lookahead. |
368 | | * |
369 | | * ### Parser Capacity |
370 | | * |
371 | | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
372 | | * to handle it (up to max_capacity). |
373 | | * |
374 | | * @param path File name pointing at the concatenated JSON to parse. |
375 | | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
376 | | * spot is cache-related: small enough to fit in cache, yet big enough to |
377 | | * parse as many documents as possible in one tight loop. |
378 | | * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet |
379 | | * spot in our tests. |
380 | | * If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE |
381 | | * (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE. |
382 | | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
383 | | * - IO_ERROR if there was an error opening or reading the file. |
384 | | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
385 | | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
386 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
387 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
388 | | */ |
389 | | inline simdjson_result<document_stream> load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
390 | | |
391 | | /** |
392 | | * Parse a buffer containing many JSON documents. |
393 | | * |
394 | | * dom::parser parser; |
395 | | * for (element doc : parser.parse_many(buf, len)) { |
396 | | * cout << std::string(doc["title"]) << endl; |
397 | | * } |
398 | | * |
399 | | * No copy of the input buffer is made. |
400 | | * |
401 | | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
402 | | * And, possibly, no document may have been parsed when the `parser.parse_many(buf, len)` function |
403 | | * returned. |
404 | | * |
405 | | * The caller is responsible for ensuring that the input string data remains unchanged and is |
406 | | * not deleted during the loop. In particular, the following is unsafe and will not compile: |
407 | | * |
408 | | * auto docs = parser.parse_many("[\"temporary data\"]"_padded); |
409 | | * // here the string "[\"temporary data\"]" may no longer exist in memory |
410 | | * // the parser instance may not have even accessed the input yet |
411 | | * for (element doc : docs) { |
412 | | * cout << std::string(doc["title"]) << endl; |
413 | | * } |
414 | | * |
415 | | * The following is safe: |
416 | | * |
417 | | * auto json = "[\"temporary data\"]"_padded; |
418 | | * auto docs = parser.parse_many(json); |
419 | | * for (element doc : docs) { |
420 | | * cout << std::string(doc["title"]) << endl; |
421 | | * } |
422 | | * |
423 | | * If there is a UTF-8 BOM, the parser skips it. |
424 | | * |
425 | | * ### Format |
426 | | * |
427 | | * The buffer must contain a series of one or more JSON documents, concatenated into a single |
428 | | * buffer, separated by whitespace. It effectively parses until it has a fully valid document, |
429 | | * then starts parsing the next document at that point. (It does this with more parallelism and |
430 | | * lookahead than you might think, though.) |
431 | | * |
432 | | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
433 | | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
434 | | * arrays or objects) MUST be separated with whitespace. |
435 | | * |
436 | | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
437 | | * Setting batch_size to excessively large or excessively small values may impact negatively the |
438 | | * performance. |
439 | | * |
440 | | * ### Error Handling |
441 | | * |
442 | | * All errors are returned during iteration: if there is a global error such as memory allocation, |
443 | | * it will be yielded as the first result. Iteration always stops after the first error. |
444 | | * |
445 | | * As with all other simdjson methods, non-exception error handling is readily available through |
446 | | * the same interface, requiring you to check the error before using the document: |
447 | | * |
448 | | * dom::parser parser; |
449 | | * dom::document_stream docs; |
450 | | * auto error = parser.parse_many(buf, len).get(docs); |
451 | | * if (error) { cerr << error << endl; exit(1); } |
452 | | * for (auto doc : docs) { |
453 | | * std::string_view title; |
454 | | * if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); } |
455 | | * cout << title << endl; |
456 | | * } |
457 | | * |
458 | | * ### REQUIRED: Buffer Padding |
459 | | * |
460 | | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
461 | | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
462 | | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
463 | | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
464 | | * |
465 | | * ### Threads |
466 | | * |
467 | | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
468 | | * hood to do some lookahead. |
469 | | * |
470 | | * ### Parser Capacity |
471 | | * |
472 | | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
473 | | * to handle it (up to max_capacity). |
474 | | * |
475 | | * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes. |
476 | | * @param len The length of the concatenated JSON. |
477 | | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
478 | | * spot is cache-related: small enough to fit in cache, yet big enough to |
479 | | * parse as many documents as possible in one tight loop. |
480 | | * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests. |
481 | | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
482 | | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails |
483 | | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
484 | | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
485 | | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
486 | | */ |
487 | | inline simdjson_result<document_stream> parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
488 | | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
489 | | inline simdjson_result<document_stream> parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
490 | | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
491 | | inline simdjson_result<document_stream> parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
492 | | inline simdjson_result<document_stream> parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe: the input is not copied and parsing is lazy, so a temporary string may be gone before it is read. NOTE(review): batch_size has no default here, so a one-argument call with a temporary may still select the const& overload - confirm |
493 | | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
494 | | inline simdjson_result<document_stream> parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
495 | | inline simdjson_result<document_stream> parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe: the input is not copied and parsing is lazy, so a temporary padded_string may be gone before it is read. NOTE(review): batch_size has no default here, so a one-argument call with a temporary may still select the const& overload - confirm |
496 | | |
497 | | /** @private We do not want to allow implicit conversion from C string to std::string. NOTE(review): a two-argument call parse_many(const char *, size_t) appears to match both this deleted overload and the (buf, len, batch_size) overload above - confirm the intended resolution. */ |
498 | | simdjson_result<document_stream> parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete; |
499 | | |
500 | | /** |
501 | | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
502 | | * and `max_depth` depth. |
503 | | * |
504 | | * @param capacity The new capacity. |
505 | | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
506 | | * @return The error, if there is one. |
507 | | */ |
508 | | simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; |
509 | | |
510 | | #ifndef SIMDJSON_DISABLE_DEPRECATED_API |
511 | | /** |
512 | | * @private deprecated because it returns bool instead of error_code, which is our standard for |
513 | | * failures. Use allocate() instead. |
514 | | * |
515 | | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
516 | | * and `max_depth` depth. |
517 | | * |
518 | | * @param capacity The new capacity. |
519 | | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
520 | | * @return true if successful, false if allocation failed. |
521 | | */ |
522 | | [[deprecated("Use allocate() instead.")]] |
523 | | simdjson_warn_unused inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; |
524 | | #endif // SIMDJSON_DISABLE_DEPRECATED_API |
525 | | /** |
526 | | * The largest document this parser can support without reallocating. |
527 | | * |
528 | | * @return Current capacity, in bytes. |
529 | | */ |
530 | | simdjson_inline size_t capacity() const noexcept; |
531 | | |
532 | | /** |
533 | | * The size of the largest document this parser can automatically support. |
534 | | * |
535 | | * The parser may reallocate its internal buffers, as needed, up to this amount. |
536 | | * |
537 | | * @return Maximum capacity, in bytes. |
538 | | */ |
539 | | simdjson_inline size_t max_capacity() const noexcept; |
540 | | |
541 | | /** |
542 | | * The maximum level of nested objects and arrays supported by this parser. |
543 | | * |
544 | | * @return Maximum depth, in levels of nesting (not bytes). |
545 | | */ |
546 | | simdjson_pure simdjson_inline size_t max_depth() const noexcept; |
547 | | |
548 | | /** |
549 | | * Set max_capacity. This is the largest document this parser can automatically support. |
550 | | * |
551 | | * The parser may reallocate internal buffers as needed up to this amount as documents are passed |
552 | | * to it. |
553 | | * |
554 | | * Note: To avoid limiting the memory to an absurd value, such as zero or two bytes, |
555 | | * if you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY, |
556 | | * then the maximum capacity is set to MINIMAL_DOCUMENT_CAPACITY instead. |
557 | | * |
558 | | * This call will not allocate or deallocate, even if capacity is currently above max_capacity. |
559 | | * |
560 | | * @param max_capacity The new maximum capacity, in bytes. |
561 | | */ |
562 | | simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; |
563 | | |
564 | | #ifdef SIMDJSON_THREADS_ENABLED |
565 | | /** |
566 | | * The parser instance can use threads, when they are available, to speed up some |
567 | | * operations. Changing this attribute changes the behavior of the parser for |
568 | | * future operations. Set to true by default in this configuration. |
569 | | */ |
570 | | bool threaded{true}; |
571 | | #else |
572 | | /** |
573 | | * When SIMDJSON_THREADS_ENABLED is not defined, the parser instance cannot use threads, so this defaults to false. |
574 | | */ |
575 | | bool threaded{false}; |
576 | | #endif |
577 | | /** @private Forward declaration only. Use the new DOM API instead */ |
578 | | class Iterator; |
579 | | /** @private Deprecated alias of simdjson_error. Use simdjson_error instead */ |
580 | | using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error; |
581 | | |
582 | | /** @private [for benchmarking access] The implementation to use; empty until one is assigned */ |
583 | | std::unique_ptr<internal::dom_parser_implementation> implementation{}; |
584 | | |
585 | | /** @private Starts out false. Use `if (parser.parse(...).error())` instead */ |
586 | | bool valid{false}; |
587 | | /** @private Starts out UNINITIALIZED. Use `parser.parse(...).error()` instead */ |
588 | | error_code error{UNINITIALIZED}; |
589 | | |
590 | | /** @private The document held by this parser. Use `parser.parse(...).value()` instead */ |
591 | | document doc{}; |
592 | | |
593 | | /** @private Returns true if the parsed document was valid. */ |
594 | | [[deprecated("Use the result of parser.parse() instead")]] |
595 | | inline bool is_valid() const noexcept; |
596 | | |
597 | | /** |
598 | | * @private Returns an error code corresponding to the last parsing attempt (see |
599 | | * simdjson.h); returns UNINITIALIZED if no parsing was attempted. |
600 | | */ |
601 | | [[deprecated("Use the result of parser.parse() instead")]] |
602 | | inline int get_error_code() const noexcept; |
603 | | |
604 | | /** @private Returns the string equivalent of get_error_code(). */ |
605 | | [[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]] |
606 | | inline std::string get_error_message() const noexcept; |
607 | | |
608 | | /** @private Deprecated: stream the result of parser.parse() to the output stream instead. */ |
609 | | [[deprecated("Use cout << on the result of parser.parse() instead")]] |
610 | | inline bool print_json(std::ostream &os) const noexcept; |
611 | | |
612 | | /** @private Private and deprecated (though not marked [[deprecated]] here): use `parser.parse(...).doc.dump_raw_tape()` instead */ |
613 | | inline bool dump_raw_tape(std::ostream &os) const noexcept; |
614 | | |
615 | | |
616 | | private: |
617 | | /** |
618 | | * The maximum document length this parser will automatically support. |
619 | | * |
620 | | * The parser will not automatically allocate above this amount. |
621 | | */ |
622 | | size_t _max_capacity; |
623 | | |
624 | | /** |
625 | | * The buffer that files are read into (reused each time load() is called). |
626 | | */ |
627 | | std::unique_ptr<char[]> loaded_bytes; |
628 | | |
629 | | /** Capacity of the loaded_bytes buffer; starts at zero. */ |
630 | | size_t _loaded_bytes_capacity{0}; |
631 | | |
632 | | // All nodes are stored on the doc.tape using a 64-bit word. |
633 | | // |
634 | | // Strings, doubles and ints are stored as |
635 | | // a 64-bit word with a pointer to the actual value. |
636 | | // |
637 | | // |
638 | | // |
639 | | // For objects or arrays, store [ or { at the beginning and } or ] at the |
640 | | // end. For the openings ([ or {), we annotate them with a reference to the |
641 | | // location on the doc.tape of the end, and for the closings (} and ]), we |
642 | | // annotate them with a reference to the location of the opening. |
643 | | // |
644 | | // |
645 | | |
646 | | /** |
647 | | * Ensure we have enough capacity to handle documents of at least desired_capacity bytes, |
648 | | * auto-allocating if not. This also allocates memory, if needed, in the |
649 | | * internal document. |
650 | | */ |
651 | | inline error_code ensure_capacity(size_t desired_capacity) noexcept; |
652 | | /** |
653 | | * Ensure we have enough capacity to handle documents of at least desired_capacity bytes, |
654 | | * auto-allocating if not. This also allocates memory, if needed, in the |
655 | | * provided document (`doc`). |
656 | | */ |
657 | | inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept; |
658 | | |
659 | | /** Read the file at `path` into loaded_bytes. NOTE(review): the size_t result is presumably the byte count read; confirm. */ |
660 | | inline simdjson_result<size_t> read_file(const std::string &path) noexcept; |
661 | | |
662 | | friend class parser::Iterator; |
663 | | friend class document_stream; |
664 | | |
665 | | |
666 | | }; // class parser |
667 | | |
668 | | } // namespace dom |
669 | | } // namespace simdjson |
670 | | |
671 | | #endif // SIMDJSON_DOM_PARSER_H |