/src/qpdf/libqpdf/QPDFParser.cc
Line | Count | Source |
1 | | #include <qpdf/QPDFParser.hh> |
2 | | |
3 | | #include <qpdf/QPDF.hh> |
4 | | #include <qpdf/QPDFObjGen.hh> |
5 | | #include <qpdf/QPDFObjectHandle.hh> |
6 | | #include <qpdf/QPDFObject_private.hh> |
7 | | #include <qpdf/QPDFTokenizer_private.hh> |
8 | | #include <qpdf/QTC.hh> |
9 | | #include <qpdf/QUtil.hh> |
10 | | |
11 | | #include <memory> |
12 | | |
13 | | using namespace std::literals; |
14 | | using namespace qpdf; |
15 | | |
// Shorthand for std::shared_ptr<QPDFObject>; used for the shared null object in add_null().
using ObjectPtr = std::shared_ptr<QPDFObject>;

// Reference bound to whatever global::Limits::parser_max_nesting() returns. parse_remainder()
// compares stack_.size() against it each time a nested array or dictionary is opened.
// NOTE(review): this assumes parser_max_nesting() returns a reference to storage that is already
// usable during static initialization of this translation unit - confirm.
static uint32_t const& max_nesting{global::Limits::parser_max_nesting()};
19 | | |
20 | | // The ParseGuard class allows QPDFParser to detect re-entrant parsing. It also provides |
21 | | // special access to allow the parser to create unresolved objects and dangling references. |
22 | | class QPDF::Doc::ParseGuard |
23 | | { |
24 | | public: |
25 | | ParseGuard(QPDF* qpdf) : |
26 | 19.0M | objects(qpdf ? &qpdf->m->objects : nullptr) |
27 | 19.0M | { |
28 | 19.0M | if (objects) { |
29 | 19.0M | objects->inParse(true); |
30 | 19.0M | } |
31 | 19.0M | } |
32 | | |
33 | | static std::shared_ptr<QPDFObject> |
34 | | getObject(QPDF* qpdf, int id, int gen, bool parse_pdf) |
35 | 4.66M | { |
36 | 4.66M | return qpdf->m->objects.getObjectForParser(id, gen, parse_pdf); |
37 | 4.66M | } |
38 | | |
39 | | ~ParseGuard() |
40 | 19.0M | { |
41 | 19.0M | if (objects) { |
42 | 19.0M | objects->inParse(false); |
43 | 19.0M | } |
44 | 19.0M | } |
45 | | QPDF::Doc::Objects* objects; |
46 | | }; |
47 | | |
// File-local shorthands so the member definitions below can be written without full qualification.
using ParseGuard = QPDF::Doc::ParseGuard;
using Parser = qpdf::impl::Parser;
50 | | |
51 | | QPDFObjectHandle |
52 | | Parser::parse(InputSource& input, std::string const& object_description, QPDF* context) |
53 | 46.9k | { |
54 | 46.9k | qpdf::Tokenizer tokenizer; |
55 | 46.9k | if (auto result = Parser( |
56 | 46.9k | input, |
57 | 46.9k | make_description(input.getName(), object_description), |
58 | 46.9k | object_description, |
59 | 46.9k | tokenizer, |
60 | 46.9k | nullptr, |
61 | 46.9k | context, |
62 | 46.9k | false) |
63 | 46.9k | .parse()) { |
64 | 46.3k | return result; |
65 | 46.3k | } |
66 | 646 | return {QPDFObject::create<QPDF_Null>()}; |
67 | 46.9k | } |
68 | | |
69 | | std::pair<QPDFObjectHandle, bool> |
70 | | Parser::parse_content( |
71 | | InputSource& input, |
72 | | std::shared_ptr<QPDFObject::Description> sp_description, |
73 | | qpdf::Tokenizer& tokenizer, |
74 | | QPDF* context) |
75 | 16.9M | { |
76 | 16.9M | static const std::string content("content"); // GCC12 - make constexpr |
77 | 16.9M | auto p = Parser( |
78 | 16.9M | input, |
79 | 16.9M | std::move(sp_description), |
80 | 16.9M | content, |
81 | 16.9M | tokenizer, |
82 | 16.9M | nullptr, |
83 | 16.9M | context, |
84 | 16.9M | true, |
85 | 16.9M | 0, |
86 | 16.9M | 0, |
87 | 16.9M | context && context->doc().reconstructed_xref()); |
88 | 16.9M | if (auto result = p.parse(true)) { |
89 | 16.9M | return {result, false}; |
90 | 16.9M | } |
91 | 16.1k | return {{}, p.empty_}; |
92 | 16.9M | } |
93 | | |
94 | | QPDFObjectHandle |
95 | | Parser::parse( |
96 | | InputSource& input, |
97 | | std::string const& object_description, |
98 | | QPDFTokenizer& tokenizer, |
99 | | bool& empty, |
100 | | QPDFObjectHandle::StringDecrypter* decrypter, |
101 | | QPDF* context) |
102 | 0 | { |
103 | | // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the |
104 | | // only user of the 'empty' member. When removing this overload also remove 'empty'. |
105 | 0 | auto p = Parser( |
106 | 0 | input, |
107 | 0 | make_description(input.getName(), object_description), |
108 | 0 | object_description, |
109 | 0 | *tokenizer.m, |
110 | 0 | decrypter, |
111 | 0 | context, |
112 | 0 | false); |
113 | 0 | auto result = p.parse(); |
114 | 0 | empty = p.empty_; |
115 | 0 | if (result) { |
116 | 0 | return result; |
117 | 0 | } |
118 | 0 | return {QPDFObject::create<QPDF_Null>()}; |
119 | 0 | } |
120 | | |
121 | | QPDFObjectHandle |
122 | | Parser::parse( |
123 | | InputSource& input, |
124 | | std::string const& object_description, |
125 | | qpdf::Tokenizer& tokenizer, |
126 | | QPDFObjectHandle::StringDecrypter* decrypter, |
127 | | QPDF& context, |
128 | | bool sanity_checks) |
129 | 1.61M | { |
130 | 1.61M | return Parser( |
131 | 1.61M | input, |
132 | 1.61M | make_description(input.getName(), object_description), |
133 | 1.61M | object_description, |
134 | 1.61M | tokenizer, |
135 | 1.61M | decrypter, |
136 | 1.61M | &context, |
137 | 1.61M | true, |
138 | 1.61M | 0, |
139 | 1.61M | 0, |
140 | 1.61M | sanity_checks) |
141 | 1.61M | .parse(); |
142 | 1.61M | } |
143 | | |
144 | | QPDFObjectHandle |
145 | | Parser::parse( |
146 | | is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) |
147 | 409k | { |
148 | 409k | return Parser( |
149 | 409k | input, |
150 | 409k | std::make_shared<QPDFObject::Description>( |
151 | 409k | QPDFObject::ObjStreamDescr(stream_id, obj_id)), |
152 | 409k | "", |
153 | 409k | tokenizer, |
154 | 409k | nullptr, |
155 | 409k | &context, |
156 | 409k | true, |
157 | 409k | stream_id, |
158 | 409k | obj_id) |
159 | 409k | .parse(); |
160 | 409k | } |
161 | | |
162 | | QPDFObjectHandle |
163 | | Parser::parse(bool content_stream) |
164 | 19.0M | { |
165 | 19.0M | try { |
166 | 19.0M | return parse_first(content_stream); |
167 | 19.0M | } catch (Error&) { |
168 | 107k | return {}; |
169 | 107k | } catch (QPDFExc& e) { |
170 | 51.0k | throw e; |
171 | 51.0k | } catch (std::logic_error& e) { |
172 | 23 | throw e; |
173 | 17.3k | } catch (std::exception& e) { |
174 | 17.3k | warn("treating object as null because of error during parsing: "s + e.what()); |
175 | 17.3k | return {}; |
176 | 17.3k | } |
177 | 19.0M | } |
178 | | |
179 | | QPDFObjectHandle |
180 | | Parser::parse_first(bool content_stream) |
181 | 19.0M | { |
182 | | // This method must take care not to resolve any objects. Don't check the type of any object |
183 | | // without first ensuring that it is a direct object. Otherwise, doing so may have the side |
184 | | // effect of reading the object and changing the file pointer. If you do this, it will cause a |
185 | | // logic error to be thrown from QPDF::inParse(). |
186 | | |
187 | 19.0M | QPDF::Doc::ParseGuard pg(context_); |
188 | 19.0M | start_ = input_.tell(); |
189 | 19.0M | if (!tokenizer_.nextToken(input_, object_description_)) { |
190 | 26.9k | warn(tokenizer_.getErrorMessage()); |
191 | 26.9k | } |
192 | | |
193 | 19.0M | switch (tokenizer_.getType()) { |
194 | 14.2k | case QPDFTokenizer::tt_eof: |
195 | 14.2k | if (content_stream) { |
196 | | // In content stream mode, leave object uninitialized to indicate EOF |
197 | 7.64k | empty_ = true; |
198 | 7.64k | return {}; |
199 | 7.64k | } |
200 | 6.64k | warn("unexpected EOF"); |
201 | 6.64k | return {}; |
202 | | |
203 | 24.7k | case QPDFTokenizer::tt_bad: |
204 | 24.7k | return {}; |
205 | | |
206 | 2.31k | case QPDFTokenizer::tt_brace_open: |
207 | 4.39k | case QPDFTokenizer::tt_brace_close: |
208 | 4.39k | warn("treating unexpected brace token as null"); |
209 | 4.39k | return {}; |
210 | | |
211 | 6.94k | case QPDFTokenizer::tt_array_close: |
212 | 6.94k | warn("treating unexpected array close token as null"); |
213 | 6.94k | return {}; |
214 | | |
215 | 6.12k | case QPDFTokenizer::tt_dict_close: |
216 | 6.12k | warn("unexpected dictionary close token"); |
217 | 6.12k | return {}; |
218 | | |
219 | 141k | case QPDFTokenizer::tt_array_open: |
220 | 1.77M | case QPDFTokenizer::tt_dict_open: |
221 | 1.77M | stack_.clear(); |
222 | 1.77M | stack_.emplace_back( |
223 | 1.77M | input_, |
224 | 1.77M | (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key); |
225 | 1.77M | frame_ = &stack_.back(); |
226 | 1.77M | return parse_remainder(content_stream); |
227 | | |
228 | 6.90k | case QPDFTokenizer::tt_bool: |
229 | 6.90k | return with_description<QPDF_Bool>(tokenizer_.getValue() == "true"); |
230 | | |
231 | 2.71k | case QPDFTokenizer::tt_null: |
232 | 2.71k | return {QPDFObject::create<QPDF_Null>()}; |
233 | | |
234 | 400k | case QPDFTokenizer::tt_integer: |
235 | 400k | return with_description<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str())); |
236 | | |
237 | 125k | case QPDFTokenizer::tt_real: |
238 | 125k | return with_description<QPDF_Real>(tokenizer_.getValue()); |
239 | | |
240 | 133k | case QPDFTokenizer::tt_name: |
241 | 133k | return with_description<QPDF_Name>(tokenizer_.getValue()); |
242 | | |
243 | 16.5M | case QPDFTokenizer::tt_word: |
244 | 16.5M | { |
245 | 16.5M | auto const& value = tokenizer_.getValue(); |
246 | 16.5M | if (content_stream) { |
247 | 16.4M | return with_description<QPDF_Operator>(value); |
248 | 16.4M | } else if (value == "endobj") { |
249 | | // We just saw endobj without having read anything. Nothing in the PDF spec appears |
250 | | // to allow empty objects, but they have been encountered in actual PDF files and |
251 | | // Adobe Reader appears to ignore them. Treat this as a null and do not move the |
252 | | // input source's offset. |
253 | 2.85k | empty_ = true; |
254 | 2.85k | input_.seek(input_.getLastOffset(), SEEK_SET); |
255 | 2.85k | if (!content_stream) { |
256 | 2.85k | warn("empty object treated as null"); |
257 | 2.85k | } |
258 | 2.85k | return {}; |
259 | 93.8k | } else { |
260 | 93.8k | warn("unknown token while reading object; treating as string"); |
261 | 93.8k | return with_description<QPDF_String>(value); |
262 | 93.8k | } |
263 | 16.5M | } |
264 | | |
265 | 11.4k | case QPDFTokenizer::tt_string: |
266 | 11.4k | if (decrypter_) { |
267 | 1.60k | std::string s{tokenizer_.getValue()}; |
268 | 1.60k | decrypter_->decryptString(s); |
269 | 1.60k | return with_description<QPDF_String>(s); |
270 | 9.81k | } else { |
271 | 9.81k | return with_description<QPDF_String>(tokenizer_.getValue()); |
272 | 9.81k | } |
273 | | |
274 | 0 | default: |
275 | 0 | warn("treating unknown token type as null while reading object"); |
276 | 0 | return {}; |
277 | 19.0M | } |
278 | 19.0M | } |
279 | | |
// Parse the rest of an array or dictionary whose opening token parse_first() has already
// consumed. On entry stack_ holds exactly one frame for that outermost container.
//
// Tokens are read in a loop. Nested containers push and pop frames on stack_; frame_ always
// points at the innermost open container. The function returns
//  - the finished outermost container when its close token arrives with stack_.size() <= 1;
//  - an uninitialized handle on EOF;
//  - an uninitialized handle when sanity_checks_ is set and a mismatched close token or an
//    "endobj"/"endstream" word is seen.
// Helpers called from here (check_too_many_bad_tokens(), limits_error(), QIntC::to_int) may also
// leave by throwing; Parser::parse(bool) decides what happens to those exceptions.
QPDFObjectHandle
Parser::parse_remainder(bool content_stream)
{
    // This method must take care not to resolve any objects. Don't check the type of any object
    // without first ensuring that it is a direct object. Otherwise, doing so may have the side
    // effect of reading the object and changing the file pointer. If you do this, it will cause a
    // logic error to be thrown from QPDF::inParse().

    bad_count_ = 0;
    // True from the moment a "/Contents" dictionary key has been read (with a decrypter present)
    // until the next string token or the next nested container is seen.
    bool b_contents = false;

    while (true) {
        if (!tokenizer_.nextToken(input_, object_description_)) {
            warn(tokenizer_.getErrorMessage());
        }
        ++good_count_; // optimistically

        if (int_count_ != 0) {
            // Special handling of indirect references. Treat integer tokens as part of an indirect
            // reference until proven otherwise.
            //
            // int_buffer_ and last_offset_buffer_ act as two-slot buffers indexed by
            // int_count_ % 2, so they always hold the two most recently read integers.
            if (tokenizer_.getType() == QPDFTokenizer::tt_integer) {
                if (++int_count_ > 2) {
                    // Process the oldest buffered integer.
                    add_int(int_count_);
                }
                last_offset_buffer_[int_count_ % 2] = input_.getLastOffset();
                int_buffer_[int_count_ % 2] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
                continue;

            } else if (
                int_count_ >= 2 && tokenizer_.getType() == QPDFTokenizer::tt_word &&
                tokenizer_.getValue() == "R") {
                if (!context_) {
                    throw std::logic_error(
                        "Parser::parse called without context on an object with indirect "
                        "references");
                }
                // The older of the two buffered integers is the object id, the newer one the
                // generation.
                auto id = QIntC::to_int(int_buffer_[(int_count_ - 1) % 2]);
                auto gen = QIntC::to_int(int_buffer_[(int_count_) % 2]);
                if (!(id < 1 || gen < 0 || gen >= 65535)) {
                    add(ParseGuard::getObject(context_, id, gen, parse_pdf_));
                } else {
                    add_bad_null(
                        "treating bad indirect reference (" + std::to_string(id) + " " +
                        std::to_string(gen) + " R) as null");
                }
                int_count_ = 0;
                continue;

            } else if (int_count_ > 0) {
                // Process the buffered integers before processing the current token.
                if (int_count_ > 1) {
                    add_int(int_count_ - 1);
                }
                add_int(int_count_);
                int_count_ = 0;
            }
        }

        switch (tokenizer_.getType()) {
        case QPDFTokenizer::tt_eof:
            warn("parse error while reading object");
            if (content_stream) {
                // In content stream mode, leave object uninitialized to indicate EOF
                return {};
            }
            warn("unexpected EOF");
            return {};

        case QPDFTokenizer::tt_bad:
            // The tokenizer's message was already reported at the top of the loop.
            check_too_many_bad_tokens();
            add_null();
            continue;

        case QPDFTokenizer::tt_brace_open:
        case QPDFTokenizer::tt_brace_close:
            add_bad_null("treating unexpected brace token as null");
            continue;

        case QPDFTokenizer::tt_array_close:
            if (frame_->state == st_array) {
                // NOTE(review): the extra `true` used for arrays with more than 100 nulls
                // presumably selects a sparse representation - confirm against QPDF_Array.
                auto object = frame_->null_count > 100
                    ? QPDFObject::create<QPDF_Array>(std::move(frame_->olist), true)
                    : QPDFObject::create<QPDF_Array>(std::move(frame_->olist));
                set_description(object, frame_->offset - 1);
                // The `offset` points just past "[". Set the rewind offset to point to the
                // beginning of "[". This has been explicitly tested with whitespace surrounding the
                // array start delimiter. getLastOffset points to the array end token and therefore
                // can't be used here.
                if (stack_.size() <= 1) {
                    // Outermost container is complete.
                    return object;
                }
                stack_.pop_back();
                frame_ = &stack_.back();
                add(std::move(object));
            } else {
                if (sanity_checks_) {
                    // During sanity checks, assume nesting of containers is corrupt and object is
                    // unusable.
                    warn("unexpected array close token; giving up on reading object");
                    return {};
                }
                add_bad_null("treating unexpected array close token as null");
            }
            continue;

        case QPDFTokenizer::tt_dict_close:
            // The comparison relies on the declaration order of the state enumerators: it is
            // meant to be true exactly when the current frame is a dictionary.
            if (frame_->state <= st_dictionary_value) {
                // Attempt to recover more or less gracefully from invalid dictionaries.
                auto& dict = frame_->dict;

                if (frame_->state == st_dictionary_value) {
                    warn(
                        frame_->offset,
                        "dictionary ended prematurely; using null as value for last key");
                    dict[frame_->key] = QPDFObject::create<QPDF_Null>();
                }
                // olist of a dictionary frame holds objects that appeared where a key was
                // expected (see add()).
                if (!frame_->olist.empty()) {
                    if (sanity_checks_) {
                        warn(
                            frame_->offset,
                            "expected dictionary keys but found non-name objects; ignoring");
                    } else {
                        fix_missing_keys();
                    }
                }

                // For a dictionary with /Type /Sig, a /ByteRange entry and a string /Contents,
                // replace /Contents with the string exactly as tokenized, i.e. the value saved
                // before the decrypter was applied.
                if (!frame_->contents_string.empty() && dict.contains("/Type") &&
                    dict["/Type"].isNameAndEquals("/Sig") && dict.contains("/ByteRange") &&
                    dict.contains("/Contents") && dict["/Contents"].isString()) {
                    dict["/Contents"] = QPDFObjectHandle::newString(frame_->contents_string);
                    dict["/Contents"].setParsedOffset(frame_->contents_offset);
                }
                auto object = QPDFObject::create<QPDF_Dictionary>(std::move(dict));
                set_description(object, frame_->offset - 2);
                // The `offset` points just past "<<". Set the rewind offset to point to the
                // beginning of "<<". This has been explicitly tested with whitespace surrounding
                // the dictionary start delimiter. getLastOffset points to the dictionary end token
                // and therefore can't be used here.
                if (stack_.size() <= 1) {
                    // Outermost container is complete.
                    return object;
                }
                stack_.pop_back();
                frame_ = &stack_.back();
                add(std::move(object));
            } else {
                if (sanity_checks_) {
                    // During sanity checks, assume nesting of containers is corrupt and object is
                    // unusable.
                    warn("unexpected dictionary close token; giving up on reading object");
                    return {};
                }
                add_bad_null("unexpected dictionary close token");
            }
            continue;

        case QPDFTokenizer::tt_array_open:
        case QPDFTokenizer::tt_dict_open:
            // NOTE(review): limits_error() presumably does not return; if it did, the new frame
            // would still be pushed below - confirm.
            if (stack_.size() > max_nesting) {
                limits_error(
                    "parser-max-nesting", "ignoring excessively deeply nested data structure");
            }
            b_contents = false;
            stack_.emplace_back(
                input_,
                (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array
                                                                       : st_dictionary_key);
            frame_ = &stack_.back();
            continue;

        case QPDFTokenizer::tt_bool:
            add_scalar<QPDF_Bool>(tokenizer_.getValue() == "true");
            continue;

        case QPDFTokenizer::tt_null:
            add_null();
            continue;

        case QPDFTokenizer::tt_integer:
            if (!content_stream) {
                // Buffer token in case it is part of an indirect reference.
                last_offset_buffer_[1] = input_.getLastOffset();
                int_buffer_[1] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
                int_count_ = 1;
            } else {
                // In content stream mode integers are never buffered as potential references.
                add_scalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str()));
            }
            continue;

        case QPDFTokenizer::tt_real:
            add_scalar<QPDF_Real>(tokenizer_.getValue());
            continue;

        case QPDFTokenizer::tt_name:
            if (frame_->state == st_dictionary_key) {
                // A name in key position becomes the pending key; the next object is its value.
                frame_->key = tokenizer_.getValue();
                frame_->state = st_dictionary_value;
                b_contents = decrypter_ && frame_->key == "/Contents";
                continue;
            } else {
                add_scalar<QPDF_Name>(tokenizer_.getValue());
            }
            continue;

        case QPDFTokenizer::tt_word:
            if (content_stream) {
                add_scalar<QPDF_Operator>(tokenizer_.getValue());
                continue;
            }

            if (sanity_checks_) {
                if (tokenizer_.getValue() == "endobj" || tokenizer_.getValue() == "endstream") {
                    // During sanity checks, assume an unexpected endobj or endstream indicates that
                    // we are parsing past the end of the object.
                    warn(
                        "unexpected 'endobj' or 'endstream' while reading object; giving up on "
                        "reading object");
                    return {};
                }

                add_bad_null("unknown token while reading object; treating as null");
                continue;
            }

            warn("unknown token while reading object; treating as string");
            check_too_many_bad_tokens();
            add_scalar<QPDF_String>(tokenizer_.getValue());

            continue;

        case QPDFTokenizer::tt_string:
            {
                auto const& val = tokenizer_.getValue();
                if (decrypter_) {
                    if (b_contents) {
                        // Remember the string as tokenized, together with its offset, for the
                        // /Sig handling in the tt_dict_close case.
                        frame_->contents_string = val;
                        frame_->contents_offset = input_.getLastOffset();
                        b_contents = false;
                    }
                    // Decrypt a private copy; `val` still refers to the tokenizer's value.
                    std::string s{val};
                    decrypter_->decryptString(s);
                    add_scalar<QPDF_String>(s);
                } else {
                    add_scalar<QPDF_String>(val);
                }
            }
            continue;

        default:
            // Falls out of the switch; the enclosing loop then reads the next token.
            add_bad_null("treating unknown token type as null while reading object");
        }
    }
}
533 | | |
534 | | void |
535 | | Parser::add(std::shared_ptr<QPDFObject>&& obj) |
536 | 92.2M | { |
537 | 92.2M | if (frame_->state != st_dictionary_value) { |
538 | | // If state is st_dictionary_key then there is a missing key. Push onto olist for |
539 | | // processing once the tt_dict_close token has been found. |
540 | 83.7M | frame_->olist.emplace_back(std::move(obj)); |
541 | 83.7M | } else { |
542 | 8.48M | if (auto res = frame_->dict.insert_or_assign(frame_->key, std::move(obj)); !res.second) { |
543 | 89.6k | warn_duplicate_key(); |
544 | 89.6k | } |
545 | 8.48M | frame_->state = st_dictionary_key; |
546 | 8.48M | } |
547 | 92.2M | } |
548 | | |
549 | | void |
550 | | Parser::add_null() |
551 | 3.49M | { |
552 | 3.49M | const static ObjectPtr null_obj = QPDFObject::create<QPDF_Null>(); |
553 | | |
554 | 3.49M | if (frame_->state != st_dictionary_value) { |
555 | | // If state is st_dictionary_key then there is a missing key. Push onto olist for |
556 | | // processing once the tt_dict_close token has been found. |
557 | 3.13M | frame_->olist.emplace_back(null_obj); |
558 | 3.13M | } else { |
559 | 360k | if (auto res = frame_->dict.insert_or_assign(frame_->key, null_obj); !res.second) { |
560 | 16.8k | warn_duplicate_key(); |
561 | 16.8k | } |
562 | 360k | frame_->state = st_dictionary_key; |
563 | 360k | } |
564 | 3.49M | ++frame_->null_count; |
565 | 3.49M | } |
566 | | |
// Report `msg` as a warning and add a null in place of the offending token.
// check_too_many_bad_tokens() runs between the two steps; according to the note in add_scalar()
// it can throw Error, in which case no null is added.
void
Parser::add_bad_null(std::string const& msg)
{
    warn(msg);
    check_too_many_bad_tokens();
    add_null();
}
574 | | |
575 | | void |
576 | | Parser::add_int(int count) |
577 | 28.9M | { |
578 | 28.9M | auto obj = QPDFObject::create<QPDF_Integer>(int_buffer_[count % 2]); |
579 | 28.9M | obj->setDescription(context_, description_, last_offset_buffer_[count % 2]); |
580 | 28.9M | add(std::move(obj)); |
581 | 28.9M | } |
582 | | |
583 | | template <typename T, typename... Args> |
584 | | void |
585 | | Parser::add_scalar(Args&&... args) |
586 | 56.6M | { |
587 | 56.6M | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); |
588 | 56.6M | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { |
589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is |
590 | | // encountered. |
591 | 4.53k | max_bad_count_ = 1; |
592 | 4.53k | check_too_many_bad_tokens(); // always throws Error() |
593 | 4.53k | } |
594 | 56.6M | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); |
595 | 56.6M | obj->setDescription(context_, description_, input_.getLastOffset()); |
596 | 56.6M | add(std::move(obj)); |
597 | 56.6M | } void qpdf::impl::Parser::add_scalar<QPDF_Bool, bool>(bool&&) Line | Count | Source | 586 | 119k | { | 587 | 119k | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 119k | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 494 | max_bad_count_ = 1; | 592 | 494 | check_too_many_bad_tokens(); // always throws Error() | 593 | 494 | } | 594 | 119k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 119k | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 119k | add(std::move(obj)); | 597 | 119k | } |
void qpdf::impl::Parser::add_scalar<QPDF_Integer, long long>(long long&&) Line | Count | Source | 586 | 471k | { | 587 | 471k | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 471k | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 41 | max_bad_count_ = 1; | 592 | 41 | check_too_many_bad_tokens(); // always throws Error() | 593 | 41 | } | 594 | 471k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 471k | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 471k | add(std::move(obj)); | 597 | 471k | } |
void qpdf::impl::Parser::add_scalar<QPDF_Real, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 586 | 1.06M | { | 587 | 1.06M | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 1.06M | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 921 | max_bad_count_ = 1; | 592 | 921 | check_too_many_bad_tokens(); // always throws Error() | 593 | 921 | } | 594 | 1.06M | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 1.06M | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 1.06M | add(std::move(obj)); | 597 | 1.06M | } |
void qpdf::impl::Parser::add_scalar<QPDF_Name, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 586 | 52.2M | { | 587 | 52.2M | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 52.2M | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 2.11k | max_bad_count_ = 1; | 592 | 2.11k | check_too_many_bad_tokens(); // always throws Error() | 593 | 2.11k | } | 594 | 52.2M | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 52.2M | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 52.2M | add(std::move(obj)); | 597 | 52.2M | } |
void qpdf::impl::Parser::add_scalar<QPDF_Operator, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 586 | 867k | { | 587 | 867k | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 867k | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 134 | max_bad_count_ = 1; | 592 | 134 | check_too_many_bad_tokens(); // always throws Error() | 593 | 134 | } | 594 | 867k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 867k | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 867k | add(std::move(obj)); | 597 | 867k | } |
void qpdf::impl::Parser::add_scalar<QPDF_String, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 586 | 1.57M | { | 587 | 1.57M | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 1.57M | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 721 | max_bad_count_ = 1; | 592 | 721 | check_too_many_bad_tokens(); // always throws Error() | 593 | 721 | } | 594 | 1.57M | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 1.57M | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 1.57M | add(std::move(obj)); | 597 | 1.57M | } |
void qpdf::impl::Parser::add_scalar<QPDF_String, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&) Line | Count | Source | 586 | 277k | { | 587 | 277k | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); | 588 | 277k | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { | 589 | | // Stop adding scalars. We are going to abort when the close token or a bad token is | 590 | | // encountered. | 591 | 111 | max_bad_count_ = 1; | 592 | 111 | check_too_many_bad_tokens(); // always throws Error() | 593 | 111 | } | 594 | 277k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 595 | 277k | obj->setDescription(context_, description_, input_.getLastOffset()); | 596 | 277k | add(std::move(obj)); | 597 | 277k | } |
|
598 | | |
// Create an object of type T from the forwarded constructor arguments, attach this parser's
// context and description to it (using start_ as the offset), and return it wrapped in a
// QPDFObjectHandle.
template <typename T, typename... Args>
QPDFObjectHandle
Parser::with_description(Args&&... args)
{
    auto created = QPDFObject::create<T>(std::forward<Args>(args)...);
    created->setDescription(context_, description_, start_);
    return {created};
}
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_Integer, long long>(long long&&) Line | Count | Source | 602 | 393k | { | 603 | 393k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 393k | obj->setDescription(context_, description_, start_); | 605 | 393k | return {obj}; | 606 | 393k | } |
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_Real, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 602 | 125k | { | 603 | 125k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 125k | obj->setDescription(context_, description_, start_); | 605 | 125k | return {obj}; | 606 | 125k | } |
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_Name, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 602 | 133k | { | 603 | 133k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 133k | obj->setDescription(context_, description_, start_); | 605 | 133k | return {obj}; | 606 | 133k | } |
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_Operator, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 602 | 16.4M | { | 603 | 16.4M | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 16.4M | obj->setDescription(context_, description_, start_); | 605 | 16.4M | return {obj}; | 606 | 16.4M | } |
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_String, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) Line | Count | Source | 602 | 99.6k | { | 603 | 99.6k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 99.6k | obj->setDescription(context_, description_, start_); | 605 | 99.6k | return {obj}; | 606 | 99.6k | } |
QPDFObjectHandle qpdf::impl::Parser::with_description<QPDF_String, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&) Line | Count | Source | 602 | 1.59k | { | 603 | 1.59k | auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); | 604 | 1.59k | obj->setDescription(context_, description_, start_); | 605 | 1.59k | return {obj}; | 606 | 1.59k | } |
|
607 | | |
608 | | void |
609 | | Parser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset) |
610 | 3.47M | { |
611 | 3.47M | if (obj) { |
612 | 3.47M | obj->setDescription(context_, description_, parsed_offset); |
613 | 3.47M | } |
614 | 3.47M | } |
615 | | |
616 | | void |
617 | | Parser::fix_missing_keys() |
618 | 33.8k | { |
619 | 33.8k | std::set<std::string> names; |
620 | 140k | for (auto& obj: frame_->olist) { |
621 | 140k | if (obj.raw_type_code() == ::ot_name) { |
622 | 6.35k | names.insert(obj.getName()); |
623 | 6.35k | } |
624 | 140k | } |
625 | 33.8k | int next_fake_key = 1; |
626 | 134k | for (auto const& item: frame_->olist) { |
627 | 135k | while (true) { |
628 | 135k | const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); |
629 | 135k | const bool found_fake = !frame_->dict.contains(key) && !names.contains(key); |
630 | 135k | QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); |
631 | 135k | if (found_fake) { |
632 | 134k | warn( |
633 | 134k | frame_->offset, |
634 | 134k | "expected dictionary key but found non-name object; inserting key " + key); |
635 | 134k | frame_->dict[key] = item; |
636 | 134k | break; |
637 | 134k | } |
638 | 135k | } |
639 | 134k | } |
640 | 33.8k | } |
641 | | |
642 | | void |
643 | | Parser::check_too_many_bad_tokens() |
644 | 2.85M | { |
645 | 2.85M | auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); |
646 | 2.85M | if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { |
647 | 4.63k | if (bad_count_) { |
648 | 3.68k | limits_error( |
649 | 3.68k | "parser-max-container-size-damaged", |
650 | 3.68k | "encountered errors while parsing an array or dictionary with more than " + |
651 | 3.68k | std::to_string(limit) + " elements; giving up on reading object"); |
652 | 3.68k | } |
653 | 4.63k | limits_error( |
654 | 4.63k | "parser-max-container-size", |
655 | 4.63k | "encountered an array or dictionary with more than " + std::to_string(limit) + |
656 | 4.63k | " elements during xref recovery; giving up on reading object"); |
657 | 4.63k | } |
658 | 2.85M | if (max_bad_count_ && --max_bad_count_ == 0) { |
659 | 17.7k | limits_error( |
660 | 17.7k | "parser-max-errors", "too many errors during parsing; treating object as null"); |
661 | 17.7k | } |
662 | 2.85M | if (good_count_ > 4) { |
663 | 1.00M | good_count_ = 0; |
664 | 1.00M | bad_count_ = 1; |
665 | 1.00M | return; |
666 | 1.00M | } |
667 | 1.85M | if (++bad_count_ > 5 || |
668 | 1.76M | (frame_->state != st_array && std::cmp_less(max_bad_count_, frame_->olist.size()))) { |
669 | | // Give up after 5 errors in close proximity or if the number of missing dictionary keys |
670 | | // exceeds the remaining number of allowable total errors. |
671 | 84.9k | warn("too many errors; giving up on reading object"); |
672 | 84.9k | throw Error(); |
673 | 84.9k | } |
674 | 1.76M | good_count_ = 0; |
675 | 1.76M | } |
676 | | |
677 | | void |
678 | | Parser::limits_error(std::string const& limit, std::string const& msg) |
679 | 23.4k | { |
680 | 23.4k | Limits::error(); |
681 | 23.4k | warn("limits error("s + limit + "): " + msg); |
682 | 23.4k | throw Error(); |
683 | 23.4k | } |
684 | | |
685 | | void |
686 | | Parser::warn(QPDFExc const& e) const |
687 | 4.18M | { |
688 | | // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the |
689 | | // object. If parsing for some other reason, such as an explicit creation of an object from a |
690 | | // string, then just throw the exception. |
691 | 4.18M | if (context_) { |
692 | 4.18M | context_->warn(e); |
693 | 4.18M | } else { |
694 | 646 | throw e; |
695 | 646 | } |
696 | 4.18M | } |
697 | | |
698 | | void |
699 | | Parser::warn_duplicate_key() |
700 | 106k | { |
701 | 106k | warn( |
702 | 106k | frame_->offset, |
703 | 106k | "dictionary has duplicated key " + frame_->key + |
704 | 106k | "; last occurrence overrides earlier ones"); |
705 | 106k | check_too_many_bad_tokens(); |
706 | 106k | } |
707 | | |
708 | | void |
709 | | Parser::warn(qpdf_offset_t offset, std::string const& msg) const |
710 | 4.18M | { |
711 | 4.18M | if (stream_id_) { |
712 | 267k | std::string descr = "object "s + std::to_string(obj_id_) + " 0"; |
713 | 267k | std::string name = context_->getFilename() + " object stream " + std::to_string(stream_id_); |
714 | 267k | warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); |
715 | 3.91M | } else { |
716 | 3.91M | warn(QPDFExc(qpdf_e_damaged_pdf, input_.getName(), object_description_, offset, msg)); |
717 | 3.91M | } |
718 | 4.18M | } |
719 | | |
720 | | void |
721 | | Parser::warn(std::string const& msg) const |
722 | 3.34M | { |
723 | 3.34M | warn(input_.getLastOffset(), msg); |
724 | 3.34M | } |