/src/poppler/poppler/Parser.cc
Line | Count | Source |
1 | | //======================================================================== |
2 | | // |
3 | | // Parser.cc |
4 | | // |
5 | | // Copyright 1996-2003 Glyph & Cog, LLC |
6 | | // |
7 | | //======================================================================== |
8 | | |
9 | | //======================================================================== |
10 | | // |
11 | | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | | // |
13 | | // All changes made under the Poppler project to this file are licensed |
14 | | // under GPL version 2 or later |
15 | | // |
16 | | // Copyright (C) 2006, 2009, 201, 2010, 2013, 2014, 2017-2020, 2025, 2026 Albert Astals Cid <aacid@kde.org> |
17 | | // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com> |
18 | | // Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com> |
19 | | // Copyright (C) 2012 Hib Eris <hib@hiberis.nl> |
20 | | // Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com> |
21 | | // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> |
22 | | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
23 | | // Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de> |
24 | | // Copyright (C) 2018 Marek Kasik <mkasik@redhat.com> |
25 | | // Copyright (C) 2024 Nelson Benítez León <nbenitezl@gmail.com> |
26 | | // Copyright (C) 2024-2026 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
27 | | // Copyright (C) 2025 Arnav V <arnav0872@gmail.com> |
28 | | // |
29 | | // To see a description of the changes please see the Changelog file that |
30 | | // came with your tarball or type make ChangeLog if you are building from git |
31 | | // |
32 | | //======================================================================== |
33 | | |
34 | | #include <config.h> |
35 | | |
36 | | #include <climits> |
37 | | #include "Object.h" |
38 | | #include "Array.h" |
39 | | #include "Dict.h" |
40 | | #include "Decrypt.h" |
41 | | #include "Parser.h" |
42 | | #include "XRef.h" |
43 | | #include "Error.h" |
44 | | |
// Maximum nesting depth of objects accepted by the parser. The limit
// catches infinite loops in the object structure, and also technically
// valid files whose deeply nested arrays would otherwise make us
// consume all the stack.
constexpr int recursionLimit = 500;
49 | | |
50 | 23.9M | Parser::Parser(XRef *xrefA, std::unique_ptr<Stream> &&streamA, bool allowStreamsA) : lexer { xrefA, std::move(streamA) } |
51 | 23.9M | { |
52 | 23.9M | allowStreams = allowStreamsA; |
53 | 23.9M | buf1 = lexer.getObj(); |
54 | 23.9M | buf2 = lexer.getObj(); |
55 | 23.9M | inlineImg = 0; |
56 | 23.9M | } |
57 | | |
58 | 1.66M | Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA } |
59 | 1.66M | { |
60 | 1.66M | allowStreams = allowStreamsA; |
61 | 1.66M | buf1 = lexer.getObj(); |
62 | 1.66M | buf2 = lexer.getObj(); |
63 | 1.66M | inlineImg = 0; |
64 | 1.66M | } |
65 | | |
// Defaulted out of line; the members release whatever they hold.
Parser::~Parser() = default;
67 | | |
68 | | Object Parser::getObj(int recursion) |
69 | 39.6M | { |
70 | 39.6M | return getObj(false, nullptr, cryptRC4, 0, 0, 0, recursion); |
71 | 39.6M | } |
72 | | |
73 | | static std::unique_ptr<GooString> decryptedString(const std::string &s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen) |
74 | 231k | { |
75 | 231k | DecryptStream decrypt(std::make_unique<MemStream>(s.c_str(), 0, s.size(), Object::null()), fileKey, encAlgorithm, keyLength, { .num = objNum, .gen = objGen }); |
76 | 231k | if (!decrypt.rewind()) { |
77 | 0 | return {}; |
78 | 0 | } |
79 | 231k | std::unique_ptr<GooString> res = std::make_unique<GooString>(); |
80 | 231k | int c; |
81 | 19.3M | while ((c = decrypt.getChar()) != EOF) { |
82 | 19.1M | res->push_back((char)c); |
83 | 19.1M | } |
84 | 231k | return res; |
85 | 231k | } |
86 | | |
87 | | Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString) |
88 | 907M | { |
89 | 907M | Object obj; |
90 | | |
91 | | // refill buffer after inline image data |
92 | 907M | if (inlineImg == 2) { |
93 | 453k | buf1 = lexer.getObj(); |
94 | 453k | buf2 = lexer.getObj(); |
95 | 453k | inlineImg = 0; |
96 | 453k | } |
97 | | |
98 | 907M | if (unlikely(recursion >= recursionLimit)) { |
99 | 5.92k | return Object::error(); |
100 | 5.92k | } |
101 | | |
102 | | // array |
103 | 907M | if (!simpleOnly && buf1.isCmd("[")) { |
104 | 24.8M | shift(); |
105 | 24.8M | obj = Object(std::make_unique<Array>(lexer.getXRef())); |
106 | 330M | while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) { |
107 | 305M | Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1); |
108 | 305M | obj.arrayAdd(std::move(obj2)); |
109 | 305M | } |
110 | 24.8M | if (recursion + 1 >= recursionLimit && strict) { |
111 | 0 | goto err; |
112 | 0 | } |
113 | 24.8M | if (buf1.isEOF()) { |
114 | 1.68M | error(errSyntaxError, getPos(), "End of file inside array"); |
115 | 1.68M | if (strict) { |
116 | 1 | goto err; |
117 | 1 | } |
118 | 1.68M | } |
119 | 24.8M | shift(); |
120 | | |
121 | | // dictionary or stream |
122 | 882M | } else if (!simpleOnly && buf1.isCmd("<<")) { |
123 | 26.6M | shift(objNum); |
124 | 26.6M | obj = Object(std::make_unique<Dict>(lexer.getXRef())); |
125 | 26.6M | bool hasContentsEntry = false; |
126 | 332M | while (!buf1.isCmd(">>") && !buf1.isEOF()) { |
127 | 306M | if (!buf1.isName()) { |
128 | 111M | error(errSyntaxError, getPos(), "Dictionary key must be a name object"); |
129 | 111M | if (strict) { |
130 | 48 | goto err; |
131 | 48 | } |
132 | 111M | shift(); |
133 | 194M | } else { |
134 | | // buf1 will go away in shift(), so keep the key |
135 | 194M | const auto key = std::move(buf1); |
136 | 194M | shift(); |
137 | 194M | if (buf1.isEOF() || buf1.isError()) { |
138 | 710k | if (strict && buf1.isError()) { |
139 | 8 | goto err; |
140 | 8 | } |
141 | 710k | break; |
142 | 710k | } |
143 | | // We don't decrypt strings that are the value of "Contents" key entries. We decrypt them if needed a few lines below. |
144 | | // The "Contents" field of Sig dictionaries is not encrypted, but we can't know the type of the dictionary here yet |
145 | | // so we don't decrypt any Contents and if later we find it's not a Sig dictionary we decrypt it |
146 | 193M | const bool isContents = !hasContentsEntry && key.isName("Contents"); |
147 | 193M | hasContentsEntry = hasContentsEntry || isContents; |
148 | 193M | Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents); |
149 | 193M | if (unlikely(recursion + 1 >= recursionLimit)) { |
150 | 5.92k | break; |
151 | 5.92k | } |
152 | 193M | obj.dictAdd(key.getName(), std::move(obj2)); |
153 | 193M | } |
154 | 306M | } |
155 | 26.6M | if (buf1.isEOF()) { |
156 | 1.22M | error(errSyntaxError, getPos(), "End of file inside dictionary"); |
157 | 1.22M | if (strict) { |
158 | 16 | goto err; |
159 | 16 | } |
160 | 1.22M | } |
161 | 26.6M | if (fileKey && hasContentsEntry) { |
162 | 9.57k | Dict *dict = obj.getDict(); |
163 | 9.57k | const bool isSigDict = dict->is("Sig"); |
164 | 9.57k | if (!isSigDict) { |
165 | 8.97k | const Object &contentsObj = dict->lookupNF("Contents"); |
166 | 8.97k | if (contentsObj.isString()) { |
167 | 4.79k | std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen); |
168 | 4.79k | dict->set("Contents", Object(std::move(s))); |
169 | 4.79k | } |
170 | 8.97k | } |
171 | 9.57k | } |
172 | | // stream objects are not allowed inside content streams or |
173 | | // object streams |
174 | 26.6M | if (buf2.isCmd("stream")) { |
175 | 8.04M | if (allowStreams) { |
176 | 7.80M | if (auto str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict)) { |
177 | 7.78M | return Object(std::move(str)); |
178 | 7.78M | } |
179 | 7.80M | } |
180 | 259k | return Object::error(); |
181 | 8.04M | } |
182 | 18.5M | shift(); |
183 | | |
184 | | // indirect reference or integer |
185 | 855M | } else if (buf1.isInt()) { |
186 | 343M | const int num = buf1.getInt(); |
187 | 343M | shift(); |
188 | 343M | if (buf1.isInt() && buf2.isCmd("R")) { |
189 | 46.3M | const int gen = buf1.getInt(); |
190 | 46.3M | shift(); |
191 | 46.3M | shift(); |
192 | | |
193 | 46.3M | if (unlikely(num <= 0 || gen < 0)) { |
194 | 887k | return Object(); |
195 | 887k | } |
196 | | |
197 | 45.4M | Ref r; |
198 | 45.4M | r.num = num; |
199 | 45.4M | r.gen = gen; |
200 | 45.4M | return Object(r); |
201 | 46.3M | } |
202 | 297M | return Object(num); |
203 | | |
204 | | // string |
205 | 512M | } else if (decryptString && buf1.isString() && fileKey) { |
206 | 226k | std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen); |
207 | 226k | obj = Object(std::move(s2)); |
208 | 226k | shift(); |
209 | | |
210 | | // simple object |
211 | 511M | } else { |
212 | | // avoid re-allocating memory for complex objects like strings by |
213 | | // shallow copy of <buf1> to <obj> and nulling <buf1> so that |
214 | | // subsequent buf1.free() won't free this memory |
215 | 511M | obj = std::move(buf1); |
216 | 511M | shift(); |
217 | 511M | } |
218 | | |
219 | 555M | return obj; |
220 | | |
221 | 73 | err: |
222 | 73 | return Object::error(); |
223 | 907M | } |
224 | | |
225 | | std::unique_ptr<Stream> Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict) |
226 | 7.80M | { |
227 | 7.80M | BaseStream *baseStr; |
228 | 7.80M | Goffset length; |
229 | 7.80M | Goffset pos, endPos; |
230 | | |
231 | 7.80M | if (XRef *xref = lexer.getXRef()) { |
232 | 7.43M | XRefEntry *entry = xref->getEntry(objNum, false); |
233 | 7.43M | if (entry) { |
234 | 7.43M | if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) { |
235 | 7.42M | entry->setFlag(XRefEntry::Parsing, true); |
236 | 7.42M | } else { |
237 | 8.07k | error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is being already parsed", objNum, objGen); |
238 | 8.07k | return nullptr; |
239 | 8.07k | } |
240 | 7.43M | } |
241 | 7.43M | } |
242 | | |
243 | | // get stream start position |
244 | 7.79M | lexer.skipToNextLine(); |
245 | 7.79M | Stream *lexerStream; |
246 | 7.79M | if (!(lexerStream = lexer.getStream())) { |
247 | 889 | return nullptr; |
248 | 889 | } |
249 | 7.79M | pos = lexerStream->getPos(); |
250 | | |
251 | | // get length |
252 | 7.79M | Object obj = dict.dictLookup("Length", recursion); |
253 | 7.79M | if (obj.isInt()) { |
254 | 6.83M | length = obj.getInt(); |
255 | 6.83M | } else if (obj.isInt64()) { |
256 | 3.20k | length = obj.getInt64(); |
257 | 951k | } else { |
258 | 951k | error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream"); |
259 | 951k | if (strict) { |
260 | 19 | return nullptr; |
261 | 19 | } |
262 | 951k | length = 0; |
263 | 951k | } |
264 | | |
265 | | // check for length in damaged file |
266 | 7.79M | if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) { |
267 | 5.88M | length = endPos - pos; |
268 | 5.88M | } |
269 | | |
270 | | // in badly damaged PDF files, we can run off the end of the input |
271 | | // stream immediately after the "stream" token |
272 | 7.79M | if (!lexer.getStream()) { |
273 | 0 | return nullptr; |
274 | 0 | } |
275 | 7.79M | baseStr = lexer.getStream()->getBaseStream(); |
276 | | |
277 | | // skip over stream data |
278 | 7.79M | if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) { |
279 | | // take into account the fact that we've cached one value |
280 | 195k | pos = pos - 1; |
281 | 195k | lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED; |
282 | 195k | } |
283 | 7.79M | if (unlikely(length < 0)) { |
284 | 1.39k | return nullptr; |
285 | 1.39k | } |
286 | 7.78M | if (unlikely(pos > LLONG_MAX - length)) { |
287 | 183 | return nullptr; |
288 | 183 | } |
289 | 7.78M | lexer.setPos(pos + length); |
290 | | |
291 | | // refill token buffers and check for 'endstream' |
292 | 7.78M | shift(); // kill '>>' |
293 | 7.78M | shift("endstream", objNum); // kill 'stream' |
294 | 7.78M | if (buf1.isCmd("endstream")) { |
295 | 6.65M | shift(); |
296 | 6.65M | } else { |
297 | 1.12M | error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length"); |
298 | 1.12M | if (strict) { |
299 | 67 | return nullptr; |
300 | 67 | } |
301 | 1.12M | if (lexer.hasXRef() && lexer.getStream()) { |
302 | | // shift until we find the proper endstream or we change to another object or reach eof |
303 | 425k | length = lexer.getPos() - pos; |
304 | 425k | if (buf1.isCmd("endstream")) { |
305 | 0 | dict.dictSet("Length", Object(length)); |
306 | 0 | } |
307 | 704k | } else { |
308 | | // When building the xref we can't use it so use this |
309 | | // kludge for broken PDF files: just add 5k to the length, and |
310 | | // hope its enough |
311 | 704k | if (length < LLONG_MAX - pos - 5000) { |
312 | 703k | length += 5000; |
313 | 703k | } |
314 | 704k | } |
315 | 1.12M | } |
316 | | |
317 | | // make base stream |
318 | 7.78M | auto str = baseStr->makeSubStream(pos, true, length, std::move(dict)); |
319 | | |
320 | | // handle decryption |
321 | 7.78M | if (fileKey) { |
322 | 69.4k | str = std::make_unique<DecryptStream>(std::move(str), fileKey, encAlgorithm, keyLength, Ref { .num = objNum, .gen = objGen }); |
323 | 69.4k | } |
324 | | |
325 | | // get filters |
326 | 7.78M | Dict *streamDict = str->getDict(); |
327 | 7.78M | str = Stream::addFilters(std::move(str), streamDict, recursion); |
328 | | |
329 | 7.78M | if (XRef *xref = lexer.getXRef()) { |
330 | | // Don't try to reuse the entry from the block at the start |
331 | | // of the function, xref can change in the middle because of |
332 | | // reconstruction |
333 | 7.42M | XRefEntry *entry = xref->getEntry(objNum, false); |
334 | 7.42M | if (entry) { |
335 | 7.42M | entry->setFlag(XRefEntry::Parsing, false); |
336 | 7.42M | } |
337 | 7.42M | } |
338 | | |
339 | 7.78M | return str; |
340 | 7.78M | } |
341 | | |
342 | | void Parser::shift(int objNum) |
343 | 1.36G | { |
344 | 1.36G | if (inlineImg > 0) { |
345 | 520k | if (inlineImg < 2) { |
346 | 487k | ++inlineImg; |
347 | 487k | } else { |
348 | | // in a damaged content stream, if 'ID' shows up in the middle |
349 | | // of a dictionary, we need to reset |
350 | 33.4k | inlineImg = 0; |
351 | 33.4k | } |
352 | 1.36G | } else if (buf2.isCmd("ID")) { |
353 | 487k | lexer.skipChar(); // skip char after 'ID' command |
354 | 487k | inlineImg = 1; |
355 | 487k | } |
356 | 1.36G | buf1 = std::move(buf2); |
357 | 1.36G | if (inlineImg > 0) { // don't buffer inline image data |
358 | 974k | buf2.setToNull(); |
359 | 1.36G | } else { |
360 | 1.36G | buf2 = lexer.getObj(objNum); |
361 | 1.36G | } |
362 | 1.36G | } |
363 | | |
364 | | void Parser::shift(const char *cmdA, int objNum) |
365 | 7.78M | { |
366 | 7.78M | if (inlineImg > 0) { |
367 | 0 | if (inlineImg < 2) { |
368 | 0 | ++inlineImg; |
369 | 0 | } else { |
370 | | // in a damaged content stream, if 'ID' shows up in the middle |
371 | | // of a dictionary, we need to reset |
372 | 0 | inlineImg = 0; |
373 | 0 | } |
374 | 7.78M | } else if (buf2.isCmd("ID")) { |
375 | 496 | lexer.skipChar(); // skip char after 'ID' command |
376 | 496 | inlineImg = 1; |
377 | 496 | } |
378 | 7.78M | buf1 = std::move(buf2); |
379 | 7.78M | if (inlineImg > 0) { |
380 | 496 | buf2.setToNull(); |
381 | 7.78M | } else if (buf1.isCmd(cmdA)) { |
382 | 6.65M | buf2 = lexer.getObj(objNum); |
383 | 6.65M | } else { |
384 | 1.12M | buf2 = lexer.getObj(cmdA, objNum); |
385 | 1.12M | } |
386 | 7.78M | } |