Coverage Report

Created: 2026-03-31 07:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/poppler/poppler/Parser.cc
Line
Count
Source
1
//========================================================================
2
//
3
// Parser.cc
4
//
5
// Copyright 1996-2003 Glyph & Cog, LLC
6
//
7
//========================================================================
8
9
//========================================================================
10
//
11
// Modified under the Poppler project - http://poppler.freedesktop.org
12
//
13
// All changes made under the Poppler project to this file are licensed
14
// under GPL version 2 or later
15
//
16
// Copyright (C) 2006, 2009, 201, 2010, 2013, 2014, 2017-2020, 2025, 2026 Albert Astals Cid <aacid@kde.org>
17
// Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18
// Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
19
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
20
// Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
21
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
22
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
23
// Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de>
24
// Copyright (C) 2018 Marek Kasik <mkasik@redhat.com>
25
// Copyright (C) 2024 Nelson Benítez León <nbenitezl@gmail.com>
26
// Copyright (C) 2024-2026 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
27
// Copyright (C) 2025 Arnav V <arnav0872@gmail.com>
28
//
29
// To see a description of the changes please see the Changelog file that
30
// came with your tarball or type make ChangeLog if you are building from git
31
//
32
//========================================================================
33
34
#include <config.h>
35
36
#include <climits>
37
#include "Object.h"
38
#include "Array.h"
39
#include "Dict.h"
40
#include "Decrypt.h"
41
#include "Parser.h"
42
#include "XRef.h"
43
#include "Error.h"
44
45
// Max number of nested objects.  This is used to catch infinite loops
46
// in the object structure. And also technically valid files with
47
// lots of nested arrays that made us consume all the stack
48
constexpr int recursionLimit = 500;
49
50
23.9M
Parser::Parser(XRef *xrefA, std::unique_ptr<Stream> &&streamA, bool allowStreamsA) : lexer { xrefA, std::move(streamA) }
51
23.9M
{
52
23.9M
    allowStreams = allowStreamsA;
53
23.9M
    buf1 = lexer.getObj();
54
23.9M
    buf2 = lexer.getObj();
55
23.9M
    inlineImg = 0;
56
23.9M
}
57
58
1.66M
Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA }
59
1.66M
{
60
1.66M
    allowStreams = allowStreamsA;
61
1.66M
    buf1 = lexer.getObj();
62
1.66M
    buf2 = lexer.getObj();
63
1.66M
    inlineImg = 0;
64
1.66M
}
65
66
25.6M
Parser::~Parser() = default;
67
68
Object Parser::getObj(int recursion)
69
39.6M
{
70
39.6M
    return getObj(false, nullptr, cryptRC4, 0, 0, 0, recursion);
71
39.6M
}
72
73
static std::unique_ptr<GooString> decryptedString(const std::string &s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen)
74
231k
{
75
231k
    DecryptStream decrypt(std::make_unique<MemStream>(s.c_str(), 0, s.size(), Object::null()), fileKey, encAlgorithm, keyLength, { .num = objNum, .gen = objGen });
76
231k
    if (!decrypt.rewind()) {
77
0
        return {};
78
0
    }
79
231k
    std::unique_ptr<GooString> res = std::make_unique<GooString>();
80
231k
    int c;
81
19.3M
    while ((c = decrypt.getChar()) != EOF) {
82
19.1M
        res->push_back((char)c);
83
19.1M
    }
84
231k
    return res;
85
231k
}
86
87
Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString)
88
907M
{
89
907M
    Object obj;
90
91
    // refill buffer after inline image data
92
907M
    if (inlineImg == 2) {
93
453k
        buf1 = lexer.getObj();
94
453k
        buf2 = lexer.getObj();
95
453k
        inlineImg = 0;
96
453k
    }
97
98
907M
    if (unlikely(recursion >= recursionLimit)) {
99
5.92k
        return Object::error();
100
5.92k
    }
101
102
    // array
103
907M
    if (!simpleOnly && buf1.isCmd("[")) {
104
24.8M
        shift();
105
24.8M
        obj = Object(std::make_unique<Array>(lexer.getXRef()));
106
330M
        while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) {
107
305M
            Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1);
108
305M
            obj.arrayAdd(std::move(obj2));
109
305M
        }
110
24.8M
        if (recursion + 1 >= recursionLimit && strict) {
111
0
            goto err;
112
0
        }
113
24.8M
        if (buf1.isEOF()) {
114
1.68M
            error(errSyntaxError, getPos(), "End of file inside array");
115
1.68M
            if (strict) {
116
1
                goto err;
117
1
            }
118
1.68M
        }
119
24.8M
        shift();
120
121
        // dictionary or stream
122
882M
    } else if (!simpleOnly && buf1.isCmd("<<")) {
123
26.6M
        shift(objNum);
124
26.6M
        obj = Object(std::make_unique<Dict>(lexer.getXRef()));
125
26.6M
        bool hasContentsEntry = false;
126
332M
        while (!buf1.isCmd(">>") && !buf1.isEOF()) {
127
306M
            if (!buf1.isName()) {
128
111M
                error(errSyntaxError, getPos(), "Dictionary key must be a name object");
129
111M
                if (strict) {
130
48
                    goto err;
131
48
                }
132
111M
                shift();
133
194M
            } else {
134
                // buf1 will go away in shift(), so keep the key
135
194M
                const auto key = std::move(buf1);
136
194M
                shift();
137
194M
                if (buf1.isEOF() || buf1.isError()) {
138
710k
                    if (strict && buf1.isError()) {
139
8
                        goto err;
140
8
                    }
141
710k
                    break;
142
710k
                }
143
                // We don't decrypt strings that are the value of "Contents" key entries. We decrypt them if needed a few lines below.
144
                // The "Contents" field of Sig dictionaries is not encrypted, but we can't know the type of the dictionary here yet
145
                // so we don't decrypt any Contents and if later we find it's not a Sig dictionary we decrypt it
146
193M
                const bool isContents = !hasContentsEntry && key.isName("Contents");
147
193M
                hasContentsEntry = hasContentsEntry || isContents;
148
193M
                Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents);
149
193M
                if (unlikely(recursion + 1 >= recursionLimit)) {
150
5.92k
                    break;
151
5.92k
                }
152
193M
                obj.dictAdd(key.getName(), std::move(obj2));
153
193M
            }
154
306M
        }
155
26.6M
        if (buf1.isEOF()) {
156
1.22M
            error(errSyntaxError, getPos(), "End of file inside dictionary");
157
1.22M
            if (strict) {
158
16
                goto err;
159
16
            }
160
1.22M
        }
161
26.6M
        if (fileKey && hasContentsEntry) {
162
9.57k
            Dict *dict = obj.getDict();
163
9.57k
            const bool isSigDict = dict->is("Sig");
164
9.57k
            if (!isSigDict) {
165
8.97k
                const Object &contentsObj = dict->lookupNF("Contents");
166
8.97k
                if (contentsObj.isString()) {
167
4.79k
                    std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
168
4.79k
                    dict->set("Contents", Object(std::move(s)));
169
4.79k
                }
170
8.97k
            }
171
9.57k
        }
172
        // stream objects are not allowed inside content streams or
173
        // object streams
174
26.6M
        if (buf2.isCmd("stream")) {
175
8.04M
            if (allowStreams) {
176
7.80M
                if (auto str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict)) {
177
7.78M
                    return Object(std::move(str));
178
7.78M
                }
179
7.80M
            }
180
259k
            return Object::error();
181
8.04M
        }
182
18.5M
        shift();
183
184
        // indirect reference or integer
185
855M
    } else if (buf1.isInt()) {
186
343M
        const int num = buf1.getInt();
187
343M
        shift();
188
343M
        if (buf1.isInt() && buf2.isCmd("R")) {
189
46.3M
            const int gen = buf1.getInt();
190
46.3M
            shift();
191
46.3M
            shift();
192
193
46.3M
            if (unlikely(num <= 0 || gen < 0)) {
194
887k
                return Object();
195
887k
            }
196
197
45.4M
            Ref r;
198
45.4M
            r.num = num;
199
45.4M
            r.gen = gen;
200
45.4M
            return Object(r);
201
46.3M
        }
202
297M
        return Object(num);
203
204
        // string
205
512M
    } else if (decryptString && buf1.isString() && fileKey) {
206
226k
        std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
207
226k
        obj = Object(std::move(s2));
208
226k
        shift();
209
210
        // simple object
211
511M
    } else {
212
        // avoid re-allocating memory for complex objects like strings by
213
        // shallow copy of <buf1> to <obj> and nulling <buf1> so that
214
        // subsequent buf1.free() won't free this memory
215
511M
        obj = std::move(buf1);
216
511M
        shift();
217
511M
    }
218
219
555M
    return obj;
220
221
73
err:
222
73
    return Object::error();
223
907M
}
224
225
std::unique_ptr<Stream> Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict)
226
7.80M
{
227
7.80M
    BaseStream *baseStr;
228
7.80M
    Goffset length;
229
7.80M
    Goffset pos, endPos;
230
231
7.80M
    if (XRef *xref = lexer.getXRef()) {
232
7.43M
        XRefEntry *entry = xref->getEntry(objNum, false);
233
7.43M
        if (entry) {
234
7.43M
            if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) {
235
7.42M
                entry->setFlag(XRefEntry::Parsing, true);
236
7.42M
            } else {
237
8.07k
                error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is being already parsed", objNum, objGen);
238
8.07k
                return nullptr;
239
8.07k
            }
240
7.43M
        }
241
7.43M
    }
242
243
    // get stream start position
244
7.79M
    lexer.skipToNextLine();
245
7.79M
    Stream *lexerStream;
246
7.79M
    if (!(lexerStream = lexer.getStream())) {
247
889
        return nullptr;
248
889
    }
249
7.79M
    pos = lexerStream->getPos();
250
251
    // get length
252
7.79M
    Object obj = dict.dictLookup("Length", recursion);
253
7.79M
    if (obj.isInt()) {
254
6.83M
        length = obj.getInt();
255
6.83M
    } else if (obj.isInt64()) {
256
3.20k
        length = obj.getInt64();
257
951k
    } else {
258
951k
        error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream");
259
951k
        if (strict) {
260
19
            return nullptr;
261
19
        }
262
951k
        length = 0;
263
951k
    }
264
265
    // check for length in damaged file
266
7.79M
    if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) {
267
5.88M
        length = endPos - pos;
268
5.88M
    }
269
270
    // in badly damaged PDF files, we can run off the end of the input
271
    // stream immediately after the "stream" token
272
7.79M
    if (!lexer.getStream()) {
273
0
        return nullptr;
274
0
    }
275
7.79M
    baseStr = lexer.getStream()->getBaseStream();
276
277
    // skip over stream data
278
7.79M
    if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) {
279
        // take into account the fact that we've cached one value
280
195k
        pos = pos - 1;
281
195k
        lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
282
195k
    }
283
7.79M
    if (unlikely(length < 0)) {
284
1.39k
        return nullptr;
285
1.39k
    }
286
7.78M
    if (unlikely(pos > LLONG_MAX - length)) {
287
183
        return nullptr;
288
183
    }
289
7.78M
    lexer.setPos(pos + length);
290
291
    // refill token buffers and check for 'endstream'
292
7.78M
    shift(); // kill '>>'
293
7.78M
    shift("endstream", objNum); // kill 'stream'
294
7.78M
    if (buf1.isCmd("endstream")) {
295
6.65M
        shift();
296
6.65M
    } else {
297
1.12M
        error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length");
298
1.12M
        if (strict) {
299
67
            return nullptr;
300
67
        }
301
1.12M
        if (lexer.hasXRef() && lexer.getStream()) {
302
            // shift until we find the proper endstream or we change to another object or reach eof
303
425k
            length = lexer.getPos() - pos;
304
425k
            if (buf1.isCmd("endstream")) {
305
0
                dict.dictSet("Length", Object(length));
306
0
            }
307
704k
        } else {
308
            // When building the xref we can't use it so use this
309
            // kludge for broken PDF files: just add 5k to the length, and
310
            // hope its enough
311
704k
            if (length < LLONG_MAX - pos - 5000) {
312
703k
                length += 5000;
313
703k
            }
314
704k
        }
315
1.12M
    }
316
317
    // make base stream
318
7.78M
    auto str = baseStr->makeSubStream(pos, true, length, std::move(dict));
319
320
    // handle decryption
321
7.78M
    if (fileKey) {
322
69.4k
        str = std::make_unique<DecryptStream>(std::move(str), fileKey, encAlgorithm, keyLength, Ref { .num = objNum, .gen = objGen });
323
69.4k
    }
324
325
    // get filters
326
7.78M
    Dict *streamDict = str->getDict();
327
7.78M
    str = Stream::addFilters(std::move(str), streamDict, recursion);
328
329
7.78M
    if (XRef *xref = lexer.getXRef()) {
330
        // Don't try to reuse the entry from the block at the start
331
        // of the function, xref can change in the middle because of
332
        // reconstruction
333
7.42M
        XRefEntry *entry = xref->getEntry(objNum, false);
334
7.42M
        if (entry) {
335
7.42M
            entry->setFlag(XRefEntry::Parsing, false);
336
7.42M
        }
337
7.42M
    }
338
339
7.78M
    return str;
340
7.78M
}
341
342
void Parser::shift(int objNum)
343
1.36G
{
344
1.36G
    if (inlineImg > 0) {
345
520k
        if (inlineImg < 2) {
346
487k
            ++inlineImg;
347
487k
        } else {
348
            // in a damaged content stream, if 'ID' shows up in the middle
349
            // of a dictionary, we need to reset
350
33.4k
            inlineImg = 0;
351
33.4k
        }
352
1.36G
    } else if (buf2.isCmd("ID")) {
353
487k
        lexer.skipChar(); // skip char after 'ID' command
354
487k
        inlineImg = 1;
355
487k
    }
356
1.36G
    buf1 = std::move(buf2);
357
1.36G
    if (inlineImg > 0) { // don't buffer inline image data
358
974k
        buf2.setToNull();
359
1.36G
    } else {
360
1.36G
        buf2 = lexer.getObj(objNum);
361
1.36G
    }
362
1.36G
}
363
364
void Parser::shift(const char *cmdA, int objNum)
365
7.78M
{
366
7.78M
    if (inlineImg > 0) {
367
0
        if (inlineImg < 2) {
368
0
            ++inlineImg;
369
0
        } else {
370
            // in a damaged content stream, if 'ID' shows up in the middle
371
            // of a dictionary, we need to reset
372
0
            inlineImg = 0;
373
0
        }
374
7.78M
    } else if (buf2.isCmd("ID")) {
375
496
        lexer.skipChar(); // skip char after 'ID' command
376
496
        inlineImg = 1;
377
496
    }
378
7.78M
    buf1 = std::move(buf2);
379
7.78M
    if (inlineImg > 0) {
380
496
        buf2.setToNull();
381
7.78M
    } else if (buf1.isCmd(cmdA)) {
382
6.65M
        buf2 = lexer.getObj(objNum);
383
6.65M
    } else {
384
1.12M
        buf2 = lexer.getObj(cmdA, objNum);
385
1.12M
    }
386
7.78M
}