Coverage Report

Created: 2025-06-22 06:28

/src/qpdf/libqpdf/QPDF_objects.cc
Line
Count
Source (jump to first uncovered line)
1
#include <qpdf/qpdf-config.h> // include first for large file support
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <array>
6
#include <atomic>
7
#include <cstring>
8
#include <limits>
9
#include <map>
10
#include <regex>
11
#include <sstream>
12
#include <vector>
13
14
#include <qpdf/BufferInputSource.hh>
15
#include <qpdf/FileInputSource.hh>
16
#include <qpdf/InputSource_private.hh>
17
#include <qpdf/OffsetInputSource.hh>
18
#include <qpdf/Pipeline.hh>
19
#include <qpdf/QPDFExc.hh>
20
#include <qpdf/QPDFLogger.hh>
21
#include <qpdf/QPDFObjectHandle_private.hh>
22
#include <qpdf/QPDFObject_private.hh>
23
#include <qpdf/QPDFParser.hh>
24
#include <qpdf/QTC.hh>
25
#include <qpdf/QUtil.hh>
26
#include <qpdf/Util.hh>
27
28
using namespace qpdf;
29
using namespace std::literals;
30
31
namespace
32
{
33
    class InvalidInputSource: public InputSource
34
    {
35
      public:
36
        ~InvalidInputSource() override = default;
37
        qpdf_offset_t
38
        findAndSkipNextEOL() override
39
0
        {
40
0
            throwException();
41
0
            return 0;
42
0
        }
43
        std::string const&
44
        getName() const override
45
0
        {
46
0
            static std::string name("closed input source");
47
0
            return name;
48
0
        }
49
        qpdf_offset_t
50
        tell() override
51
0
        {
52
0
            throwException();
53
0
            return 0;
54
0
        }
55
        void
56
        seek(qpdf_offset_t offset, int whence) override
57
0
        {
58
0
            throwException();
59
0
        }
60
        void
61
        rewind() override
62
0
        {
63
0
            throwException();
64
0
        }
65
        size_t
66
        read(char* buffer, size_t length) override
67
0
        {
68
0
            throwException();
69
0
            return 0;
70
0
        }
71
        void
72
        unreadCh(char ch) override
73
0
        {
74
0
            throwException();
75
0
        }
76
77
      private:
78
        void
79
        throwException()
80
0
        {
81
0
            throw std::logic_error(
82
0
                "QPDF operation attempted on a QPDF object with no input "
83
0
                "source. QPDF operations are invalid before processFile (or "
84
0
                "another process method) or after closeInputSource");
85
0
        }
86
    };
87
} // namespace
88
89
bool
90
QPDF::findStartxref()
91
8.51k
{
92
8.51k
    if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
93
        // Position in front of offset token
94
6.52k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
95
6.52k
        return true;
96
6.52k
    }
97
1.99k
    return false;
98
8.51k
}
99
100
void
101
QPDF::parse(char const* password)
102
24.3k
{
103
24.3k
    if (password) {
104
0
        m->encp->provided_password = password;
105
0
    }
106
107
    // Find the header anywhere in the first 1024 bytes of the file.
108
24.3k
    PatternFinder hf(*this, &QPDF::findHeader);
109
24.3k
    if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
110
20.9k
        QTC::TC("qpdf", "QPDF not a pdf file");
111
20.9k
        warn(damagedPDF("", -1, "can't find PDF header"));
112
        // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
113
20.9k
        m->pdf_version = "1.2";
114
20.9k
    }
115
116
    // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file.  We add an extra
117
    // 30 characters to leave room for the startxref stuff.
118
24.3k
    m->file->seek(0, SEEK_END);
119
24.3k
    qpdf_offset_t end_offset = m->file->tell();
120
24.3k
    m->xref_table_max_offset = end_offset;
121
    // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
122
    // scenarios at least 3 bytes are required.
123
24.3k
    if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
124
24.2k
        m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
125
24.2k
    }
126
24.3k
    qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
127
24.3k
    PatternFinder sf(*this, &QPDF::findStartxref);
128
24.3k
    qpdf_offset_t xref_offset = 0;
129
24.3k
    if (m->file->findLast("startxref", start_offset, 0, sf)) {
130
6.10k
        xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
131
6.10k
    }
132
133
24.3k
    try {
134
24.3k
        if (xref_offset == 0) {
135
18.3k
            QTC::TC("qpdf", "QPDF can't find startxref");
136
18.3k
            throw damagedPDF("", -1, "can't find startxref");
137
18.3k
        }
138
6.01k
        try {
139
6.01k
            read_xref(xref_offset);
140
6.01k
        } catch (QPDFExc&) {
141
4.63k
            throw;
142
4.63k
        } catch (std::exception& e) {
143
517
            throw damagedPDF("", -1, std::string("error reading xref: ") + e.what());
144
517
        }
145
23.4k
    } catch (QPDFExc& e) {
146
23.4k
        if (m->attempt_recovery) {
147
23.4k
            reconstruct_xref(e, xref_offset > 0);
148
23.4k
            QTC::TC("qpdf", "QPDF reconstructed xref table");
149
23.4k
        } else {
150
0
            throw;
151
0
        }
152
23.4k
    }
153
154
23.4k
    initializeEncryption();
155
9.29k
    m->parsed = true;
156
9.29k
    if (!m->xref_table.empty() && !getRoot().getKey("/Pages").isDictionary()) {
157
        // QPDFs created from JSON have an empty xref table and no root object yet.
158
2
        throw damagedPDF("", -1, "unable to find page tree");
159
2
    }
160
9.29k
}
161
162
void
163
QPDF::inParse(bool v)
164
354k
{
165
354k
    if (m->in_parse == v) {
166
        // This happens if QPDFParser::parse tries to resolve an indirect object while it is
167
        // parsing.
168
0
        throw std::logic_error(
169
0
            "QPDF: re-entrant parsing detected. This is a qpdf bug."
170
0
            " Please report at https://github.com/qpdf/qpdf/issues.");
171
0
    }
172
354k
    m->in_parse = v;
173
354k
}
174
175
void
176
QPDF::setTrailer(QPDFObjectHandle obj)
177
7.33k
{
178
7.33k
    if (m->trailer) {
179
154
        return;
180
154
    }
181
7.17k
    m->trailer = obj;
182
7.17k
}
183
184
void
185
QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
186
24.0k
{
187
24.0k
    if (m->reconstructed_xref) {
188
        // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
189
        // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
190
242
        throw e;
191
242
    }
192
193
    // If recovery generates more than 1000 warnings, the file is so severely damaged that there
194
    // probably is no point trying to continue.
195
23.7k
    const auto max_warnings = m->warnings.size() + 1000U;
196
2.11M
    auto check_warnings = [this, max_warnings]() {
197
2.11M
        if (m->warnings.size() > max_warnings) {
198
0
            throw damagedPDF("", -1, "too many errors while reconstructing cross-reference table");
199
0
        }
200
2.11M
    };
201
202
23.7k
    m->reconstructed_xref = true;
203
23.7k
    m->in_xref_reconstruction = true;
204
    // We may find more objects, which may contain dangling references.
205
23.7k
    m->fixed_dangling_refs = false;
206
207
23.7k
    warn(damagedPDF("", -1, "file is damaged"));
208
23.7k
    warn(e);
209
23.7k
    warn(damagedPDF("", -1, "Attempting to reconstruct cross-reference table"));
210
211
    // Delete all references to type 1 (uncompressed) objects
212
23.7k
    std::vector<QPDFObjGen> to_delete;
213
36.3k
    for (auto const& iter: m->xref_table) {
214
36.3k
        if (iter.second.getType() == 1) {
215
22.2k
            to_delete.emplace_back(iter.first);
216
22.2k
        }
217
36.3k
    }
218
23.7k
    for (auto const& iter: to_delete) {
219
22.2k
        m->xref_table.erase(iter);
220
22.2k
    }
221
222
23.7k
    std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
223
23.7k
    std::vector<qpdf_offset_t> trailers;
224
23.7k
    std::vector<qpdf_offset_t> startxrefs;
225
226
23.7k
    m->file->seek(0, SEEK_END);
227
23.7k
    qpdf_offset_t eof = m->file->tell();
228
23.7k
    m->file->seek(0, SEEK_SET);
229
    // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
230
23.7k
    static size_t const MAX_LEN = 10;
231
1.83M
    while (m->file->tell() < eof) {
232
1.81M
        QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
233
1.81M
        qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
234
1.81M
        if (t1.isInteger()) {
235
520k
            auto pos = m->file->tell();
236
520k
            auto t2 = readToken(*m->file, MAX_LEN);
237
520k
            if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
238
251k
                int obj = QUtil::string_to_int(t1.getValue().c_str());
239
251k
                int gen = QUtil::string_to_int(t2.getValue().c_str());
240
251k
                if (obj <= m->xref_table_max_id) {
241
250k
                    found_objects.emplace_back(obj, gen, token_start);
242
250k
                } else {
243
1.06k
                    warn(damagedPDF(
244
1.06k
                        "", -1, "ignoring object with impossibly large id " + std::to_string(obj)));
245
1.06k
                }
246
251k
            }
247
520k
            m->file->seek(pos, SEEK_SET);
248
1.29M
        } else if (!m->trailer && t1.isWord("trailer")) {
249
148k
            trailers.emplace_back(m->file->tell());
250
1.14M
        } else if (!found_startxref && t1.isWord("startxref")) {
251
2.26k
            startxrefs.emplace_back(m->file->tell());
252
2.26k
        }
253
1.81M
        check_warnings();
254
1.81M
        m->file->findAndSkipNextEOL();
255
1.81M
    }
256
257
23.7k
    if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
258
23.7k
        startxrefs.back() > std::get<2>(found_objects.back())) {
259
409
        auto xref_backup{m->xref_table};
260
409
        try {
261
409
            m->file->seek(startxrefs.back(), SEEK_SET);
262
409
            if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
263
249
                read_xref(offset);
264
265
249
                if (getRoot().getKey("/Pages").isDictionary()) {
266
3
                    QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
267
3
                    warn(damagedPDF(
268
3
                        "", -1, "startxref was more than 1024 bytes before end of file"));
269
3
                    initializeEncryption();
270
3
                    m->parsed = true;
271
3
                    m->reconstructed_xref = false;
272
3
                    return;
273
3
                }
274
249
            }
275
409
        } catch (...) {
276
            // ok, bad luck. Do recovery.
277
245
        }
278
406
        m->xref_table = std::move(xref_backup);
279
406
    }
280
281
23.7k
    auto rend = found_objects.rend();
282
274k
    for (auto it = found_objects.rbegin(); it != rend; it++) {
283
250k
        auto [obj, gen, token_start] = *it;
284
250k
        insertXrefEntry(obj, 1, token_start, gen);
285
250k
        check_warnings();
286
250k
    }
287
23.7k
    m->deleted_objects.clear();
288
289
41.0k
    for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
290
20.7k
        m->file->seek(*it, SEEK_SET);
291
20.7k
        auto t = readTrailer();
292
20.7k
        if (!t.isDictionary()) {
293
            // Oh well.  It was worth a try.
294
15.6k
        } else {
295
5.07k
            if (t.hasKey("/Root")) {
296
3.47k
                m->trailer = t;
297
3.47k
                break;
298
3.47k
            }
299
1.60k
            warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
300
1.60k
        }
301
17.2k
        check_warnings();
302
17.2k
    }
303
304
23.7k
    if (!m->trailer) {
305
19.3k
        qpdf_offset_t max_offset{0};
306
19.3k
        size_t max_size{0};
307
        // If there are any xref streams, take the last one to appear.
308
68.1k
        for (auto const& iter: m->xref_table) {
309
68.1k
            auto entry = iter.second;
310
68.1k
            if (entry.getType() != 1) {
311
281
                continue;
312
281
            }
313
67.8k
            auto oh = getObject(iter.first);
314
67.8k
            try {
315
67.8k
                if (!oh.isStreamOfType("/XRef")) {
316
59.6k
                    continue;
317
59.6k
                }
318
67.8k
            } catch (std::exception&) {
319
2.05k
                continue;
320
2.05k
            }
321
6.10k
            auto offset = entry.getOffset();
322
6.10k
            auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
323
6.10k
            if (size > max_size || (size == max_size && offset > max_offset)) {
324
6.04k
                max_offset = offset;
325
6.04k
                setTrailer(oh.getDict());
326
6.04k
            }
327
6.10k
            check_warnings();
328
6.10k
        }
329
19.3k
        if (max_offset > 0) {
330
5.88k
            try {
331
5.88k
                read_xref(max_offset, true);
332
5.88k
            } catch (std::exception&) {
333
3.33k
                warn(damagedPDF(
334
3.33k
                    "", -1, "error decoding candidate xref stream while recovering damaged file"));
335
3.33k
            }
336
5.88k
            QTC::TC("qpdf", "QPDF recover xref stream");
337
5.87k
        }
338
19.3k
    }
339
340
23.7k
    if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
341
        // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
342
19.5k
        QPDFObjectHandle root;
343
168k
        for (auto const& iter: m->obj_cache) {
344
168k
            try {
345
168k
                if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
346
5.97k
                    root = iter.second.object;
347
5.97k
                }
348
168k
            } catch (std::exception&) {
349
2.86k
                continue;
350
2.86k
            }
351
168k
        }
352
19.5k
        if (root) {
353
5.88k
            if (!m->trailer) {
354
5.00k
                warn(damagedPDF(
355
5.00k
                    "", -1, "unable to find trailer dictionary while recovering damaged file"));
356
5.00k
                m->trailer = QPDFObjectHandle::newDictionary();
357
5.00k
            }
358
5.88k
            m->trailer.replaceKey("/Root", root);
359
5.88k
        }
360
19.5k
    }
361
362
23.7k
    if (!m->trailer) {
363
        // We could check the last encountered object to see if it was an xref stream.  If so, we
364
        // could try to get the trailer from there.  This may make it possible to recover files with
365
        // bad startxref pointers even when they have object streams.
366
367
8.47k
        throw damagedPDF("", -1, "unable to find trailer dictionary while recovering damaged file");
368
8.47k
    }
369
15.2k
    if (m->xref_table.empty()) {
370
        // We cannot check for an empty xref table in parse because empty tables are valid when
371
        // creating QPDF objects from JSON.
372
351
        throw damagedPDF("", -1, "unable to find objects while recovering damaged file");
373
351
    }
374
14.9k
    check_warnings();
375
14.9k
    if (!m->parsed) {
376
14.4k
        m->parsed = true;
377
14.4k
        getAllPages();
378
14.4k
        check_warnings();
379
14.4k
        if (m->all_pages.empty()) {
380
730
            m->parsed = false;
381
730
            throw damagedPDF("", -1, "unable to find any pages while recovering damaged file");
382
730
        }
383
14.4k
    }
384
385
14.1k
    m->in_xref_reconstruction = false;
386
    // We could iterate through the objects looking for streams and try to find objects inside of
387
    // them, but it's probably not worth the trouble.  Acrobat can't recover files with any errors
388
    // in an xref stream, and this would be a real long shot anyway.  If we wanted to do anything
389
    // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
390
    // It's safe to call it more than once.
391
14.1k
}
392
393
void
394
QPDF::read_xref(qpdf_offset_t xref_offset, bool in_stream_recovery)
395
12.1k
{
396
12.1k
    std::map<int, int> free_table;
397
12.1k
    std::set<qpdf_offset_t> visited;
398
24.8k
    while (xref_offset) {
399
12.9k
        visited.insert(xref_offset);
400
12.9k
        char buf[7];
401
12.9k
        memset(buf, 0, sizeof(buf));
402
12.9k
        m->file->seek(xref_offset, SEEK_SET);
403
        // Some files miss the mark a little with startxref. We could do a better job of searching
404
        // in the neighborhood for something that looks like either an xref table or stream, but the
405
        // simple heuristic of skipping whitespace can help with the xref table case and is harmless
406
        // with the stream case.
407
12.9k
        bool done = false;
408
12.9k
        bool skipped_space = false;
409
30.1k
        while (!done) {
410
17.1k
            char ch;
411
17.1k
            if (1 == m->file->read(&ch, 1)) {
412
16.3k
                if (util::is_space(ch)) {
413
4.59k
                    skipped_space = true;
414
11.7k
                } else {
415
11.7k
                    m->file->unreadCh(ch);
416
11.7k
                    done = true;
417
11.7k
                }
418
16.3k
            } else {
419
794
                QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
420
794
                done = true;
421
794
            }
422
17.1k
        }
423
424
12.9k
        m->file->read(buf, sizeof(buf) - 1);
425
        // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
426
        // where it is terminated by arbitrary whitespace.
427
12.9k
        if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
428
1.63k
            if (skipped_space) {
429
95
                QTC::TC("qpdf", "QPDF xref skipped space");
430
95
                warn(damagedPDF("", -1, "extraneous whitespace seen before xref"));
431
95
            }
432
1.63k
            QTC::TC(
433
1.63k
                "qpdf",
434
1.63k
                "QPDF xref space",
435
1.63k
                ((buf[4] == '\n')       ? 0
436
1.63k
                     : (buf[4] == '\r') ? 1
437
1.36k
                     : (buf[4] == ' ')  ? 2
438
651
                                        : 9999));
439
1.63k
            int skip = 4;
440
            // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
441
3.43k
            while (util::is_space(buf[skip])) {
442
1.80k
                ++skip;
443
1.80k
            }
444
1.63k
            xref_offset = read_xrefTable(xref_offset + skip);
445
11.3k
        } else {
446
11.3k
            xref_offset = read_xrefStream(xref_offset, in_stream_recovery);
447
11.3k
        }
448
12.9k
        if (visited.contains(xref_offset)) {
449
232
            QTC::TC("qpdf", "QPDF xref loop");
450
232
            throw damagedPDF("", -1, "loop detected following xref tables");
451
232
        }
452
12.9k
    }
453
454
11.9k
    if (!m->trailer) {
455
0
        throw damagedPDF("", -1, "unable to find trailer while reading xref");
456
0
    }
457
11.9k
    int size = m->trailer.getKey("/Size").getIntValueAsInt();
458
11.9k
    int max_obj = 0;
459
11.9k
    if (!m->xref_table.empty()) {
460
3.16k
        max_obj = m->xref_table.rbegin()->first.getObj();
461
3.16k
    }
462
11.9k
    if (!m->deleted_objects.empty()) {
463
1.08k
        max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
464
1.08k
    }
465
11.9k
    if ((size < 1) || (size - 1 != max_obj)) {
466
2.57k
        QTC::TC("qpdf", "QPDF xref size mismatch");
467
2.57k
        warn(damagedPDF(
468
2.57k
            "",
469
2.57k
            -1,
470
2.57k
            ("reported number of objects (" + std::to_string(size) +
471
2.57k
             ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
472
2.57k
    }
473
474
    // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
475
    // never depend on its being set.
476
11.9k
    m->deleted_objects.clear();
477
478
    // Make sure we keep only the highest generation for any object.
479
11.9k
    QPDFObjGen last_og{-1, 0};
480
334k
    for (auto const& item: m->xref_table) {
481
334k
        auto id = item.first.getObj();
482
334k
        if (id == last_og.getObj() && id > 0) {
483
10.9k
            removeObject(last_og);
484
10.9k
        }
485
334k
        last_og = item.first;
486
334k
    }
487
11.9k
}
488
489
bool
490
QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
491
10.4k
{
492
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
493
    // buffer.
494
10.4k
    char const* p = line.c_str();
495
10.4k
    char const* start = line.c_str();
496
497
    // Skip zero or more spaces
498
11.7k
    while (util::is_space(*p)) {
499
1.30k
        ++p;
500
1.30k
    }
501
    // Require digit
502
10.4k
    if (!util::is_digit(*p)) {
503
336
        return false;
504
336
    }
505
    // Gather digits
506
10.1k
    std::string obj_str;
507
36.7k
    while (util::is_digit(*p)) {
508
26.6k
        obj_str.append(1, *p++);
509
26.6k
    }
510
    // Require space
511
10.1k
    if (!util::is_space(*p)) {
512
128
        return false;
513
128
    }
514
    // Skip spaces
515
28.1k
    while (util::is_space(*p)) {
516
18.1k
        ++p;
517
18.1k
    }
518
    // Require digit
519
9.98k
    if (!util::is_digit(*p)) {
520
124
        return false;
521
124
    }
522
    // Gather digits
523
9.85k
    std::string num_str;
524
33.5k
    while (util::is_digit(*p)) {
525
23.6k
        num_str.append(1, *p++);
526
23.6k
    }
527
    // Skip any space including line terminators
528
28.3k
    while (util::is_space(*p)) {
529
18.4k
        ++p;
530
18.4k
    }
531
9.85k
    bytes = toI(p - start);
532
9.85k
    obj = QUtil::string_to_int(obj_str.c_str());
533
9.85k
    num = QUtil::string_to_int(num_str.c_str());
534
9.85k
    return true;
535
9.98k
}
536
537
bool
538
QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
539
7.45k
{
540
    // Reposition after initial read attempt and reread.
541
7.45k
    m->file->seek(m->file->getLastOffset(), SEEK_SET);
542
7.45k
    auto line = m->file->readLine(30);
543
544
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
545
    // buffer.
546
7.45k
    char const* p = line.data();
547
548
    // Skip zero or more spaces. There aren't supposed to be any.
549
7.45k
    bool invalid = false;
550
9.20k
    while (util::is_space(*p)) {
551
1.75k
        ++p;
552
1.75k
        QTC::TC("qpdf", "QPDF ignore first space in xref entry");
553
1.75k
        invalid = true;
554
1.75k
    }
555
    // Require digit
556
7.45k
    if (!util::is_digit(*p)) {
557
18
        return false;
558
18
    }
559
    // Gather digits
560
7.44k
    std::string f1_str;
561
24.6k
    while (util::is_digit(*p)) {
562
17.1k
        f1_str.append(1, *p++);
563
17.1k
    }
564
    // Require space
565
7.44k
    if (!util::is_space(*p)) {
566
26
        return false;
567
26
    }
568
7.41k
    if (util::is_space(*(p + 1))) {
569
2.64k
        QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
570
2.64k
        invalid = true;
571
2.64k
    }
572
    // Skip spaces
573
18.7k
    while (util::is_space(*p)) {
574
11.2k
        ++p;
575
11.2k
    }
576
    // Require digit
577
7.41k
    if (!util::is_digit(*p)) {
578
48
        return false;
579
48
    }
580
    // Gather digits
581
7.36k
    std::string f2_str;
582
22.0k
    while (util::is_digit(*p)) {
583
14.6k
        f2_str.append(1, *p++);
584
14.6k
    }
585
    // Require space
586
7.36k
    if (!util::is_space(*p)) {
587
42
        return false;
588
42
    }
589
7.32k
    if (util::is_space(*(p + 1))) {
590
2.09k
        QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
591
2.09k
        invalid = true;
592
2.09k
    }
593
    // Skip spaces
594
17.5k
    while (util::is_space(*p)) {
595
10.1k
        ++p;
596
10.1k
    }
597
7.32k
    if ((*p == 'f') || (*p == 'n')) {
598
7.18k
        type = *p;
599
7.18k
    } else {
600
143
        return false;
601
143
    }
602
7.18k
    if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
603
7.06k
        QTC::TC("qpdf", "QPDF ignore length error xref entry");
604
7.06k
        invalid = true;
605
7.06k
    }
606
607
7.18k
    if (invalid) {
608
7.07k
        warn(damagedPDF("xref table", "accepting invalid xref table entry"));
609
7.07k
    }
610
611
7.18k
    f1 = QUtil::string_to_ll(f1_str.c_str());
612
7.18k
    f2 = QUtil::string_to_int(f2_str.c_str());
613
614
7.18k
    return true;
615
7.32k
}
616
617
// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
618
// result.
619
bool
620
QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
621
15.4k
{
622
15.4k
    std::array<char, 21> line;
623
15.4k
    if (m->file->read(line.data(), 20) != 20) {
624
        // C++20: [[unlikely]]
625
285
        return false;
626
285
    }
627
15.1k
    line[20] = '\0';
628
15.1k
    char const* p = line.data();
629
630
15.1k
    int f1_len = 0;
631
15.1k
    int f2_len = 0;
632
633
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
634
    // buffer.
635
636
    // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
637
69.3k
    while (*p == '0') {
638
54.1k
        ++f1_len;
639
54.1k
        ++p;
640
54.1k
    }
641
51.6k
    while (util::is_digit(*p) && f1_len++ < 10) {
642
36.4k
        f1 *= 10;
643
36.4k
        f1 += *p++ - '0';
644
36.4k
    }
645
    // Require space
646
15.1k
    if (!util::is_space(*p++)) {
647
        // Entry doesn't start with space or digit.
648
        // C++20: [[unlikely]]
649
72
        return false;
650
72
    }
651
    // Gather digits. NB No risk of overflow as 99'999 < max int.
652
43.7k
    while (*p == '0') {
653
28.6k
        ++f2_len;
654
28.6k
        ++p;
655
28.6k
    }
656
35.1k
    while (util::is_digit(*p) && f2_len++ < 5) {
657
20.1k
        f2 *= 10;
658
20.1k
        f2 += static_cast<int>(*p++ - '0');
659
20.1k
    }
660
15.0k
    if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
661
        // C++20: [[likely]]
662
10.8k
        type = *p;
663
        // No test for valid line[19].
664
10.8k
        if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
665
            // C++20: [[likely]]
666
7.60k
            return true;
667
7.60k
        }
668
10.8k
    }
669
7.45k
    return read_bad_xrefEntry(f1, f2, type);
670
15.0k
}
671
672
// Read a single cross-reference table section and associated trailer.
673
qpdf_offset_t
674
QPDF::read_xrefTable(qpdf_offset_t xref_offset)
675
1.63k
{
676
1.63k
    m->file->seek(xref_offset, SEEK_SET);
677
1.63k
    std::string line;
678
10.4k
    while (true) {
679
10.4k
        line.assign(50, '\0');
680
10.4k
        m->file->read(line.data(), line.size());
681
10.4k
        int obj = 0;
682
10.4k
        int num = 0;
683
10.4k
        int bytes = 0;
684
10.4k
        if (!parse_xrefFirst(line, obj, num, bytes)) {
685
588
            QTC::TC("qpdf", "QPDF invalid xref");
686
588
            throw damagedPDF("xref table", "xref syntax invalid");
687
588
        }
688
9.85k
        m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
689
24.6k
        for (qpdf_offset_t i = obj; i - num < obj; ++i) {
690
15.4k
            if (i == 0) {
691
                // This is needed by checkLinearization()
692
499
                m->first_xref_item_offset = m->file->tell();
693
499
            }
694
            // For xref_table, these will always be small enough to be ints
695
15.4k
            qpdf_offset_t f1 = 0;
696
15.4k
            int f2 = 0;
697
15.4k
            char type = '\0';
698
15.4k
            if (!read_xrefEntry(f1, f2, type)) {
699
634
                QTC::TC("qpdf", "QPDF invalid xref entry");
700
634
                throw damagedPDF(
701
634
                    "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
702
634
            }
703
14.7k
            if (type == 'f') {
704
4.99k
                insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
705
9.79k
            } else {
706
9.79k
                insertXrefEntry(toI(i), 1, f1, f2);
707
9.79k
            }
708
14.7k
        }
709
9.22k
        qpdf_offset_t pos = m->file->tell();
710
9.22k
        if (readToken(*m->file).isWord("trailer")) {
711
374
            break;
712
8.85k
        } else {
713
8.85k
            m->file->seek(pos, SEEK_SET);
714
8.85k
        }
715
9.22k
    }
716
717
    // Set offset to previous xref table if any
718
410
    QPDFObjectHandle cur_trailer = readTrailer();
719
410
    if (!cur_trailer.isDictionary()) {
720
44
        QTC::TC("qpdf", "QPDF missing trailer");
721
44
        throw damagedPDF("", "expected trailer dictionary");
722
44
    }
723
724
366
    if (!m->trailer) {
725
272
        setTrailer(cur_trailer);
726
727
272
        if (!m->trailer.hasKey("/Size")) {
728
50
            QTC::TC("qpdf", "QPDF trailer lacks size");
729
50
            throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
730
50
        }
731
222
        if (!m->trailer.getKey("/Size").isInteger()) {
732
10
            QTC::TC("qpdf", "QPDF trailer size not integer");
733
10
            throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
734
10
        }
735
222
    }
736
737
306
    if (cur_trailer.hasKey("/XRefStm")) {
738
33
        if (m->ignore_xref_streams) {
739
0
            QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
740
33
        } else {
741
33
            if (cur_trailer.getKey("/XRefStm").isInteger()) {
742
                // Read the xref stream but disregard any return value -- we'll use our trailer's
743
                // /Prev key instead of the xref stream's.
744
32
                (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
745
32
            } else {
746
1
                throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
747
1
            }
748
33
        }
749
33
    }
750
751
305
    if (cur_trailer.hasKey("/Prev")) {
752
65
        if (!cur_trailer.getKey("/Prev").isInteger()) {
753
1
            QTC::TC("qpdf", "QPDF trailer prev not integer");
754
1
            throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
755
1
        }
756
64
        QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
757
64
        return cur_trailer.getKey("/Prev").getIntValue();
758
65
    }
759
760
240
    return 0;
761
305
}
762
763
// Read a single cross-reference stream.
764
qpdf_offset_t
765
QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
766
10.9k
{
767
10.9k
    if (!m->ignore_xref_streams) {
768
10.9k
        QPDFObjGen x_og;
769
10.9k
        QPDFObjectHandle xref_obj;
770
10.9k
        try {
771
10.9k
            m->in_read_xref_stream = true;
772
10.9k
            xref_obj =
773
10.9k
                readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
774
10.9k
        } catch (QPDFExc&) {
775
            // ignore -- report error below
776
1.90k
        }
777
10.9k
        m->in_read_xref_stream = false;
778
10.9k
        if (xref_obj.isStreamOfType("/XRef")) {
779
8.32k
            QTC::TC("qpdf", "QPDF found xref stream");
780
8.32k
            return processXRefStream(xref_offset, xref_obj, in_stream_recovery);
781
8.32k
        }
782
10.9k
    }
783
784
2.57k
    QTC::TC("qpdf", "QPDF can't find xref");
785
2.57k
    throw damagedPDF("", xref_offset, "xref not found");
786
0
    return 0; // unreachable
787
10.9k
}
788
789
// Return the entry size of the xref stream and the processed W array.
790
std::pair<int, std::array<int, 3>>
791
QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
792
8.32k
{
793
8.32k
    auto W_obj = dict.getKey("/W");
794
8.32k
    if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
795
8.32k
          W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
796
252
        throw damaged("Cross-reference stream does not have a proper /W key");
797
252
    }
798
799
8.07k
    std::array<int, 3> W;
800
8.07k
    int entry_size = 0;
801
8.07k
    auto w_vector = W_obj.getArrayAsVector();
802
8.07k
    int max_bytes = sizeof(qpdf_offset_t);
803
32.1k
    for (size_t i = 0; i < 3; ++i) {
804
24.1k
        W[i] = w_vector[i].getIntValueAsInt();
805
24.1k
        if (W[i] > max_bytes) {
806
32
            throw damaged("Cross-reference stream's /W contains impossibly large values");
807
32
        }
808
24.1k
        if (W[i] < 0) {
809
51
            throw damaged("Cross-reference stream's /W contains negative values");
810
51
        }
811
24.0k
        entry_size += W[i];
812
24.0k
    }
813
7.99k
    if (entry_size == 0) {
814
3
        throw damaged("Cross-reference stream's /W indicates entry size of 0");
815
3
    }
816
7.98k
    return {entry_size, W};
817
7.99k
}
818
819
// Validate Size key and return the maximum number of entries that the xref stream can contain.
820
int
821
QPDF::processXRefSize(
822
    QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
823
7.98k
{
824
    // Number of entries is limited by the highest possible object id and stream size.
825
7.98k
    auto max_num_entries = std::numeric_limits<int>::max();
826
7.98k
    if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
827
0
        max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
828
0
    }
829
830
7.98k
    auto Size_obj = dict.getKey("/Size");
831
7.98k
    long long size;
832
7.98k
    if (!dict.getKey("/Size").getValueAsInt(size)) {
833
59
        throw damaged("Cross-reference stream does not have a proper /Size key");
834
7.92k
    } else if (size < 0) {
835
69
        throw damaged("Cross-reference stream has a negative /Size key");
836
7.85k
    } else if (size >= max_num_entries) {
837
84
        throw damaged("Cross-reference stream has an impossibly large /Size key");
838
84
    }
839
    // We are not validating that Size <= (Size key of parent xref / trailer).
840
7.77k
    return max_num_entries;
841
7.98k
}
842
843
// Return the number of entries of the xref stream and the processed Index array.
844
std::pair<int, std::vector<std::pair<int, int>>>
845
QPDF::processXRefIndex(
846
    QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
847
7.77k
{
848
7.77k
    auto size = dict.getKey("/Size").getIntValueAsInt();
849
7.77k
    auto Index_obj = dict.getKey("/Index");
850
851
7.77k
    if (Index_obj.isArray()) {
852
1.26k
        std::vector<std::pair<int, int>> indx;
853
1.26k
        int num_entries = 0;
854
1.26k
        auto index_vec = Index_obj.getArrayAsVector();
855
1.26k
        if ((index_vec.size() % 2) || index_vec.size() < 2) {
856
10
            throw damaged("Cross-reference stream's /Index has an invalid number of values");
857
10
        }
858
859
1.25k
        int i = 0;
860
1.25k
        long long first = 0;
861
11.1k
        for (auto& val: index_vec) {
862
11.1k
            if (val.isInteger()) {
863
11.1k
                if (i % 2) {
864
5.53k
                    auto count = val.getIntValue();
865
5.53k
                    if (count <= 0) {
866
42
                        throw damaged(
867
42
                            "Cross-reference stream section claims to contain " +
868
42
                            std::to_string(count) + " entries");
869
42
                    }
870
                    // We are guarding against the possibility of num_entries * entry_size
871
                    // overflowing. We are not checking that entries are in ascending order as
872
                    // required by the spec, which probably should generate a warning. We are also
873
                    // not checking that for each subsection first object number + number of entries
874
                    // <= /Size. The spec requires us to ignore object number > /Size.
875
5.49k
                    if (first > (max_num_entries - count) ||
876
5.49k
                        count > (max_num_entries - num_entries)) {
877
99
                        throw damaged(
878
99
                            "Cross-reference stream claims to contain too many entries: " +
879
99
                            std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
880
99
                            std::to_string(num_entries));
881
99
                    }
882
5.39k
                    indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
883
5.39k
                    num_entries += static_cast<int>(count);
884
5.63k
                } else {
885
5.63k
                    first = val.getIntValue();
886
5.63k
                    if (first < 0) {
887
8
                        throw damaged(
888
8
                            "Cross-reference stream's /Index contains a negative object id");
889
5.63k
                    } else if (first > max_num_entries) {
890
80
                        throw damaged(
891
80
                            "Cross-reference stream's /Index contains an impossibly "
892
80
                            "large object id");
893
80
                    }
894
5.63k
                }
895
11.1k
            } else {
896
20
                throw damaged(
897
20
                    "Cross-reference stream's /Index's item " + std::to_string(i) +
898
20
                    " is not an integer");
899
20
            }
900
10.9k
            i++;
901
10.9k
        }
902
1.00k
        QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
903
1.00k
        return {num_entries, indx};
904
6.50k
    } else if (Index_obj.isNull()) {
905
6.50k
        QTC::TC("qpdf", "QPDF xref /Index is null");
906
6.50k
        return {size, {{0, size}}};
907
6.50k
    } else {
908
6
        throw damaged("Cross-reference stream does not have a proper /Index key");
909
6
    }
910
7.77k
}
911
912
// Process the cross-reference stream xref_obj that was read from xref_offset: validate its /W,
// /Size and /Index keys, decode every entry from the stream data and record it through
// insertXrefEntry / insertFreeXrefEntry. The stream dictionary becomes the trailer if no trailer
// has been set yet. Returns the dictionary's /Prev value, or 0 if there is no /Prev key. Throws a
// damaged-PDF exception if the dictionary is invalid, if the stream holds less data than /W and
// /Index call for, or if /Prev is not an integer; surplus stream data only produces a warning.
qpdf_offset_t
QPDF::processXRefStream(
    qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj, bool in_stream_recovery)
{
    auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
        // A string_view is not guaranteed to be null-terminated, so build the message from its
        // explicit length instead of treating msg.data() as a C string.
        return damagedPDF("xref stream", xref_offset, std::string(msg));
    };

    auto dict = xref_obj.getDict();

    auto [entry_size, W] = processXRefW(dict, damaged);
    int max_num_entries = processXRefSize(dict, entry_size, damaged);
    auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);

    std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
    size_t actual_size = bp->getSize();
    auto expected_size = toS(entry_size) * toS(num_entries);

    if (expected_size != actual_size) {
        QPDFExc x = damaged(
            "Cross-reference stream data has the wrong size; expected = " +
            std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
        if (expected_size > actual_size) {
            throw x;
        } else {
            warn(x);
        }
    }

    bool saw_first_compressed_object = false;

    // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
    // We know that entry_size * num_entries is less or equal to the size of the buffer.
    auto p = bp->getBuffer();
    for (auto [obj, sec_entries]: indx) {
        // Process a subsection.
        for (int i = 0; i < sec_entries; ++i) {
            // Read this entry
            std::array<qpdf_offset_t, 3> fields{};
            if (W[0] == 0) {
                // A zero-width type field means the type defaults to 1.
                QTC::TC("qpdf", "QPDF default for xref stream field 0");
                fields[0] = 1;
            }
            // Each field is a big-endian integer of W[j] bytes.
            for (size_t j = 0; j < 3; ++j) {
                for (int k = 0; k < W[j]; ++k) {
                    fields[j] <<= 8;
                    fields[j] |= *p++;
                }
            }

            // Keep track of whether an entry of a type other than 2 (compressed) follows a
            // compressed one.
            if (saw_first_compressed_object) {
                if (fields[0] != 2) {
                    m->uncompressed_after_compressed = true;
                }
            } else if (fields[0] == 2) {
                saw_first_compressed_object = true;
            }
            if (obj == 0) {
                // This is needed by checkLinearization()
                m->first_xref_item_offset = xref_offset;
            } else if (fields[0] == 0) {
                // Ignore fields[2], which we don't care about in this case. This works around the
                // issue of some PDF files that put invalid values, like -1, here for deleted
                // objects.
                insertFreeXrefEntry(QPDFObjGen(obj, 0));
            } else {
                auto typ = toI(fields[0]);
                if (!in_stream_recovery || typ == 2) {
                    // If we are in xref stream recovery all actual uncompressed objects have
                    // already been inserted into the xref table. Avoid adding junk data into the
                    // xref table.
                    insertXrefEntry(obj, typ, fields[1], toI(fields[2]));
                }
            }
            ++obj;
        }
    }

    if (!m->trailer) {
        setTrailer(dict);
    }

    if (dict.hasKey("/Prev")) {
        if (!dict.getKey("/Prev").isInteger()) {
            throw damagedPDF(
                "xref stream", "/Prev key in xref stream dictionary is not an integer");
        }
        QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
        return dict.getKey("/Prev").getIntValue();
    } else {
        return 0;
    }
}
1007
1008
void
1009
QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
1010
658k
{
1011
    // Populate the xref table in such a way that the first reference to an object that we see,
1012
    // which is the one in the latest xref table in which it appears, is the one that gets stored.
1013
    // This works because we are reading more recent appends before older ones.
1014
1015
    // If there is already an entry for this object and generation in the table, it means that a
1016
    // later xref table has registered this object.  Disregard this one.
1017
658k
    int new_gen = f0 == 2 ? 0 : f2;
1018
1019
658k
    if (!(f0 == 1 || f0 == 2)) {
1020
6.83k
        return;
1021
6.83k
    }
1022
1023
651k
    if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
1024
        // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
1025
        // is probably no point having another warning but we could count invalid items in order to
1026
        // decide when to give up.
1027
53.1k
        QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
1028
        // ignore impossibly large object ids or object ids > Size.
1029
53.1k
        return;
1030
53.1k
    }
1031
1032
598k
    if (m->deleted_objects.contains(obj)) {
1033
841
        QTC::TC("qpdf", "QPDF xref deleted object");
1034
841
        return;
1035
841
    }
1036
1037
597k
    if (f0 == 2 && static_cast<int>(f1) == obj) {
1038
1.63k
        warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
1039
1.63k
        return;
1040
1.63k
    }
1041
1042
596k
    auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1043
596k
    if (!created) {
1044
142k
        QTC::TC("qpdf", "QPDF xref reused object");
1045
142k
        return;
1046
142k
    }
1047
1048
453k
    switch (f0) {
1049
149k
    case 1:
1050
        // f2 is generation
1051
149k
        QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1052
149k
        iter->second = QPDFXRefEntry(f1);
1053
149k
        break;
1054
1055
303k
    case 2:
1056
303k
        iter->second = QPDFXRefEntry(toI(f1), f2);
1057
303k
        break;
1058
1059
0
    default:
1060
0
        throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
1061
0
        break;
1062
453k
    }
1063
453k
}
1064
1065
void
1066
QPDF::insertFreeXrefEntry(QPDFObjGen og)
1067
56.0k
{
1068
56.0k
    if (!m->xref_table.contains(og)) {
1069
55.0k
        m->deleted_objects.insert(og.getObj());
1070
55.0k
    }
1071
56.0k
}
1072
1073
void
1074
QPDF::showXRefTable()
1075
0
{
1076
0
    auto& cout = *m->log->getInfo();
1077
0
    for (auto const& iter: m->xref_table) {
1078
0
        QPDFObjGen const& og = iter.first;
1079
0
        QPDFXRefEntry const& entry = iter.second;
1080
0
        cout << og.unparse('/') << ": ";
1081
0
        switch (entry.getType()) {
1082
0
        case 1:
1083
0
            cout << "uncompressed; offset = " << entry.getOffset();
1084
0
            break;
1085
1086
0
        case 2:
1087
0
            *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1088
0
                               << ", index = " << entry.getObjStreamIndex();
1089
0
            break;
1090
1091
0
        default:
1092
0
            throw std::logic_error("unknown cross-reference table type while showing xref_table");
1093
0
            break;
1094
0
        }
1095
0
        m->log->info("\n");
1096
0
    }
1097
0
}
1098
1099
// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1100
// return false. Otherwise return true.
1101
bool
1102
QPDF::resolveXRefTable()
1103
8.86k
{
1104
8.86k
    bool may_change = !m->reconstructed_xref;
1105
276k
    for (auto& iter: m->xref_table) {
1106
276k
        if (isUnresolved(iter.first)) {
1107
183k
            resolve(iter.first);
1108
183k
            if (may_change && m->reconstructed_xref) {
1109
21
                return false;
1110
21
            }
1111
183k
        }
1112
276k
    }
1113
8.84k
    return true;
1114
8.86k
}
1115
1116
// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1117
// cache.
1118
void
1119
QPDF::fixDanglingReferences(bool force)
1120
20.4k
{
1121
20.4k
    if (m->fixed_dangling_refs) {
1122
11.6k
        return;
1123
11.6k
    }
1124
8.84k
    if (!resolveXRefTable()) {
1125
21
        QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
1126
21
        resolveXRefTable();
1127
21
    }
1128
8.84k
    m->fixed_dangling_refs = true;
1129
8.84k
}
1130
1131
size_t
1132
QPDF::getObjectCount()
1133
12.2k
{
1134
    // This method returns the next available indirect object number. makeIndirectObject uses it for
1135
    // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1136
    // be in obj_cache.
1137
12.2k
    fixDanglingReferences();
1138
12.2k
    QPDFObjGen og;
1139
12.2k
    if (!m->obj_cache.empty()) {
1140
12.2k
        og = (*(m->obj_cache.rbegin())).first;
1141
12.2k
    }
1142
12.2k
    return toS(og.getObj());
1143
12.2k
}
1144
1145
std::vector<QPDFObjectHandle>
1146
QPDF::getAllObjects()
1147
0
{
1148
    // After fixDanglingReferences is called, all objects are in the object cache.
1149
0
    fixDanglingReferences();
1150
0
    std::vector<QPDFObjectHandle> result;
1151
0
    for (auto const& iter: m->obj_cache) {
1152
0
        result.push_back(newIndirect(iter.first, iter.second.object));
1153
0
    }
1154
0
    return result;
1155
0
}
1156
1157
void
1158
QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
1159
244k
{
1160
244k
    m->last_object_description.clear();
1161
244k
    if (!description.empty()) {
1162
20.5k
        m->last_object_description += description;
1163
20.5k
        if (og.isIndirect()) {
1164
9.54k
            m->last_object_description += ": ";
1165
9.54k
        }
1166
20.5k
    }
1167
244k
    if (og.isIndirect()) {
1168
233k
        m->last_object_description += "object " + og.unparse(' ');
1169
233k
    }
1170
244k
}
1171
1172
// Parse the trailer object at the current position of the input file and return it. A warning is
// issued if the object is empty or if a dictionary is followed by the stream keyword.
QPDFObjectHandle
QPDF::readTrailer()
{
    qpdf_offset_t const start = m->file->tell();
    auto [object, empty] = QPDFParser::parse(
        *m->file, "trailer", m->tokenizer, nullptr, *this, m->in_xref_reconstruction);
    if (!empty) {
        if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
            warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
        }
    } else {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        warn(damagedPDF("trailer", "empty object treated as null"));
    }
    // Override last_offset so that it points to the beginning of the object we just read
    m->file->setLastOffset(start);
    return object;
}
1189
1190
// Parse the object at the current position of the input file and return it. description and og
// are used to build the object description passed to the parser. If the parsed object is a
// dictionary followed by the stream keyword, the rest of the stream is read as well and the
// returned handle is the stream. A warning is issued if the object is empty or is not followed by
// endobj.
QPDFObjectHandle
QPDF::readObject(std::string const& description, QPDFObjGen og)
{
    setLastObjectDescription(description, og);
    qpdf_offset_t offset = m->file->tell();

    // Strings are only run through the decrypter when the file is encrypted.
    StringDecrypter decrypter{this, og};
    StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
    auto [object, empty] = QPDFParser::parse(
        *m->file,
        m->last_object_description,
        m->tokenizer,
        decrypter_ptr,
        *this,
        m->in_xref_reconstruction || m->in_read_xref_stream);
    if (empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
        return object;
    }
    auto token = readToken(*m->file);
    if (object.isDictionary() && token.isWord("stream")) {
        readStream(object, og, offset);
        token = readToken(*m->file);
    }
    if (!token.isWord("endobj")) {
        QTC::TC("qpdf", "QPDF err expected endobj");
        warn(damagedPDF("expected endobj"));
    }
    return object;
}
1223
1224
// After reading stream dictionary and stream keyword, read rest of stream.
1225
void
1226
QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1227
50.6k
{
1228
50.6k
    validateStreamLineEnd(object, og, offset);
1229
1230
    // Must get offset before accessing any additional objects since resolving a previously
1231
    // unresolved indirect object will change file position.
1232
50.6k
    qpdf_offset_t stream_offset = m->file->tell();
1233
50.6k
    size_t length = 0;
1234
1235
50.6k
    try {
1236
50.6k
        auto length_obj = object.getKey("/Length");
1237
1238
50.6k
        if (!length_obj.isInteger()) {
1239
21.2k
            if (length_obj.isNull()) {
1240
20.9k
                QTC::TC("qpdf", "QPDF stream without length");
1241
20.9k
                throw damagedPDF(offset, "stream dictionary lacks /Length key");
1242
20.9k
            }
1243
314
            QTC::TC("qpdf", "QPDF stream length not integer");
1244
314
            throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1245
21.2k
        }
1246
1247
29.3k
        length = toS(length_obj.getUIntValue());
1248
        // Seek in two steps to avoid potential integer overflow
1249
29.3k
        m->file->seek(stream_offset, SEEK_SET);
1250
29.3k
        m->file->seek(toO(length), SEEK_CUR);
1251
29.3k
        if (!readToken(*m->file).isWord("endstream")) {
1252
7.84k
            QTC::TC("qpdf", "QPDF missing endstream");
1253
7.84k
            throw damagedPDF("expected endstream");
1254
7.84k
        }
1255
32.3k
    } catch (QPDFExc& e) {
1256
32.3k
        if (m->attempt_recovery) {
1257
32.3k
            warn(e);
1258
32.3k
            length = recoverStreamLength(m->file, og, stream_offset);
1259
32.3k
        } else {
1260
0
            throw;
1261
0
        }
1262
32.3k
    }
1263
45.9k
    object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
1264
45.9k
}
1265
1266
void
1267
QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1268
50.6k
{
1269
    // The PDF specification states that the word "stream" should be followed by either a carriage
1270
    // return and a newline or by a newline alone.  It specifically disallowed following it by a
1271
    // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1272
    // CR NL sequence was part of the stream data.  However, some readers, including Adobe reader,
1273
    // accept a carriage return by itself when followed by a non-newline character, so that's what
1274
    // we do here. We have also seen files that have extraneous whitespace between the stream
1275
    // keyword and the newline.
1276
57.3k
    while (true) {
1277
57.3k
        char ch;
1278
57.3k
        if (m->file->read(&ch, 1) == 0) {
1279
            // A premature EOF here will result in some other problem that will get reported at
1280
            // another time.
1281
233
            return;
1282
233
        }
1283
57.1k
        if (ch == '\n') {
1284
            // ready to read stream data
1285
24.3k
            QTC::TC("qpdf", "QPDF stream with NL only");
1286
24.3k
            return;
1287
24.3k
        }
1288
32.7k
        if (ch == '\r') {
1289
            // Read another character
1290
21.0k
            if (m->file->read(&ch, 1) != 0) {
1291
21.0k
                if (ch == '\n') {
1292
                    // Ready to read stream data
1293
18.3k
                    QTC::TC("qpdf", "QPDF stream with CRNL");
1294
18.3k
                } else {
1295
                    // Treat the \r by itself as the whitespace after endstream and start reading
1296
                    // stream data in spite of not having seen a newline.
1297
2.70k
                    QTC::TC("qpdf", "QPDF stream with CR only");
1298
2.70k
                    m->file->unreadCh(ch);
1299
2.70k
                    warn(damagedPDF(
1300
2.70k
                        m->file->tell(), "stream keyword followed by carriage return only"));
1301
2.70k
                }
1302
21.0k
            }
1303
21.0k
            return;
1304
21.0k
        }
1305
11.7k
        if (!util::is_space(ch)) {
1306
4.92k
            QTC::TC("qpdf", "QPDF stream without newline");
1307
4.92k
            m->file->unreadCh(ch);
1308
4.92k
            warn(damagedPDF(
1309
4.92k
                m->file->tell(), "stream keyword not followed by proper line terminator"));
1310
4.92k
            return;
1311
4.92k
        }
1312
6.79k
        warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1313
6.79k
    }
1314
50.6k
}
1315
1316
// Parse object obj_id of object stream stream_id from input and return it. An empty object only
// produces a warning; the parsed object is returned either way.
QPDFObjectHandle
QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
{
    auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this);
    if (empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        std::string const where =
            m->file->getName() + " object stream " + std::to_string(stream_id);
        std::string const what = "object " + std::to_string(obj_id) + " 0, offset " +
            std::to_string(input.getLastOffset());
        warn(QPDFExc(qpdf_e_damaged_pdf, where, what, 0, "empty object treated as null"));
    }
    return object;
}
1333
1334
bool
1335
QPDF::findEndstream()
1336
34.2k
{
1337
    // Find endstream or endobj. Position the input at that token.
1338
34.2k
    auto t = readToken(*m->file, 20);
1339
34.2k
    if (t.isWord("endobj") || t.isWord("endstream")) {
1340
26.9k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
1341
26.9k
        return true;
1342
26.9k
    }
1343
7.30k
    return false;
1344
34.2k
}
1345
1346
size_t
1347
QPDF::recoverStreamLength(
1348
    std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
1349
27.8k
{
1350
    // Try to reconstruct stream length by looking for endstream or endobj
1351
27.8k
    warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
1352
1353
27.8k
    PatternFinder ef(*this, &QPDF::findEndstream);
1354
27.8k
    size_t length = 0;
1355
27.8k
    if (m->file->findFirst("end", stream_offset, 0, ef)) {
1356
26.9k
        length = toS(m->file->tell() - stream_offset);
1357
        // Reread endstream but, if it was endobj, don't skip that.
1358
26.9k
        QPDFTokenizer::Token t = readToken(*m->file);
1359
26.9k
        if (t.getValue() == "endobj") {
1360
18.5k
            m->file->seek(m->file->getLastOffset(), SEEK_SET);
1361
18.5k
        }
1362
26.9k
    }
1363
1364
27.8k
    if (length) {
1365
26.0k
        auto end = stream_offset + toO(length);
1366
26.0k
        qpdf_offset_t found_offset = 0;
1367
26.0k
        QPDFObjGen found_og;
1368
1369
        // Make sure this is inside this object
1370
632k
        for (auto const& [current_og, entry]: m->xref_table) {
1371
632k
            if (entry.getType() == 1) {
1372
597k
                qpdf_offset_t obj_offset = entry.getOffset();
1373
597k
                if (found_offset < obj_offset && obj_offset < end) {
1374
146k
                    found_offset = obj_offset;
1375
146k
                    found_og = current_og;
1376
146k
                }
1377
597k
            }
1378
632k
        }
1379
26.0k
        if (!found_offset || found_og == og) {
1380
            // If we are trying to recover an XRef stream the xref table will not contain and
1381
            // won't contain any entries, therefore we cannot check the found length. Otherwise we
1382
            // found endstream\nendobj within the space allowed for this object, so we're probably
1383
            // in good shape.
1384
24.5k
        } else {
1385
1.43k
            QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1386
1.43k
            length = 0;
1387
1.43k
        }
1388
26.0k
    }
1389
1390
27.8k
    if (length == 0) {
1391
3.25k
        warn(damagedPDF(
1392
3.25k
            *input, stream_offset, "unable to recover stream data; treating stream as empty"));
1393
24.5k
    } else {
1394
24.5k
        warn(damagedPDF(
1395
24.5k
            *input, stream_offset, "recovered stream length: " + std::to_string(length)));
1396
24.5k
    }
1397
1398
27.8k
    QTC::TC("qpdf", "QPDF recovered stream length");
1399
27.8k
    return length;
1400
27.8k
}
1401
1402
// Read the next token from input with this object's tokenizer, passing the last object
// description as context and max_len through unchanged.
QPDFTokenizer::Token
QPDF::readToken(InputSource& input, size_t max_len)
{
    auto const& context = m->last_object_description;
    auto& tokenizer = m->tokenizer;
    return tokenizer.readToken(input, context, true, max_len);
}
1407
1408
QPDFObjectHandle
1409
QPDF::readObjectAtOffset(
1410
    bool try_recovery,
1411
    qpdf_offset_t offset,
1412
    std::string const& description,
1413
    QPDFObjGen exp_og,
1414
    QPDFObjGen& og,
1415
    bool skip_cache_if_in_xref)
1416
123k
{
1417
123k
    bool check_og = true;
1418
123k
    if (exp_og.getObj() == 0) {
1419
        // This method uses an expect object ID of 0 to indicate that we don't know or don't care
1420
        // what the actual object ID is at this offset. This is true when we read the xref stream
1421
        // and linearization hint streams. In this case, we don't verify the expect object
1422
        // ID/generation against what was read from the file. There is also no reason to attempt
1423
        // xref recovery if we get a failure in this case since the read attempt was not triggered
1424
        // by an xref lookup.
1425
10.9k
        check_og = false;
1426
10.9k
        try_recovery = false;
1427
10.9k
    }
1428
123k
    setLastObjectDescription(description, exp_og);
1429
1430
123k
    if (!m->attempt_recovery) {
1431
0
        try_recovery = false;
1432
0
    }
1433
1434
    // Special case: if offset is 0, just return null.  Some PDF writers, in particular
1435
    // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
1436
    // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
1437
    // these.
1438
123k
    if (offset == 0) {
1439
197
        QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1440
197
        warn(damagedPDF(-1, "object has offset 0"));
1441
197
        return QPDFObjectHandle::newNull();
1442
197
    }
1443
1444
123k
    m->file->seek(offset, SEEK_SET);
1445
123k
    try {
1446
123k
        QPDFTokenizer::Token tobjid = readToken(*m->file);
1447
123k
        bool objidok = tobjid.isInteger();
1448
123k
        QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1449
123k
        if (!objidok) {
1450
1.92k
            QTC::TC("qpdf", "QPDF expected n n obj");
1451
1.92k
            throw damagedPDF(offset, "expected n n obj");
1452
1.92k
        }
1453
121k
        QPDFTokenizer::Token tgen = readToken(*m->file);
1454
121k
        bool genok = tgen.isInteger();
1455
121k
        QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1456
121k
        if (!genok) {
1457
145
            throw damagedPDF(offset, "expected n n obj");
1458
145
        }
1459
121k
        QPDFTokenizer::Token tobj = readToken(*m->file);
1460
1461
121k
        bool objok = tobj.isWord("obj");
1462
121k
        QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1463
1464
121k
        if (!objok) {
1465
151
            throw damagedPDF(offset, "expected n n obj");
1466
151
        }
1467
120k
        int objid = QUtil::string_to_int(tobjid.getValue().c_str());
1468
120k
        int generation = QUtil::string_to_int(tgen.getValue().c_str());
1469
120k
        og = QPDFObjGen(objid, generation);
1470
120k
        if (objid == 0) {
1471
20
            QTC::TC("qpdf", "QPDF object id 0");
1472
20
            throw damagedPDF(offset, "object with ID 0");
1473
20
        }
1474
120k
        if (check_og && (exp_og != og)) {
1475
63
            QTC::TC("qpdf", "QPDF err wrong objid/generation");
1476
63
            QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1477
63
            if (try_recovery) {
1478
                // Will be retried below
1479
63
                throw e;
1480
63
            } else {
1481
                // We can try reading the object anyway even if the ID doesn't match.
1482
0
                warn(e);
1483
0
            }
1484
63
        }
1485
120k
    } catch (QPDFExc& e) {
1486
2.30k
        if (try_recovery) {
1487
            // Try again after reconstructing xref table
1488
556
            reconstruct_xref(e);
1489
556
            if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
1490
240
                qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1491
240
                QPDFObjectHandle result =
1492
240
                    readObjectAtOffset(false, new_offset, description, exp_og, og, false);
1493
240
                QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1494
240
                return result;
1495
316
            } else {
1496
316
                QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1497
316
                warn(damagedPDF(
1498
316
                    "",
1499
316
                    -1,
1500
316
                    ("object " + exp_og.unparse(' ') +
1501
316
                     " not found in file after regenerating cross reference "
1502
316
                     "table")));
1503
316
                return QPDFObjectHandle::newNull();
1504
316
            }
1505
1.75k
        } else {
1506
1.75k
            throw;
1507
1.75k
        }
1508
2.30k
    }
1509
1510
120k
    QPDFObjectHandle oh = readObject(description, og);
1511
1512
120k
    if (isUnresolved(og)) {
1513
        // Store the object in the cache here so it gets cached whether we first know the offset or
1514
        // whether we first know the object ID and generation (in which case we would get here
1515
        // through resolve).
1516
1517
        // Determine the end offset of this object before and after white space.  We use these
1518
        // numbers to validate linearization hint tables.  Offsets and lengths of objects may imply
1519
        // the end of an object to be anywhere between these values.
1520
107k
        qpdf_offset_t end_before_space = m->file->tell();
1521
1522
        // skip over spaces
1523
228k
        while (true) {
1524
228k
            char ch;
1525
228k
            if (m->file->read(&ch, 1)) {
1526
223k
                if (!isspace(static_cast<unsigned char>(ch))) {
1527
102k
                    m->file->seek(-1, SEEK_CUR);
1528
102k
                    break;
1529
102k
                }
1530
223k
            } else {
1531
4.66k
                throw damagedPDF(m->file->tell(), "EOF after endobj");
1532
4.66k
            }
1533
228k
        }
1534
102k
        qpdf_offset_t end_after_space = m->file->tell();
1535
102k
        if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
1536
            // Ordinarily, an object gets read here when resolved through xref table or stream. In
1537
            // the special case of the xref stream and linearization hint tables, the offset comes
1538
            // from another source. For the specific case of xref streams, the xref stream is read
1539
            // and loaded into the object cache very early in parsing. Ordinarily, when a file is
1540
            // updated by appending, items inserted into the xref table in later updates take
1541
            // precedence over earlier items. In the special case of reusing the object number
1542
            // previously used as the xref stream, we have the following order of events:
1543
            //
1544
            // * reused object gets loaded into the xref table
1545
            // * old object is read here while reading xref streams
1546
            // * original xref entry is ignored (since already in xref table)
1547
            //
1548
            // It is the second step that causes a problem. Even though the xref table is correct in
1549
            // this case, the old object is already in the cache and so effectively prevails over
1550
            // the reused object. To work around this issue, we have a special case for the xref
1551
            // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
1552
            // don't cache what we read here.
1553
            //
1554
            // It is likely that the same bug may exist for linearization hint tables, but the
1555
            // existing code uses end_before_space and end_after_space from the cache, so fixing
1556
            // that would require more significant rework. The chances of a linearization hint
1557
            // stream being reused seems smaller because the xref stream is probably the highest
1558
            // object in the file and the linearization hint stream would be some random place in
1559
            // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
1560
            // could use !check_og in place of skip_cache_if_in_xref.
1561
16
            QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
1562
102k
        } else {
1563
102k
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
1564
102k
        }
1565
102k
    }
1566
1567
116k
    return oh;
1568
120k
}
1569
1570
std::shared_ptr<QPDFObject> const&
1571
QPDF::resolve(QPDFObjGen og)
1572
367k
{
1573
367k
    if (!isUnresolved(og)) {
1574
0
        return m->obj_cache[og].object;
1575
0
    }
1576
1577
367k
    if (m->resolving.contains(og)) {
1578
        // This can happen if an object references itself directly or indirectly in some key that
1579
        // has to be resolved during object parsing, such as stream length.
1580
148
        QTC::TC("qpdf", "QPDF recursion loop in resolve");
1581
148
        warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
1582
148
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1583
148
        return m->obj_cache[og].object;
1584
148
    }
1585
366k
    ResolveRecorder rr(this, og);
1586
1587
366k
    if (m->xref_table.contains(og)) {
1588
300k
        QPDFXRefEntry const& entry = m->xref_table[og];
1589
300k
        try {
1590
300k
            switch (entry.getType()) {
1591
112k
            case 1:
1592
112k
                {
1593
112k
                    qpdf_offset_t offset = entry.getOffset();
1594
                    // Object stored in cache by readObjectAtOffset
1595
112k
                    QPDFObjGen a_og;
1596
112k
                    QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
1597
112k
                }
1598
112k
                break;
1599
1600
188k
            case 2:
1601
188k
                resolveObjectsInStream(entry.getObjStreamNumber());
1602
188k
                break;
1603
1604
9
            default:
1605
9
                throw damagedPDF(
1606
9
                    "", -1, ("object " + og.unparse('/') + " has unexpected xref entry type"));
1607
300k
            }
1608
300k
        } catch (QPDFExc& e) {
1609
31.7k
            warn(e);
1610
31.7k
        } catch (std::exception& e) {
1611
662
            warn(damagedPDF(
1612
662
                "", -1, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
1613
662
        }
1614
300k
    }
1615
1616
358k
    if (isUnresolved(og)) {
1617
        // PDF spec says unknown objects resolve to the null object.
1618
257k
        QTC::TC("qpdf", "QPDF resolve failure to null");
1619
257k
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1620
257k
    }
1621
1622
358k
    auto& result(m->obj_cache[og].object);
1623
358k
    result->setDefaultDescription(this, og);
1624
358k
    return result;
1625
366k
}
1626
1627
void
1628
QPDF::resolveObjectsInStream(int obj_stream_number)
1629
188k
{
1630
188k
    auto damaged =
1631
188k
        [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
1632
17.2k
        return {
1633
17.2k
            qpdf_e_damaged_pdf,
1634
17.2k
            m->file->getName() + " object stream " + std::to_string(obj_stream_number),
1635
17.2k
            +"object " + std::to_string(id) + " 0",
1636
17.2k
            offset,
1637
17.2k
            msg,
1638
17.2k
            true};
1639
17.2k
    };
1640
1641
188k
    if (m->resolved_object_streams.contains(obj_stream_number)) {
1642
167k
        return;
1643
167k
    }
1644
20.7k
    m->resolved_object_streams.insert(obj_stream_number);
1645
    // Force resolution of object stream
1646
20.7k
    auto obj_stream = getObject(obj_stream_number, 0).as_stream();
1647
20.7k
    if (!obj_stream) {
1648
18.2k
        throw damagedPDF(
1649
18.2k
            "object " + std::to_string(obj_stream_number) + " 0",
1650
18.2k
            "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1651
18.2k
    }
1652
1653
    // For linearization data in the object, use the data from the object stream for the objects in
1654
    // the stream.
1655
2.49k
    QPDFObjGen stream_og(obj_stream_number, 0);
1656
2.49k
    qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1657
2.49k
    qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1658
1659
2.49k
    QPDFObjectHandle dict = obj_stream.getDict();
1660
2.49k
    if (!dict.isDictionaryOfType("/ObjStm")) {
1661
745
        QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
1662
745
        warn(damagedPDF(
1663
745
            "object " + std::to_string(obj_stream_number) + " 0",
1664
745
            "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1665
745
    }
1666
1667
2.49k
    unsigned int n{0};
1668
2.49k
    int first{0};
1669
2.49k
    if (!(dict.getKey("/N").getValueAsUInt(n) && dict.getKey("/First").getValueAsInt(first))) {
1670
202
        throw damagedPDF(
1671
202
            "object " + std::to_string(obj_stream_number) + " 0",
1672
202
            "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1673
202
    }
1674
1675
    // id, offset, size
1676
2.29k
    std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1677
1678
2.29k
    auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
1679
1680
2.29k
    BufferInputSource input("", bp.get());
1681
1682
2.29k
    const auto b_size = bp->getSize();
1683
2.29k
    const auto end_offset = static_cast<qpdf_offset_t>(b_size);
1684
2.29k
    auto b_start = bp->getBuffer();
1685
1686
2.29k
    if (first >= end_offset) {
1687
35
        throw damagedPDF(
1688
35
            "object " + std::to_string(obj_stream_number) + " 0",
1689
35
            "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
1690
35
    }
1691
1692
2.25k
    int id = 0;
1693
2.25k
    long long last_offset = -1;
1694
2.25k
    bool is_first = true;
1695
66.0k
    for (unsigned int i = 0; i < n; ++i) {
1696
64.0k
        auto tnum = readToken(input);
1697
64.0k
        auto id_offset = input.getLastOffset();
1698
64.0k
        auto toffset = readToken(input);
1699
64.0k
        if (!(tnum.isInteger() && toffset.isInteger())) {
1700
223
            throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1701
223
        }
1702
1703
63.7k
        int num = QUtil::string_to_int(tnum.getValue().c_str());
1704
63.7k
        long long offset = QUtil::string_to_int(toffset.getValue().c_str());
1705
1706
63.7k
        if (num == obj_stream_number) {
1707
493
            QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1708
493
            warn(damaged(num, id_offset, "object stream claims to contain itself"));
1709
493
            continue;
1710
493
        }
1711
1712
63.2k
        if (num < 1) {
1713
823
            QTC::TC("qpdf", "QPDF object stream contains id < 1");
1714
823
            warn(damaged(num, id_offset, "object id is invalid"s));
1715
823
            continue;
1716
823
        }
1717
1718
62.4k
        if (offset <= last_offset) {
1719
9.56k
            QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1720
9.56k
            warn(damaged(
1721
9.56k
                num,
1722
9.56k
                input.getLastOffset(),
1723
9.56k
                "offset " + std::to_string(offset) +
1724
9.56k
                    " is invalid (must be larger than previous offset " +
1725
9.56k
                    std::to_string(last_offset) + ")"));
1726
9.56k
            continue;
1727
9.56k
        }
1728
1729
52.9k
        if (num > m->xref_table_max_id) {
1730
2.48k
            continue;
1731
2.48k
        }
1732
1733
50.4k
        if (first + offset >= end_offset) {
1734
6.14k
            warn(damaged(
1735
6.14k
                num, input.getLastOffset(), "offset " + std::to_string(offset) + " is too large"));
1736
6.14k
            continue;
1737
6.14k
        }
1738
1739
44.2k
        if (is_first) {
1740
1.04k
            is_first = false;
1741
43.2k
        } else {
1742
43.2k
            offsets.emplace_back(
1743
43.2k
                id, last_offset + first, static_cast<size_t>(offset - last_offset));
1744
43.2k
        }
1745
1746
44.2k
        last_offset = offset;
1747
44.2k
        id = num;
1748
44.2k
    }
1749
1750
2.03k
    if (!is_first) {
1751
        // We found at least one valid entry.
1752
857
        offsets.emplace_back(
1753
857
            id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1754
857
    }
1755
1756
    // To avoid having to read the object stream multiple times, store all objects that would be
1757
    // found here in the cache.  Remember that some objects stored here might have been overridden
1758
    // by new objects appended to the file, so it is necessary to recheck the xref table and only
1759
    // cache what would actually be resolved here.
1760
37.3k
    for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
1761
37.3k
        QPDFObjGen og(obj_id, 0);
1762
37.3k
        auto entry = m->xref_table.find(og);
1763
37.3k
        if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1764
37.3k
            entry->second.getObjStreamNumber() == obj_stream_number) {
1765
35.3k
            Buffer obj_buffer{b_start + obj_offset, obj_size};
1766
35.3k
            is::OffsetBuffer in("", &obj_buffer, obj_offset);
1767
35.3k
            auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1768
35.3k
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
1769
35.3k
        } else {
1770
2.00k
            QTC::TC("qpdf", "QPDF not caching overridden objstm object");
1771
2.00k
        }
1772
37.3k
    }
1773
2.03k
}
1774
1775
// Give `obj` its default description for this QPDF and object id/generation, then wrap it in a
// handle.
QPDFObjectHandle
QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
{
    obj->setDefaultDescription(this, og);
    return QPDFObjectHandle(obj);
}
1781
1782
void
1783
QPDF::updateCache(
1784
    QPDFObjGen og,
1785
    std::shared_ptr<QPDFObject> const& object,
1786
    qpdf_offset_t end_before_space,
1787
    qpdf_offset_t end_after_space,
1788
    bool destroy)
1789
395k
{
1790
395k
    object->setObjGen(this, og);
1791
395k
    if (isCached(og)) {
1792
211k
        auto& cache = m->obj_cache[og];
1793
211k
        object->move_to(cache.object, destroy);
1794
211k
        cache.end_before_space = end_before_space;
1795
211k
        cache.end_after_space = end_after_space;
1796
211k
    } else {
1797
183k
        m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
1798
183k
    }
1799
395k
}
1800
1801
bool
1802
QPDF::isCached(QPDFObjGen og)
1803
1.50M
{
1804
1.50M
    return m->obj_cache.contains(og);
1805
1.50M
}
1806
1807
bool
1808
QPDF::isUnresolved(QPDFObjGen og)
1809
1.11M
{
1810
1.11M
    return !isCached(og) || m->obj_cache[og].object->isUnresolved();
1811
1.11M
}
1812
1813
// Return the object id/generation for a newly created object: one past the current object count,
// with generation 0. Throws std::range_error if the count is already the largest int value.
QPDFObjGen
QPDF::nextObjGen()
{
    int const current_max = toI(getObjectCount());
    if (current_max == std::numeric_limits<int>::max()) {
        throw std::range_error("max object id is too high to create new objects");
    }
    return QPDFObjGen(current_max + 1, 0);
}
1822
1823
// Cache `obj` under the next free object id (with no end offsets, i.e. -1/-1) and return an
// indirect handle to the cached object.
QPDFObjectHandle
QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
{
    QPDFObjGen const fresh_og = nextObjGen();
    auto& slot = m->obj_cache[fresh_og];
    slot = ObjCache(obj, -1, -1);
    return newIndirect(fresh_og, slot.object);
}
1830
1831
// Make the object behind `oh` an indirect object of this QPDF and return a handle to it.
// Throws std::logic_error if `oh` is uninitialized.
QPDFObjectHandle
QPDF::makeIndirectObject(QPDFObjectHandle oh)
{
    if (oh) {
        return makeIndirectFromQPDFObject(oh.getObj());
    }
    throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
}
1839
1840
std::shared_ptr<QPDFObject>
1841
QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
1842
366k
{
1843
    // This method is called by the parser and therefore must not resolve any objects.
1844
366k
    auto og = QPDFObjGen(id, gen);
1845
366k
    if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
1846
143k
        return iter->second.object;
1847
143k
    }
1848
223k
    if (m->xref_table.contains(og) || !m->parsed) {
1849
178k
        return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
1850
178k
            .first->second.object;
1851
178k
    }
1852
45.0k
    if (parse_pdf) {
1853
45.0k
        return QPDFObject::create<QPDF_Null>();
1854
45.0k
    }
1855
0
    return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
1856
45.0k
}
1857
1858
std::shared_ptr<QPDFObject>
1859
QPDF::getObjectForJSON(int id, int gen)
1860
0
{
1861
0
    auto og = QPDFObjGen(id, gen);
1862
0
    auto [it, inserted] = m->obj_cache.try_emplace(og);
1863
0
    auto& obj = it->second.object;
1864
0
    if (inserted) {
1865
0
        obj = (m->parsed && !m->xref_table.contains(og))
1866
0
            ? QPDFObject::create<QPDF_Null>(this, og)
1867
0
            : QPDFObject::create<QPDF_Unresolved>(this, og);
1868
0
    }
1869
0
    return obj;
1870
0
}
1871
1872
// Return a handle for object `og` without forcing it to be read.
//
// A cached object is returned directly. If the file has been parsed and the xref table has no
// entry for `og`, an uncached null is returned. Otherwise an unresolved placeholder is added to
// the cache (with end offsets -1/-1) and returned.
QPDFObjectHandle
QPDF::getObject(QPDFObjGen og)
{
    auto cached = m->obj_cache.find(og);
    if (cached != m->obj_cache.end()) {
        return {cached->second.object};
    }

    if (m->parsed && !m->xref_table.contains(og)) {
        return QPDFObject::create<QPDF_Null>();
    }

    auto emplaced =
        m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
    return {emplaced.first->second.object};
}
1885
1886
void
1887
QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
1888
0
{
1889
0
    replaceObject(QPDFObjGen(objid, generation), oh);
1890
0
}
1891
1892
void
1893
QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
1894
0
{
1895
0
    if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
1896
0
        QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
1897
0
        throw std::logic_error("QPDF::replaceObject called with indirect object handle");
1898
0
    }
1899
0
    updateCache(og, oh.getObj(), -1, -1, false);
1900
0
}
1901
1902
void
1903
QPDF::removeObject(QPDFObjGen og)
1904
11.5k
{
1905
11.5k
    m->xref_table.erase(og);
1906
11.5k
    if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
1907
        // Take care of any object handles that may be floating around.
1908
1.03k
        cached->second.object->assign_null();
1909
1.03k
        cached->second.object->setObjGen(nullptr, QPDFObjGen());
1910
1.03k
        m->obj_cache.erase(cached);
1911
1.03k
    }
1912
11.5k
}
1913
1914
void
1915
QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
1916
0
{
1917
0
    QTC::TC("qpdf", "QPDF replaceReserved");
1918
0
    auto tc = reserved.getTypeCode();
1919
0
    if (!(tc == ::ot_reserved || tc == ::ot_null)) {
1920
0
        throw std::logic_error("replaceReserved called with non-reserved object");
1921
0
    }
1922
0
    replaceObject(reserved.getObjGen(), replacement);
1923
0
}
1924
1925
void
1926
QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
1927
0
{
1928
0
    swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
1929
0
}
1930
1931
void
1932
QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
1933
0
{
1934
    // Force objects to be read from the input source if needed, then swap them in the cache.
1935
0
    resolve(og1);
1936
0
    resolve(og2);
1937
0
    m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
1938
0
}
1939
1940
size_t
1941
QPDF::tableSize()
1942
8.23k
{
1943
    // If obj_cache is dense, accommodate all object in tables,else accommodate only original
1944
    // objects.
1945
8.23k
    auto max_xref = !m->xref_table.empty() ? m->xref_table.crbegin()->first.getObj() : 0;
1946
8.23k
    auto max_obj = !m->obj_cache.empty() ? m->obj_cache.crbegin()->first.getObj() : 0;
1947
8.23k
    auto max_id = std::numeric_limits<int>::max() - 1;
1948
8.23k
    if (max_obj >= max_id || max_xref >= max_id) {
1949
        // Temporary fix. Long-term solution is
1950
        // - QPDFObjGen to enforce objgens are valid and sensible
1951
        // - xref table and obj cache to protect against insertion of impossibly large obj ids
1952
2
        stopOnError("Impossibly large object id encountered.");
1953
2
    }
1954
8.23k
    if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
1955
6.21k
        return toS(++max_obj);
1956
6.21k
    }
1957
2.02k
    return toS(++max_xref);
1958
8.23k
}
1959
1960
std::vector<QPDFObjGen>
1961
QPDF::getCompressibleObjVector()
1962
0
{
1963
0
    return getCompressibleObjGens<QPDFObjGen>();
1964
0
}
1965
1966
std::vector<bool>
1967
QPDF::getCompressibleObjSet()
1968
773
{
1969
773
    return getCompressibleObjGens<bool>();
1970
773
}
1971
1972
template <typename T>
1973
std::vector<T>
1974
QPDF::getCompressibleObjGens()
1975
773
{
1976
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
1977
    // by traversing the document from the root, including a traversal of the pages tree.  This
1978
    // makes that objects that are on the same page are more likely to be in the same object stream,
1979
    // which is slightly more efficient, particularly with linearized files.  This is better than
1980
    // iterating through the xref table since it avoids preserving orphaned items.
1981
1982
    // Exclude encryption dictionary, if any
1983
773
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1984
773
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1985
1986
773
    const size_t max_obj = getObjectCount();
1987
773
    std::vector<bool> visited(max_obj, false);
1988
773
    std::vector<QPDFObjectHandle> queue;
1989
773
    queue.reserve(512);
1990
773
    queue.push_back(m->trailer);
1991
773
    std::vector<T> result;
1992
773
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
1993
0
        result.reserve(m->obj_cache.size());
1994
773
    } else if constexpr (std::is_same_v<T, bool>) {
1995
773
        result.resize(max_obj + 1U, false);
1996
    } else {
1997
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1998
    }
1999
350k
    while (!queue.empty()) {
2000
349k
        auto obj = queue.back();
2001
349k
        queue.pop_back();
2002
349k
        if (obj.getObjectID() > 0) {
2003
63.6k
            QPDFObjGen og = obj.getObjGen();
2004
63.6k
            const size_t id = toS(og.getObj() - 1);
2005
63.6k
            if (id >= max_obj) {
2006
0
                throw std::logic_error(
2007
0
                    "unexpected object id encountered in getCompressibleObjGens");
2008
0
            }
2009
63.6k
            if (visited[id]) {
2010
21.6k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2011
21.6k
                continue;
2012
21.6k
            }
2013
2014
            // Check whether this is the current object. If not, remove it (which changes it into a
2015
            // direct null and therefore stops us from revisiting it) and move on to the next object
2016
            // in the queue.
2017
42.0k
            auto upper = m->obj_cache.upper_bound(og);
2018
42.0k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2019
647
                removeObject(og);
2020
647
                continue;
2021
647
            }
2022
2023
41.3k
            visited[id] = true;
2024
2025
41.3k
            if (og == encryption_dict_og) {
2026
41
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2027
41.3k
            } else if (!(obj.isStream() ||
2028
41.3k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2029
37.9k
                          obj.hasKey("/Contents")))) {
2030
37.9k
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2031
0
                    result.push_back(og);
2032
37.9k
                } else if constexpr (std::is_same_v<T, bool>) {
2033
37.9k
                    result[id + 1U] = true;
2034
37.9k
                }
2035
37.9k
            }
2036
41.3k
        }
2037
327k
        if (obj.isStream()) {
2038
3.38k
            auto dict = obj.getDict().as_dictionary();
2039
3.38k
            auto end = dict.crend();
2040
20.5k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2041
17.2k
                std::string const& key = iter->first;
2042
17.2k
                QPDFObjectHandle const& value = iter->second;
2043
17.2k
                if (!value.null()) {
2044
15.6k
                    if (key == "/Length") {
2045
                        // omit stream lengths
2046
3.04k
                        if (value.isIndirect()) {
2047
117
                            QTC::TC("qpdf", "QPDF exclude indirect length");
2048
117
                        }
2049
12.6k
                    } else {
2050
12.6k
                        queue.emplace_back(value);
2051
12.6k
                    }
2052
15.6k
                }
2053
17.2k
            }
2054
324k
        } else if (obj.isDictionary()) {
2055
25.6k
            auto dict = obj.as_dictionary();
2056
25.6k
            auto end = dict.crend();
2057
130k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2058
104k
                if (!iter->second.null()) {
2059
91.0k
                    queue.emplace_back(iter->second);
2060
91.0k
                }
2061
104k
            }
2062
298k
        } else if (auto items = obj.as_array()) {
2063
298k
            queue.insert(queue.end(), items.crbegin(), items.crend());
2064
298k
        }
2065
327k
    }
2066
2067
773
    return result;
2068
773
}
Unexecuted instantiation: std::__1::vector<QPDFObjGen, std::__1::allocator<QPDFObjGen> > QPDF::getCompressibleObjGens<QPDFObjGen>()
std::__1::vector<bool, std::__1::allocator<bool> > QPDF::getCompressibleObjGens<bool>()
Line
Count
Source
1975
773
{
1976
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
1977
    // by traversing the document from the root, including a traversal of the pages tree.  This
1978
    // makes that objects that are on the same page are more likely to be in the same object stream,
1979
    // which is slightly more efficient, particularly with linearized files.  This is better than
1980
    // iterating through the xref table since it avoids preserving orphaned items.
1981
1982
    // Exclude encryption dictionary, if any
1983
773
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1984
773
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1985
1986
773
    const size_t max_obj = getObjectCount();
1987
773
    std::vector<bool> visited(max_obj, false);
1988
773
    std::vector<QPDFObjectHandle> queue;
1989
773
    queue.reserve(512);
1990
773
    queue.push_back(m->trailer);
1991
773
    std::vector<T> result;
1992
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
1993
        result.reserve(m->obj_cache.size());
1994
773
    } else if constexpr (std::is_same_v<T, bool>) {
1995
773
        result.resize(max_obj + 1U, false);
1996
    } else {
1997
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1998
    }
1999
350k
    while (!queue.empty()) {
2000
349k
        auto obj = queue.back();
2001
349k
        queue.pop_back();
2002
349k
        if (obj.getObjectID() > 0) {
2003
63.6k
            QPDFObjGen og = obj.getObjGen();
2004
63.6k
            const size_t id = toS(og.getObj() - 1);
2005
63.6k
            if (id >= max_obj) {
2006
0
                throw std::logic_error(
2007
0
                    "unexpected object id encountered in getCompressibleObjGens");
2008
0
            }
2009
63.6k
            if (visited[id]) {
2010
21.6k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2011
21.6k
                continue;
2012
21.6k
            }
2013
2014
            // Check whether this is the current object. If not, remove it (which changes it into a
2015
            // direct null and therefore stops us from revisiting it) and move on to the next object
2016
            // in the queue.
2017
42.0k
            auto upper = m->obj_cache.upper_bound(og);
2018
42.0k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2019
647
                removeObject(og);
2020
647
                continue;
2021
647
            }
2022
2023
41.3k
            visited[id] = true;
2024
2025
41.3k
            if (og == encryption_dict_og) {
2026
41
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2027
41.3k
            } else if (!(obj.isStream() ||
2028
41.3k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2029
37.9k
                          obj.hasKey("/Contents")))) {
2030
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2031
                    result.push_back(og);
2032
37.9k
                } else if constexpr (std::is_same_v<T, bool>) {
2033
37.9k
                    result[id + 1U] = true;
2034
37.9k
                }
2035
37.9k
            }
2036
41.3k
        }
2037
327k
        if (obj.isStream()) {
2038
3.38k
            auto dict = obj.getDict().as_dictionary();
2039
3.38k
            auto end = dict.crend();
2040
20.5k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2041
17.2k
                std::string const& key = iter->first;
2042
17.2k
                QPDFObjectHandle const& value = iter->second;
2043
17.2k
                if (!value.null()) {
2044
15.6k
                    if (key == "/Length") {
2045
                        // omit stream lengths
2046
3.04k
                        if (value.isIndirect()) {
2047
117
                            QTC::TC("qpdf", "QPDF exclude indirect length");
2048
117
                        }
2049
12.6k
                    } else {
2050
12.6k
                        queue.emplace_back(value);
2051
12.6k
                    }
2052
15.6k
                }
2053
17.2k
            }
2054
324k
        } else if (obj.isDictionary()) {
2055
25.6k
            auto dict = obj.as_dictionary();
2056
25.6k
            auto end = dict.crend();
2057
130k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2058
104k
                if (!iter->second.null()) {
2059
91.0k
                    queue.emplace_back(iter->second);
2060
91.0k
                }
2061
104k
            }
2062
298k
        } else if (auto items = obj.as_array()) {
2063
298k
            queue.insert(queue.end(), items.crbegin(), items.crend());
2064
298k
        }
2065
327k
    }
2066
2067
773
    return result;
2068
773
}