Coverage Report

Created: 2025-07-14 06:16

/src/qpdf/libqpdf/QPDF_objects.cc
Line
Count
Source (jump to first uncovered line)
1
#include <qpdf/qpdf-config.h> // include first for large file support
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <array>
6
#include <atomic>
7
#include <cstring>
8
#include <limits>
9
#include <map>
10
#include <regex>
11
#include <sstream>
12
#include <vector>
13
14
#include <qpdf/BufferInputSource.hh>
15
#include <qpdf/FileInputSource.hh>
16
#include <qpdf/InputSource_private.hh>
17
#include <qpdf/OffsetInputSource.hh>
18
#include <qpdf/Pipeline.hh>
19
#include <qpdf/QPDFExc.hh>
20
#include <qpdf/QPDFLogger.hh>
21
#include <qpdf/QPDFObjectHandle_private.hh>
22
#include <qpdf/QPDFObject_private.hh>
23
#include <qpdf/QPDFParser.hh>
24
#include <qpdf/QTC.hh>
25
#include <qpdf/QUtil.hh>
26
#include <qpdf/Util.hh>
27
28
using namespace qpdf;
29
using namespace std::literals;
30
31
namespace
32
{
33
    class InvalidInputSource: public InputSource
34
    {
35
      public:
36
        ~InvalidInputSource() override = default;
37
        qpdf_offset_t
38
        findAndSkipNextEOL() override
39
0
        {
40
0
            throwException();
41
0
            return 0;
42
0
        }
43
        std::string const&
44
        getName() const override
45
0
        {
46
0
            static std::string name("closed input source");
47
0
            return name;
48
0
        }
49
        qpdf_offset_t
50
        tell() override
51
0
        {
52
0
            throwException();
53
0
            return 0;
54
0
        }
55
        void
56
        seek(qpdf_offset_t offset, int whence) override
57
0
        {
58
0
            throwException();
59
0
        }
60
        void
61
        rewind() override
62
0
        {
63
0
            throwException();
64
0
        }
65
        size_t
66
        read(char* buffer, size_t length) override
67
0
        {
68
0
            throwException();
69
0
            return 0;
70
0
        }
71
        void
72
        unreadCh(char ch) override
73
0
        {
74
0
            throwException();
75
0
        }
76
77
      private:
78
        void
79
        throwException()
80
0
        {
81
0
            throw std::logic_error(
82
0
                "QPDF operation attempted on a QPDF object with no input "
83
0
                "source. QPDF operations are invalid before processFile (or "
84
0
                "another process method) or after closeInputSource");
85
0
        }
86
    };
87
} // namespace
88
89
bool
90
QPDF::findStartxref()
91
7.64k
{
92
7.64k
    if (readToken(*m->file).isWord("startxref") && readToken(*m->file).isInteger()) {
93
        // Position in front of offset token
94
5.88k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
95
5.88k
        return true;
96
5.88k
    }
97
1.75k
    return false;
98
7.64k
}
99
100
void
101
QPDF::parse(char const* password)
102
20.3k
{
103
20.3k
    if (password) {
104
0
        m->encp->provided_password = password;
105
0
    }
106
107
    // Find the header anywhere in the first 1024 bytes of the file.
108
20.3k
    PatternFinder hf(*this, &QPDF::findHeader);
109
20.3k
    if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
110
17.5k
        QTC::TC("qpdf", "QPDF not a pdf file");
111
17.5k
        warn(damagedPDF("", -1, "can't find PDF header"));
112
        // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
113
17.5k
        m->pdf_version = "1.2";
114
17.5k
    }
115
116
    // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file.  We add an extra
117
    // 30 characters to leave room for the startxref stuff.
118
20.3k
    m->file->seek(0, SEEK_END);
119
20.3k
    qpdf_offset_t end_offset = m->file->tell();
120
20.3k
    m->xref_table_max_offset = end_offset;
121
    // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
122
    // scenarios at least 3 bytes are required.
123
20.3k
    if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
124
20.3k
        m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
125
20.3k
    }
126
20.3k
    qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
127
20.3k
    PatternFinder sf(*this, &QPDF::findStartxref);
128
20.3k
    qpdf_offset_t xref_offset = 0;
129
20.3k
    if (m->file->findLast("startxref", start_offset, 0, sf)) {
130
5.53k
        xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
131
5.53k
    }
132
133
20.3k
    try {
134
20.3k
        if (xref_offset == 0) {
135
14.9k
            QTC::TC("qpdf", "QPDF can't find startxref");
136
14.9k
            throw damagedPDF("", -1, "can't find startxref");
137
14.9k
        }
138
5.45k
        try {
139
5.45k
            read_xref(xref_offset);
140
5.45k
        } catch (QPDFExc&) {
141
3.81k
            throw;
142
3.81k
        } catch (std::exception& e) {
143
447
            throw damagedPDF("", -1, std::string("error reading xref: ") + e.what());
144
447
        }
145
19.1k
    } catch (QPDFExc& e) {
146
19.1k
        if (m->attempt_recovery) {
147
19.1k
            reconstruct_xref(e, xref_offset > 0);
148
19.1k
            QTC::TC("qpdf", "QPDF reconstructed xref table");
149
19.1k
        } else {
150
0
            throw;
151
0
        }
152
19.1k
    }
153
154
19.1k
    initializeEncryption();
155
8.78k
    m->parsed = true;
156
8.78k
    if (!m->xref_table.empty() && !getRoot().getKey("/Pages").isDictionary()) {
157
        // QPDFs created from JSON have an empty xref table and no root object yet.
158
5
        throw damagedPDF("", -1, "unable to find page tree");
159
5
    }
160
8.78k
}
161
162
void
163
QPDF::inParse(bool v)
164
372k
{
165
372k
    if (m->in_parse == v) {
166
        // This happens if QPDFParser::parse tries to resolve an indirect object while it is
167
        // parsing.
168
0
        throw std::logic_error(
169
0
            "QPDF: re-entrant parsing detected. This is a qpdf bug."
170
0
            " Please report at https://github.com/qpdf/qpdf/issues.");
171
0
    }
172
372k
    m->in_parse = v;
173
372k
}
174
175
void
176
QPDF::setTrailer(QPDFObjectHandle obj)
177
6.36k
{
178
6.36k
    if (m->trailer) {
179
216
        return;
180
216
    }
181
6.14k
    m->trailer = obj;
182
6.14k
}
183
184
void
185
QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
186
23.0k
{
187
23.0k
    if (m->reconstructed_xref) {
188
        // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
189
        // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
190
3.73k
        throw e;
191
3.73k
    }
192
193
    // If recovery generates more than 1000 warnings, the file is so severely damaged that there
194
    // probably is no point trying to continue.
195
19.3k
    const auto max_warnings = m->warnings.size() + 1000U;
196
1.94M
    auto check_warnings = [this, max_warnings]() {
197
1.94M
        if (m->warnings.size() > max_warnings) {
198
0
            throw damagedPDF("", -1, "too many errors while reconstructing cross-reference table");
199
0
        }
200
1.94M
    };
201
202
19.3k
    m->reconstructed_xref = true;
203
    // We may find more objects, which may contain dangling references.
204
19.3k
    m->fixed_dangling_refs = false;
205
206
19.3k
    warn(damagedPDF("", -1, "file is damaged"));
207
19.3k
    warn(e);
208
19.3k
    warn(damagedPDF("", -1, "Attempting to reconstruct cross-reference table"));
209
210
    // Delete all references to type 1 (uncompressed) objects
211
19.3k
    std::vector<QPDFObjGen> to_delete;
212
53.4k
    for (auto const& iter: m->xref_table) {
213
53.4k
        if (iter.second.getType() == 1) {
214
34.0k
            to_delete.emplace_back(iter.first);
215
34.0k
        }
216
53.4k
    }
217
34.0k
    for (auto const& iter: to_delete) {
218
34.0k
        m->xref_table.erase(iter);
219
34.0k
    }
220
221
19.3k
    std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
222
19.3k
    std::vector<qpdf_offset_t> trailers;
223
19.3k
    std::vector<qpdf_offset_t> startxrefs;
224
225
19.3k
    m->file->seek(0, SEEK_END);
226
19.3k
    qpdf_offset_t eof = m->file->tell();
227
19.3k
    m->file->seek(0, SEEK_SET);
228
    // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
229
19.3k
    static size_t const MAX_LEN = 10;
230
1.74M
    while (m->file->tell() < eof) {
231
1.72M
        QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
232
1.72M
        qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
233
1.72M
        if (t1.isInteger()) {
234
299k
            auto pos = m->file->tell();
235
299k
            auto t2 = readToken(*m->file, MAX_LEN);
236
299k
            if (t2.isInteger() && readToken(*m->file, MAX_LEN).isWord("obj")) {
237
180k
                int obj = QUtil::string_to_int(t1.getValue().c_str());
238
180k
                int gen = QUtil::string_to_int(t2.getValue().c_str());
239
180k
                if (obj <= m->xref_table_max_id) {
240
179k
                    found_objects.emplace_back(obj, gen, token_start);
241
179k
                } else {
242
882
                    warn(damagedPDF(
243
882
                        "", -1, "ignoring object with impossibly large id " + std::to_string(obj)));
244
882
                }
245
180k
            }
246
299k
            m->file->seek(pos, SEEK_SET);
247
1.42M
        } else if (!m->trailer && t1.isWord("trailer")) {
248
34.8k
            trailers.emplace_back(m->file->tell());
249
1.38M
        } else if (!found_startxref && t1.isWord("startxref")) {
250
18.9k
            startxrefs.emplace_back(m->file->tell());
251
18.9k
        }
252
1.72M
        check_warnings();
253
1.72M
        m->file->findAndSkipNextEOL();
254
1.72M
    }
255
256
19.3k
    if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
257
19.3k
        startxrefs.back() > std::get<2>(found_objects.back())) {
258
397
        auto xref_backup{m->xref_table};
259
397
        try {
260
397
            m->file->seek(startxrefs.back(), SEEK_SET);
261
397
            if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
262
230
                read_xref(offset);
263
264
230
                if (getRoot().getKey("/Pages").isDictionary()) {
265
22
                    QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
266
22
                    warn(damagedPDF(
267
22
                        "", -1, "startxref was more than 1024 bytes before end of file"));
268
22
                    initializeEncryption();
269
22
                    m->parsed = true;
270
22
                    m->reconstructed_xref = false;
271
22
                    return;
272
22
                }
273
230
            }
274
397
        } catch (...) {
275
            // ok, bad luck. Do recovery.
276
209
        }
277
375
        m->xref_table = std::move(xref_backup);
278
375
    }
279
280
19.3k
    auto rend = found_objects.rend();
281
198k
    for (auto it = found_objects.rbegin(); it != rend; it++) {
282
179k
        auto [obj, gen, token_start] = *it;
283
179k
        insertXrefEntry(obj, 1, token_start, gen);
284
179k
        check_warnings();
285
179k
    }
286
19.3k
    m->deleted_objects.clear();
287
288
33.3k
    for (auto it = trailers.rbegin(); it != trailers.rend(); it++) {
289
16.8k
        m->file->seek(*it, SEEK_SET);
290
16.8k
        auto t = readTrailer();
291
16.8k
        if (!t.isDictionary()) {
292
            // Oh well.  It was worth a try.
293
12.5k
        } else {
294
4.22k
            if (t.hasKey("/Root")) {
295
2.80k
                m->trailer = t;
296
2.80k
                break;
297
2.80k
            }
298
1.42k
            warn(damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
299
1.42k
        }
300
14.0k
        check_warnings();
301
14.0k
    }
302
303
19.3k
    if (!m->trailer) {
304
15.7k
        qpdf_offset_t max_offset{0};
305
15.7k
        size_t max_size{0};
306
        // If there are any xref streams, take the last one to appear.
307
72.7k
        for (auto const& iter: m->xref_table) {
308
72.7k
            auto entry = iter.second;
309
72.7k
            if (entry.getType() != 1) {
310
106
                continue;
311
106
            }
312
72.6k
            auto oh = getObject(iter.first);
313
72.6k
            try {
314
72.6k
                if (!oh.isStreamOfType("/XRef")) {
315
65.8k
                    continue;
316
65.8k
                }
317
72.6k
            } catch (std::exception&) {
318
2.04k
                continue;
319
2.04k
            }
320
4.73k
            auto offset = entry.getOffset();
321
4.73k
            auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
322
4.73k
            if (size > max_size || (size == max_size && offset > max_offset)) {
323
4.70k
                max_offset = offset;
324
4.70k
                setTrailer(oh.getDict());
325
4.70k
            }
326
4.73k
            check_warnings();
327
4.73k
        }
328
15.7k
        if (max_offset > 0) {
329
4.47k
            try {
330
4.47k
                read_xref(max_offset, true);
331
4.47k
            } catch (std::exception&) {
332
2.58k
                warn(damagedPDF(
333
2.58k
                    "", -1, "error decoding candidate xref stream while recovering damaged file"));
334
2.58k
            }
335
4.47k
            QTC::TC("qpdf", "QPDF recover xref stream");
336
4.44k
        }
337
15.7k
    }
338
339
19.2k
    if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
340
        // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
341
15.9k
        QPDFObjectHandle root;
342
182k
        for (auto const& iter: m->obj_cache) {
343
182k
            try {
344
182k
                if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
345
5.75k
                    root = iter.second.object;
346
5.75k
                }
347
182k
            } catch (std::exception&) {
348
3.22k
                continue;
349
3.22k
            }
350
182k
        }
351
15.9k
        if (root) {
352
5.53k
            if (!m->trailer) {
353
4.59k
                warn(damagedPDF(
354
4.59k
                    "", -1, "unable to find trailer dictionary while recovering damaged file"));
355
4.59k
                m->trailer = QPDFObjectHandle::newDictionary();
356
4.59k
            }
357
5.53k
            m->trailer.replaceKey("/Root", root);
358
5.53k
        }
359
15.9k
    }
360
361
19.2k
    if (!m->trailer) {
362
        // We could check the last encountered object to see if it was an xref stream.  If so, we
363
        // could try to get the trailer from there.  This may make it possible to recover files with
364
        // bad startxref pointers even when they have object streams.
365
366
6.65k
        throw damagedPDF("", -1, "unable to find trailer dictionary while recovering damaged file");
367
6.65k
    }
368
12.6k
    if (m->xref_table.empty()) {
369
        // We cannot check for an empty xref table in parse because empty tables are valid when
370
        // creating QPDF objects from JSON.
371
198
        throw damagedPDF("", -1, "unable to find objects while recovering damaged file");
372
198
    }
373
12.4k
    check_warnings();
374
12.4k
    if (!m->parsed) {
375
12.1k
        m->parsed = true;
376
12.1k
        getAllPages();
377
12.1k
        check_warnings();
378
12.1k
        if (m->all_pages.empty()) {
379
495
            m->parsed = false;
380
495
            throw damagedPDF("", -1, "unable to find any pages while recovering damaged file");
381
495
        }
382
12.1k
    }
383
384
    // We could iterate through the objects looking for streams and try to find objects inside of
385
    // them, but it's probably not worth the trouble.  Acrobat can't recover files with any errors
386
    // in an xref stream, and this would be a real long shot anyway.  If we wanted to do anything
387
    // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
388
    // It's safe to call it more than once.
389
12.4k
}
390
391
void
392
QPDF::read_xref(qpdf_offset_t xref_offset, bool in_stream_recovery)
393
10.1k
{
394
10.1k
    std::map<int, int> free_table;
395
10.1k
    std::set<qpdf_offset_t> visited;
396
20.8k
    while (xref_offset) {
397
10.8k
        visited.insert(xref_offset);
398
10.8k
        char buf[7];
399
10.8k
        memset(buf, 0, sizeof(buf));
400
10.8k
        m->file->seek(xref_offset, SEEK_SET);
401
        // Some files miss the mark a little with startxref. We could do a better job of searching
402
        // in the neighborhood for something that looks like either an xref table or stream, but the
403
        // simple heuristic of skipping whitespace can help with the xref table case and is harmless
404
        // with the stream case.
405
10.8k
        bool done = false;
406
10.8k
        bool skipped_space = false;
407
26.4k
        while (!done) {
408
15.5k
            char ch;
409
15.5k
            if (1 == m->file->read(&ch, 1)) {
410
14.9k
                if (util::is_space(ch)) {
411
4.93k
                    skipped_space = true;
412
10.0k
                } else {
413
10.0k
                    m->file->unreadCh(ch);
414
10.0k
                    done = true;
415
10.0k
                }
416
14.9k
            } else {
417
591
                QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
418
591
                done = true;
419
591
            }
420
15.5k
        }
421
422
10.8k
        m->file->read(buf, sizeof(buf) - 1);
423
        // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
424
        // where it is terminated by arbitrary whitespace.
425
10.8k
        if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
426
2.28k
            if (skipped_space) {
427
160
                QTC::TC("qpdf", "QPDF xref skipped space");
428
160
                warn(damagedPDF("", -1, "extraneous whitespace seen before xref"));
429
160
            }
430
2.28k
            QTC::TC(
431
2.28k
                "qpdf",
432
2.28k
                "QPDF xref space",
433
2.28k
                ((buf[4] == '\n')       ? 0
434
2.28k
                     : (buf[4] == '\r') ? 1
435
1.61k
                     : (buf[4] == ' ')  ? 2
436
386
                                        : 9999));
437
2.28k
            int skip = 4;
438
            // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
439
4.87k
            while (util::is_space(buf[skip])) {
440
2.58k
                ++skip;
441
2.58k
            }
442
2.28k
            xref_offset = read_xrefTable(xref_offset + skip);
443
8.60k
        } else {
444
8.60k
            xref_offset = read_xrefStream(xref_offset, in_stream_recovery);
445
8.60k
        }
446
10.8k
        if (visited.contains(xref_offset)) {
447
209
            QTC::TC("qpdf", "QPDF xref loop");
448
209
            throw damagedPDF("", -1, "loop detected following xref tables");
449
209
        }
450
10.8k
    }
451
452
9.93k
    if (!m->trailer) {
453
0
        throw damagedPDF("", -1, "unable to find trailer while reading xref");
454
0
    }
455
9.93k
    int size = m->trailer.getKey("/Size").getIntValueAsInt();
456
9.93k
    int max_obj = 0;
457
9.93k
    if (!m->xref_table.empty()) {
458
2.23k
        max_obj = m->xref_table.rbegin()->first.getObj();
459
2.23k
    }
460
9.93k
    if (!m->deleted_objects.empty()) {
461
941
        max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
462
941
    }
463
9.93k
    if ((size < 1) || (size - 1 != max_obj)) {
464
2.21k
        QTC::TC("qpdf", "QPDF xref size mismatch");
465
2.21k
        warn(damagedPDF(
466
2.21k
            "",
467
2.21k
            -1,
468
2.21k
            ("reported number of objects (" + std::to_string(size) +
469
2.21k
             ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
470
2.21k
    }
471
472
    // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
473
    // never depend on its being set.
474
9.93k
    m->deleted_objects.clear();
475
476
    // Make sure we keep only the highest generation for any object.
477
9.93k
    QPDFObjGen last_og{-1, 0};
478
337k
    for (auto const& item: m->xref_table) {
479
337k
        auto id = item.first.getObj();
480
337k
        if (id == last_og.getObj() && id > 0) {
481
547
            removeObject(last_og);
482
547
        }
483
337k
        last_og = item.first;
484
337k
    }
485
9.93k
}
486
487
bool
488
QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
489
7.05k
{
490
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
491
    // buffer.
492
7.05k
    char const* p = line.c_str();
493
7.05k
    char const* start = line.c_str();
494
495
    // Skip zero or more spaces
496
8.16k
    while (util::is_space(*p)) {
497
1.11k
        ++p;
498
1.11k
    }
499
    // Require digit
500
7.05k
    if (!util::is_digit(*p)) {
501
257
        return false;
502
257
    }
503
    // Gather digits
504
6.79k
    std::string obj_str;
505
25.9k
    while (util::is_digit(*p)) {
506
19.1k
        obj_str.append(1, *p++);
507
19.1k
    }
508
    // Require space
509
6.79k
    if (!util::is_space(*p)) {
510
123
        return false;
511
123
    }
512
    // Skip spaces
513
18.4k
    while (util::is_space(*p)) {
514
11.7k
        ++p;
515
11.7k
    }
516
    // Require digit
517
6.67k
    if (!util::is_digit(*p)) {
518
109
        return false;
519
109
    }
520
    // Gather digits
521
6.56k
    std::string num_str;
522
20.4k
    while (util::is_digit(*p)) {
523
13.8k
        num_str.append(1, *p++);
524
13.8k
    }
525
    // Skip any space including line terminators
526
17.3k
    while (util::is_space(*p)) {
527
10.7k
        ++p;
528
10.7k
    }
529
6.56k
    bytes = toI(p - start);
530
6.56k
    obj = QUtil::string_to_int(obj_str.c_str());
531
6.56k
    num = QUtil::string_to_int(num_str.c_str());
532
6.56k
    return true;
533
6.67k
}
534
535
bool
536
QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
537
5.80k
{
538
    // Reposition after initial read attempt and reread.
539
5.80k
    m->file->seek(m->file->getLastOffset(), SEEK_SET);
540
5.80k
    auto line = m->file->readLine(30);
541
542
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
543
    // buffer.
544
5.80k
    char const* p = line.data();
545
546
    // Skip zero or more spaces. There aren't supposed to be any.
547
5.80k
    bool invalid = false;
548
13.5k
    while (util::is_space(*p)) {
549
7.74k
        ++p;
550
7.74k
        QTC::TC("qpdf", "QPDF ignore first space in xref entry");
551
7.74k
        invalid = true;
552
7.74k
    }
553
    // Require digit
554
5.80k
    if (!util::is_digit(*p)) {
555
23
        return false;
556
23
    }
557
    // Gather digits
558
5.78k
    std::string f1_str;
559
23.7k
    while (util::is_digit(*p)) {
560
17.9k
        f1_str.append(1, *p++);
561
17.9k
    }
562
    // Require space
563
5.78k
    if (!util::is_space(*p)) {
564
26
        return false;
565
26
    }
566
5.75k
    if (util::is_space(*(p + 1))) {
567
1.28k
        QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
568
1.28k
        invalid = true;
569
1.28k
    }
570
    // Skip spaces
571
15.9k
    while (util::is_space(*p)) {
572
10.1k
        ++p;
573
10.1k
    }
574
    // Require digit
575
5.75k
    if (!util::is_digit(*p)) {
576
45
        return false;
577
45
    }
578
    // Gather digits
579
5.71k
    std::string f2_str;
580
18.5k
    while (util::is_digit(*p)) {
581
12.8k
        f2_str.append(1, *p++);
582
12.8k
    }
583
    // Require space
584
5.71k
    if (!util::is_space(*p)) {
585
68
        return false;
586
68
    }
587
5.64k
    if (util::is_space(*(p + 1))) {
588
2.13k
        QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
589
2.13k
        invalid = true;
590
2.13k
    }
591
    // Skip spaces
592
15.2k
    while (util::is_space(*p)) {
593
9.65k
        ++p;
594
9.65k
    }
595
5.64k
    if ((*p == 'f') || (*p == 'n')) {
596
5.55k
        type = *p;
597
5.55k
    } else {
598
92
        return false;
599
92
    }
600
5.55k
    if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
601
5.40k
        QTC::TC("qpdf", "QPDF ignore length error xref entry");
602
5.40k
        invalid = true;
603
5.40k
    }
604
605
5.55k
    if (invalid) {
606
5.40k
        warn(damagedPDF("xref table", "accepting invalid xref table entry"));
607
5.40k
    }
608
609
5.55k
    f1 = QUtil::string_to_ll(f1_str.c_str());
610
5.55k
    f2 = QUtil::string_to_int(f2_str.c_str());
611
612
5.55k
    return true;
613
5.64k
}
614
615
// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
616
// result.
617
bool
618
QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
619
21.0k
{
620
21.0k
    std::array<char, 21> line;
621
21.0k
    if (m->file->read(line.data(), 20) != 20) {
622
        // C++20: [[unlikely]]
623
214
        return false;
624
214
    }
625
20.8k
    line[20] = '\0';
626
20.8k
    char const* p = line.data();
627
628
20.8k
    int f1_len = 0;
629
20.8k
    int f2_len = 0;
630
631
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
632
    // buffer.
633
634
    // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
635
112k
    while (*p == '0') {
636
91.5k
        ++f1_len;
637
91.5k
        ++p;
638
91.5k
    }
639
90.4k
    while (util::is_digit(*p) && f1_len++ < 10) {
640
69.5k
        f1 *= 10;
641
69.5k
        f1 += *p++ - '0';
642
69.5k
    }
643
    // Require space
644
20.8k
    if (!util::is_space(*p++)) {
645
        // Entry doesn't start with space or digit.
646
        // C++20: [[unlikely]]
647
111
        return false;
648
111
    }
649
    // Gather digits. NB No risk of overflow as 99'999 < max int.
650
86.8k
    while (*p == '0') {
651
66.0k
        ++f2_len;
652
66.0k
        ++p;
653
66.0k
    }
654
38.9k
    while (util::is_digit(*p) && f2_len++ < 5) {
655
18.2k
        f2 *= 10;
656
18.2k
        f2 += static_cast<int>(*p++ - '0');
657
18.2k
    }
658
20.7k
    if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
659
        // C++20: [[likely]]
660
17.6k
        type = *p;
661
        // No test for valid line[19].
662
17.6k
        if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
663
            // C++20: [[likely]]
664
14.9k
            return true;
665
14.9k
        }
666
17.6k
    }
667
5.80k
    return read_bad_xrefEntry(f1, f2, type);
668
20.7k
}
669
670
// Read a single cross-reference table section and associated trailer.
671
qpdf_offset_t
672
QPDF::read_xrefTable(qpdf_offset_t xref_offset)
673
2.28k
{
674
2.28k
    m->file->seek(xref_offset, SEEK_SET);
675
2.28k
    std::string line;
676
7.07k
    while (true) {
677
7.05k
        line.assign(50, '\0');
678
7.05k
        m->file->read(line.data(), line.size());
679
7.05k
        int obj = 0;
680
7.05k
        int num = 0;
681
7.05k
        int bytes = 0;
682
7.05k
        if (!parse_xrefFirst(line, obj, num, bytes)) {
683
489
            QTC::TC("qpdf", "QPDF invalid xref");
684
489
            throw damagedPDF("xref table", "xref syntax invalid");
685
489
        }
686
6.56k
        m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
687
27.0k
        for (qpdf_offset_t i = obj; i - num < obj; ++i) {
688
21.0k
            if (i == 0) {
689
                // This is needed by checkLinearization()
690
382
                m->first_xref_item_offset = m->file->tell();
691
382
            }
692
            // For xref_table, these will always be small enough to be ints
693
21.0k
            qpdf_offset_t f1 = 0;
694
21.0k
            int f2 = 0;
695
21.0k
            char type = '\0';
696
21.0k
            if (!read_xrefEntry(f1, f2, type)) {
697
579
                QTC::TC("qpdf", "QPDF invalid xref entry");
698
579
                throw damagedPDF(
699
579
                    "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
700
579
            }
701
20.4k
            if (type == 'f') {
702
3.97k
                insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
703
16.5k
            } else {
704
16.5k
                insertXrefEntry(toI(i), 1, f1, f2);
705
16.5k
            }
706
20.4k
        }
707
5.98k
        qpdf_offset_t pos = m->file->tell();
708
5.98k
        if (readToken(*m->file).isWord("trailer")) {
709
1.19k
            break;
710
4.79k
        } else {
711
4.79k
            m->file->seek(pos, SEEK_SET);
712
4.79k
        }
713
5.98k
    }
714
715
    // Set offset to previous xref table if any
716
1.21k
    QPDFObjectHandle cur_trailer = readTrailer();
717
1.21k
    if (!cur_trailer.isDictionary()) {
718
88
        QTC::TC("qpdf", "QPDF missing trailer");
719
88
        throw damagedPDF("", "expected trailer dictionary");
720
88
    }
721
722
1.12k
    if (!m->trailer) {
723
1.03k
        setTrailer(cur_trailer);
724
725
1.03k
        if (!m->trailer.hasKey("/Size")) {
726
88
            QTC::TC("qpdf", "QPDF trailer lacks size");
727
88
            throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
728
88
        }
729
944
        if (!m->trailer.getKey("/Size").isInteger()) {
730
4
            QTC::TC("qpdf", "QPDF trailer size not integer");
731
4
            throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
732
4
        }
733
944
    }
734
735
1.03k
    if (cur_trailer.hasKey("/XRefStm")) {
736
30
        if (m->ignore_xref_streams) {
737
0
            QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
738
30
        } else {
739
30
            if (cur_trailer.getKey("/XRefStm").isInteger()) {
740
                // Read the xref stream but disregard any return value -- we'll use our trailer's
741
                // /Prev key instead of the xref stream's.
742
29
                (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
743
29
            } else {
744
1
                throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
745
1
            }
746
30
        }
747
30
    }
748
749
1.03k
    if (cur_trailer.hasKey("/Prev")) {
750
156
        if (!cur_trailer.getKey("/Prev").isInteger()) {
751
2
            QTC::TC("qpdf", "QPDF trailer prev not integer");
752
2
            throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
753
2
        }
754
154
        QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
755
154
        return cur_trailer.getKey("/Prev").getIntValue();
756
156
    }
757
758
879
    return 0;
759
1.03k
}
760
761
// Read a single cross-reference stream.
762
qpdf_offset_t
763
QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
764
8.33k
{
765
8.33k
    if (!m->ignore_xref_streams) {
766
8.33k
        QPDFObjectHandle xref_obj;
767
8.33k
        try {
768
8.33k
            m->in_read_xref_stream = true;
769
8.33k
            xref_obj = readObjectAtOffset(xref_offset, "xref stream", true);
770
8.33k
        } catch (QPDFExc&) {
771
            // ignore -- report error below
772
1.54k
        }
773
8.33k
        m->in_read_xref_stream = false;
774
8.28k
        if (xref_obj.isStreamOfType("/XRef")) {
775
6.11k
            QTC::TC("qpdf", "QPDF found xref stream");
776
6.11k
            return processXRefStream(xref_offset, xref_obj, in_stream_recovery);
777
6.11k
        }
778
8.28k
    }
779
780
2.17k
    QTC::TC("qpdf", "QPDF can't find xref");
781
2.17k
    throw damagedPDF("", xref_offset, "xref not found");
782
0
    return 0; // unreachable
783
8.33k
}
784
785
// Return the entry size of the xref stream and the processed W array.
786
std::pair<int, std::array<int, 3>>
787
QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
788
6.11k
{
789
6.11k
    auto W_obj = dict.getKey("/W");
790
6.11k
    if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
791
6.11k
          W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
792
195
        throw damaged("Cross-reference stream does not have a proper /W key");
793
195
    }
794
795
5.92k
    std::array<int, 3> W;
796
5.92k
    int entry_size = 0;
797
5.92k
    auto w_vector = W_obj.getArrayAsVector();
798
5.92k
    int max_bytes = sizeof(qpdf_offset_t);
799
23.5k
    for (size_t i = 0; i < 3; ++i) {
800
17.6k
        W[i] = w_vector[i].getIntValueAsInt();
801
17.6k
        if (W[i] > max_bytes) {
802
20
            throw damaged("Cross-reference stream's /W contains impossibly large values");
803
20
        }
804
17.6k
        if (W[i] < 0) {
805
55
            throw damaged("Cross-reference stream's /W contains negative values");
806
55
        }
807
17.5k
        entry_size += W[i];
808
17.5k
    }
809
5.84k
    if (entry_size == 0) {
810
5
        throw damaged("Cross-reference stream's /W indicates entry size of 0");
811
5
    }
812
5.84k
    return {entry_size, W};
813
5.84k
}
814
815
// Validate Size key and return the maximum number of entries that the xref stream can contain.
816
int
817
QPDF::processXRefSize(
818
    QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
819
5.84k
{
820
    // Number of entries is limited by the highest possible object id and stream size.
821
5.84k
    auto max_num_entries = std::numeric_limits<int>::max();
822
5.84k
    if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
823
0
        max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
824
0
    }
825
826
5.84k
    auto Size_obj = dict.getKey("/Size");
827
5.84k
    long long size;
828
5.84k
    if (!dict.getKey("/Size").getValueAsInt(size)) {
829
65
        throw damaged("Cross-reference stream does not have a proper /Size key");
830
5.77k
    } else if (size < 0) {
831
67
        throw damaged("Cross-reference stream has a negative /Size key");
832
5.71k
    } else if (size >= max_num_entries) {
833
67
        throw damaged("Cross-reference stream has an impossibly large /Size key");
834
67
    }
835
    // We are not validating that Size <= (Size key of parent xref / trailer).
836
5.64k
    return max_num_entries;
837
5.84k
}
838
839
// Return the number of entries of the xref stream and the processed Index array.
840
std::pair<int, std::vector<std::pair<int, int>>>
841
QPDF::processXRefIndex(
842
    QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
843
5.64k
{
844
5.64k
    auto size = dict.getKey("/Size").getIntValueAsInt();
845
5.64k
    auto Index_obj = dict.getKey("/Index");
846
847
5.64k
    if (Index_obj.isArray()) {
848
902
        std::vector<std::pair<int, int>> indx;
849
902
        int num_entries = 0;
850
902
        auto index_vec = Index_obj.getArrayAsVector();
851
902
        if ((index_vec.size() % 2) || index_vec.size() < 2) {
852
12
            throw damaged("Cross-reference stream's /Index has an invalid number of values");
853
12
        }
854
855
890
        int i = 0;
856
890
        long long first = 0;
857
5.15k
        for (auto& val: index_vec) {
858
5.15k
            if (val.isInteger()) {
859
5.13k
                if (i % 2) {
860
2.50k
                    auto count = val.getIntValue();
861
2.50k
                    if (count <= 0) {
862
53
                        throw damaged(
863
53
                            "Cross-reference stream section claims to contain " +
864
53
                            std::to_string(count) + " entries");
865
53
                    }
866
                    // We are guarding against the possibility of num_entries * entry_size
867
                    // overflowing. We are not checking that entries are in ascending order as
868
                    // required by the spec, which probably should generate a warning. We are also
869
                    // not checking that for each subsection first object number + number of entries
870
                    // <= /Size. The spec requires us to ignore object number > /Size.
871
2.45k
                    if (first > (max_num_entries - count) ||
872
2.45k
                        count > (max_num_entries - num_entries)) {
873
64
                        throw damaged(
874
64
                            "Cross-reference stream claims to contain too many entries: " +
875
64
                            std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
876
64
                            std::to_string(num_entries));
877
64
                    }
878
2.39k
                    indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
879
2.39k
                    num_entries += static_cast<int>(count);
880
2.62k
                } else {
881
2.62k
                    first = val.getIntValue();
882
2.62k
                    if (first < 0) {
883
29
                        throw damaged(
884
29
                            "Cross-reference stream's /Index contains a negative object id");
885
2.59k
                    } else if (first > max_num_entries) {
886
74
                        throw damaged(
887
74
                            "Cross-reference stream's /Index contains an impossibly "
888
74
                            "large object id");
889
74
                    }
890
2.62k
                }
891
5.13k
            } else {
892
23
                throw damaged(
893
23
                    "Cross-reference stream's /Index's item " + std::to_string(i) +
894
23
                    " is not an integer");
895
23
            }
896
4.91k
            i++;
897
4.91k
        }
898
647
        QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
899
647
        return {num_entries, indx};
900
4.74k
    } else if (Index_obj.isNull()) {
901
4.73k
        QTC::TC("qpdf", "QPDF xref /Index is null");
902
4.73k
        return {size, {{0, size}}};
903
4.73k
    } else {
904
3
        throw damaged("Cross-reference stream does not have a proper /Index key");
905
3
    }
906
5.64k
}
907
908
qpdf_offset_t
909
QPDF::processXRefStream(
910
    qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj, bool in_stream_recovery)
911
6.11k
{
912
6.11k
    auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
913
4.08k
        return damagedPDF("xref stream", xref_offset, msg.data());
914
4.08k
    };
915
916
6.11k
    auto dict = xref_obj.getDict();
917
918
6.11k
    auto [entry_size, W] = processXRefW(dict, damaged);
919
6.11k
    int max_num_entries = processXRefSize(dict, entry_size, damaged);
920
6.11k
    auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);
921
922
6.11k
    std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
923
6.11k
    size_t actual_size = bp->getSize();
924
6.11k
    auto expected_size = toS(entry_size) * toS(num_entries);
925
926
6.11k
    if (expected_size != actual_size) {
927
3.35k
        QPDFExc x = damaged(
928
3.35k
            "Cross-reference stream data has the wrong size; expected = " +
929
3.35k
            std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
930
3.35k
        if (expected_size > actual_size) {
931
545
            throw x;
932
2.81k
        } else {
933
2.81k
            warn(x);
934
2.81k
        }
935
3.35k
    }
936
937
5.57k
    bool saw_first_compressed_object = false;
938
939
    // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
940
    // We know that entry_size * num_entries is less or equal to the size of the buffer.
941
5.57k
    auto p = bp->getBuffer();
942
5.57k
    for (auto [obj, sec_entries]: indx) {
943
        // Process a subsection.
944
1.02M
        for (int i = 0; i < sec_entries; ++i) {
945
            // Read this entry
946
1.01M
            std::array<qpdf_offset_t, 3> fields{};
947
1.01M
            if (W[0] == 0) {
948
114k
                QTC::TC("qpdf", "QPDF default for xref stream field 0");
949
114k
                fields[0] = 1;
950
114k
            }
951
4.07M
            for (size_t j = 0; j < 3; ++j) {
952
7.14M
                for (int k = 0; k < W[j]; ++k) {
953
4.09M
                    fields[j] <<= 8;
954
4.09M
                    fields[j] |= *p++;
955
4.09M
                }
956
3.05M
            }
957
958
            // Get the generation number.  The generation number is 0 unless this is an uncompressed
959
            // object record, in which case the generation number appears as the third field.
960
1.01M
            if (saw_first_compressed_object) {
961
846k
                if (fields[0] != 2) {
962
424k
                    m->uncompressed_after_compressed = true;
963
424k
                }
964
846k
            } else if (fields[0] == 2) {
965
2.17k
                saw_first_compressed_object = true;
966
2.17k
            }
967
1.01M
            if (obj == 0) {
968
                // This is needed by checkLinearization()
969
3.01k
                m->first_xref_item_offset = xref_offset;
970
1.01M
            } else if (fields[0] == 0) {
971
                // Ignore fields[2], which we don't care about in this case. This works around the
972
                // issue of some PDF files that put invalid values, like -1, here for deleted
973
                // objects.
974
191k
                insertFreeXrefEntry(QPDFObjGen(obj, 0));
975
823k
            } else {
976
823k
                auto typ = toI(fields[0]);
977
823k
                if (!in_stream_recovery || typ == 2) {
978
                    // If we are in xref stream recovery all actual uncompressed objects have
979
                    // already been inserted into the xref table. Avoid adding junk data into the
980
                    // xref table.
981
644k
                    insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
982
644k
                }
983
823k
            }
984
1.01M
            ++obj;
985
1.01M
        }
986
3.90k
    }
987
988
5.57k
    if (!m->trailer) {
989
630
        setTrailer(dict);
990
630
    }
991
992
5.57k
    if (dict.hasKey("/Prev")) {
993
894
        if (!dict.getKey("/Prev").isInteger()) {
994
85
            throw damagedPDF(
995
85
                "xref stream", "/Prev key in xref stream dictionary is not an integer");
996
85
        }
997
809
        QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
998
809
        return dict.getKey("/Prev").getIntValue();
999
4.68k
    } else {
1000
4.68k
        return 0;
1001
4.68k
    }
1002
5.57k
}
1003
1004
void
1005
QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
1006
840k
{
1007
    // Populate the xref table in such a way that the first reference to an object that we see,
1008
    // which is the one in the latest xref table in which it appears, is the one that gets stored.
1009
    // This works because we are reading more recent appends before older ones.
1010
1011
    // If there is already an entry for this object and generation in the table, it means that a
1012
    // later xref table has registered this object.  Disregard this one.
1013
840k
    int new_gen = f0 == 2 ? 0 : f2;
1014
1015
840k
    if (!(f0 == 1 || f0 == 2)) {
1016
28.2k
        return;
1017
28.2k
    }
1018
1019
811k
    if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
1020
        // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
1021
        // is probably no point having another warning but we could count invalid items in order to
1022
        // decide when to give up.
1023
237k
        QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
1024
        // ignore impossibly large object ids or object ids > Size.
1025
237k
        return;
1026
237k
    }
1027
1028
574k
    if (m->deleted_objects.contains(obj)) {
1029
1.12k
        QTC::TC("qpdf", "QPDF xref deleted object");
1030
1.12k
        return;
1031
1.12k
    }
1032
1033
573k
    if (f0 == 2 && static_cast<int>(f1) == obj) {
1034
1.01k
        warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
1035
1.01k
        return;
1036
1.01k
    }
1037
1038
572k
    auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1039
572k
    if (!created) {
1040
68.6k
        QTC::TC("qpdf", "QPDF xref reused object");
1041
68.6k
        return;
1042
68.6k
    }
1043
1044
503k
    switch (f0) {
1045
169k
    case 1:
1046
        // f2 is generation
1047
169k
        QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1048
169k
        iter->second = QPDFXRefEntry(f1);
1049
169k
        break;
1050
1051
333k
    case 2:
1052
333k
        iter->second = QPDFXRefEntry(toI(f1), f2);
1053
333k
        break;
1054
1055
0
    default:
1056
0
        throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
1057
0
        break;
1058
503k
    }
1059
503k
}
1060
1061
void
1062
QPDF::insertFreeXrefEntry(QPDFObjGen og)
1063
195k
{
1064
195k
    if (!m->xref_table.contains(og)) {
1065
194k
        m->deleted_objects.insert(og.getObj());
1066
194k
    }
1067
195k
}
1068
1069
void
1070
QPDF::showXRefTable()
1071
0
{
1072
0
    auto& cout = *m->log->getInfo();
1073
0
    for (auto const& iter: m->xref_table) {
1074
0
        QPDFObjGen const& og = iter.first;
1075
0
        QPDFXRefEntry const& entry = iter.second;
1076
0
        cout << og.unparse('/') << ": ";
1077
0
        switch (entry.getType()) {
1078
0
        case 1:
1079
0
            cout << "uncompressed; offset = " << entry.getOffset();
1080
0
            break;
1081
1082
0
        case 2:
1083
0
            *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1084
0
                               << ", index = " << entry.getObjStreamIndex();
1085
0
            break;
1086
1087
0
        default:
1088
0
            throw std::logic_error("unknown cross-reference table type while showing xref_table");
1089
0
            break;
1090
0
        }
1091
0
        m->log->info("\n");
1092
0
    }
1093
0
}
1094
1095
// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1096
// return false. Otherwise return true.
1097
bool
1098
QPDF::resolveXRefTable()
1099
8.46k
{
1100
8.46k
    bool may_change = !m->reconstructed_xref;
1101
286k
    for (auto& iter: m->xref_table) {
1102
286k
        if (isUnresolved(iter.first)) {
1103
181k
            resolve(iter.first);
1104
181k
            if (may_change && m->reconstructed_xref) {
1105
41
                return false;
1106
41
            }
1107
181k
        }
1108
286k
    }
1109
8.42k
    return true;
1110
8.46k
}
1111
1112
// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1113
// cache.
1114
void
1115
QPDF::fixDanglingReferences(bool force)
1116
17.9k
{
1117
17.9k
    if (m->fixed_dangling_refs) {
1118
9.53k
        return;
1119
9.53k
    }
1120
8.42k
    if (!resolveXRefTable()) {
1121
41
        QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
1122
41
        resolveXRefTable();
1123
41
    }
1124
8.42k
    m->fixed_dangling_refs = true;
1125
8.42k
}
1126
1127
size_t
1128
QPDF::getObjectCount()
1129
9.93k
{
1130
    // This method returns the next available indirect object number. makeIndirectObject uses it for
1131
    // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1132
    // be in obj_cache.
1133
9.93k
    fixDanglingReferences();
1134
9.93k
    QPDFObjGen og;
1135
9.93k
    if (!m->obj_cache.empty()) {
1136
9.34k
        og = (*(m->obj_cache.rbegin())).first;
1137
9.34k
    }
1138
9.93k
    return toS(og.getObj());
1139
9.93k
}
1140
1141
std::vector<QPDFObjectHandle>
1142
QPDF::getAllObjects()
1143
0
{
1144
    // After fixDanglingReferences is called, all objects are in the object cache.
1145
0
    fixDanglingReferences();
1146
0
    std::vector<QPDFObjectHandle> result;
1147
0
    for (auto const& iter: m->obj_cache) {
1148
0
        result.push_back(newIndirect(iter.first, iter.second.object));
1149
0
    }
1150
0
    return result;
1151
0
}
1152
1153
void
1154
QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen og)
1155
248k
{
1156
248k
    m->last_object_description.clear();
1157
248k
    if (!description.empty()) {
1158
7.74k
        m->last_object_description += description;
1159
7.74k
        if (og.isIndirect()) {
1160
7.74k
            m->last_object_description += ": ";
1161
7.74k
        }
1162
7.74k
    }
1163
248k
    if (og.isIndirect()) {
1164
248k
        m->last_object_description += "object " + og.unparse(' ');
1165
248k
    }
1166
248k
}
1167
1168
QPDFObjectHandle
1169
QPDF::readTrailer()
1170
18.0k
{
1171
18.0k
    qpdf_offset_t offset = m->file->tell();
1172
18.0k
    auto [object, empty] =
1173
18.0k
        QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, *this, m->reconstructed_xref);
1174
18.0k
    if (empty) {
1175
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1176
        // actual PDF files and Adobe Reader appears to ignore them.
1177
67
        warn(damagedPDF("trailer", "empty object treated as null"));
1178
17.9k
    } else if (object.isDictionary() && readToken(*m->file).isWord("stream")) {
1179
221
        warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
1180
221
    }
1181
    // Override last_offset so that it points to the beginning of the object we just read
1182
18.0k
    m->file->setLastOffset(offset);
1183
18.0k
    return object;
1184
18.0k
}
1185
1186
QPDFObjectHandle
1187
QPDF::readObject(std::string const& description, QPDFObjGen og)
1188
124k
{
1189
124k
    setLastObjectDescription(description, og);
1190
124k
    qpdf_offset_t offset = m->file->tell();
1191
1192
124k
    StringDecrypter decrypter{this, og};
1193
124k
    StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
1194
124k
    auto [object, empty] = QPDFParser::parse(
1195
124k
        *m->file,
1196
124k
        m->last_object_description,
1197
124k
        m->tokenizer,
1198
124k
        decrypter_ptr,
1199
124k
        *this,
1200
124k
        m->reconstructed_xref || m->in_read_xref_stream);
1201
124k
    ;
1202
124k
    if (empty) {
1203
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1204
        // actual PDF files and Adobe Reader appears to ignore them.
1205
168
        warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
1206
168
        return object;
1207
168
    }
1208
124k
    auto token = readToken(*m->file);
1209
124k
    if (object.isDictionary() && token.isWord("stream")) {
1210
51.3k
        readStream(object, og, offset);
1211
51.3k
        token = readToken(*m->file);
1212
51.3k
    }
1213
124k
    if (!token.isWord("endobj")) {
1214
36.3k
        QTC::TC("qpdf", "QPDF err expected endobj");
1215
36.3k
        warn(damagedPDF("expected endobj"));
1216
36.3k
    }
1217
124k
    return object;
1218
124k
}
1219
1220
// After reading stream dictionary and stream keyword, read rest of stream.
1221
void
1222
QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1223
51.3k
{
1224
51.3k
    validateStreamLineEnd(object, og, offset);
1225
1226
    // Must get offset before accessing any additional objects since resolving a previously
1227
    // unresolved indirect object will change file position.
1228
51.3k
    qpdf_offset_t stream_offset = m->file->tell();
1229
51.3k
    size_t length = 0;
1230
1231
51.3k
    try {
1232
51.3k
        auto length_obj = object.getKey("/Length");
1233
1234
51.3k
        if (!length_obj.isInteger()) {
1235
16.8k
            if (length_obj.isNull()) {
1236
16.6k
                QTC::TC("qpdf", "QPDF stream without length");
1237
16.6k
                throw damagedPDF(offset, "stream dictionary lacks /Length key");
1238
16.6k
            }
1239
237
            QTC::TC("qpdf", "QPDF stream length not integer");
1240
237
            throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1241
16.8k
        }
1242
1243
34.5k
        length = toS(length_obj.getUIntValue());
1244
        // Seek in two steps to avoid potential integer overflow
1245
34.5k
        m->file->seek(stream_offset, SEEK_SET);
1246
34.5k
        m->file->seek(toO(length), SEEK_CUR);
1247
34.5k
        if (!readToken(*m->file).isWord("endstream")) {
1248
9.22k
            QTC::TC("qpdf", "QPDF missing endstream");
1249
9.22k
            throw damagedPDF("expected endstream");
1250
9.22k
        }
1251
34.5k
    } catch (QPDFExc& e) {
1252
26.9k
        if (m->attempt_recovery) {
1253
26.9k
            warn(e);
1254
26.9k
            length = recoverStreamLength(m->file, og, stream_offset);
1255
26.9k
        } else {
1256
0
            throw;
1257
0
        }
1258
26.9k
    }
1259
49.1k
    object = QPDFObjectHandle(qpdf::Stream(*this, og, object, stream_offset, length));
1260
49.1k
}
1261
1262
void
1263
QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1264
51.3k
{
1265
    // The PDF specification states that the word "stream" should be followed by either a carriage
1266
    // return and a newline or by a newline alone.  It specifically disallowed following it by a
1267
    // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1268
    // CR NL sequence was part of the stream data.  However, some readers, including Adobe reader,
1269
    // accept a carriage return by itself when followed by a non-newline character, so that's what
1270
    // we do here. We have also seen files that have extraneous whitespace between the stream
1271
    // keyword and the newline.
1272
58.3k
    while (true) {
1273
58.3k
        char ch;
1274
58.3k
        if (m->file->read(&ch, 1) == 0) {
1275
            // A premature EOF here will result in some other problem that will get reported at
1276
            // another time.
1277
183
            return;
1278
183
        }
1279
58.1k
        if (ch == '\n') {
1280
            // ready to read stream data
1281
19.8k
            QTC::TC("qpdf", "QPDF stream with NL only");
1282
19.8k
            return;
1283
19.8k
        }
1284
38.3k
        if (ch == '\r') {
1285
            // Read another character
1286
28.4k
            if (m->file->read(&ch, 1) != 0) {
1287
28.4k
                if (ch == '\n') {
1288
                    // Ready to read stream data
1289
27.7k
                    QTC::TC("qpdf", "QPDF stream with CRNL");
1290
27.7k
                } else {
1291
                    // Treat the \r by itself as the whitespace after endstream and start reading
1292
                    // stream data in spite of not having seen a newline.
1293
718
                    QTC::TC("qpdf", "QPDF stream with CR only");
1294
718
                    m->file->unreadCh(ch);
1295
718
                    warn(damagedPDF(
1296
718
                        m->file->tell(), "stream keyword followed by carriage return only"));
1297
718
                }
1298
28.4k
            }
1299
28.4k
            return;
1300
28.4k
        }
1301
9.87k
        if (!util::is_space(ch)) {
1302
2.84k
            QTC::TC("qpdf", "QPDF stream without newline");
1303
2.84k
            m->file->unreadCh(ch);
1304
2.84k
            warn(damagedPDF(
1305
2.84k
                m->file->tell(), "stream keyword not followed by proper line terminator"));
1306
2.84k
            return;
1307
2.84k
        }
1308
7.03k
        warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1309
7.03k
    }
1310
51.3k
}
1311
1312
QPDFObjectHandle
1313
QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
1314
43.2k
{
1315
43.2k
    auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this);
1316
43.2k
    if (empty) {
1317
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1318
        // actual PDF files and Adobe Reader appears to ignore them.
1319
14
        warn(QPDFExc(
1320
14
            qpdf_e_damaged_pdf,
1321
14
            m->file->getName() + " object stream " + std::to_string(stream_id),
1322
14
            +"object " + std::to_string(obj_id) + " 0, offset " +
1323
14
                std::to_string(input.getLastOffset()),
1324
14
            0,
1325
14
            "empty object treated as null"));
1326
14
    }
1327
43.2k
    return object;
1328
43.2k
}
1329
1330
bool
1331
QPDF::findEndstream()
1332
30.3k
{
1333
    // Find endstream or endobj. Position the input at that token.
1334
30.3k
    auto t = readToken(*m->file, 20);
1335
30.3k
    if (t.isWord("endobj") || t.isWord("endstream")) {
1336
24.0k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
1337
24.0k
        return true;
1338
24.0k
    }
1339
6.25k
    return false;
1340
30.3k
}
1341
1342
size_t
1343
QPDF::recoverStreamLength(
1344
    std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
1345
24.8k
{
1346
    // Try to reconstruct stream length by looking for endstream or endobj
1347
24.8k
    warn(damagedPDF(*input, stream_offset, "attempting to recover stream length"));
1348
1349
24.8k
    PatternFinder ef(*this, &QPDF::findEndstream);
1350
24.8k
    size_t length = 0;
1351
24.8k
    if (m->file->findFirst("end", stream_offset, 0, ef)) {
1352
24.0k
        length = toS(m->file->tell() - stream_offset);
1353
        // Reread endstream but, if it was endobj, don't skip that.
1354
24.0k
        QPDFTokenizer::Token t = readToken(*m->file);
1355
24.0k
        if (t.getValue() == "endobj") {
1356
13.8k
            m->file->seek(m->file->getLastOffset(), SEEK_SET);
1357
13.8k
        }
1358
24.0k
    }
1359
1360
24.8k
    if (length) {
1361
23.6k
        auto end = stream_offset + toO(length);
1362
23.6k
        qpdf_offset_t found_offset = 0;
1363
23.6k
        QPDFObjGen found_og;
1364
1365
        // Make sure this is inside this object
1366
638k
        for (auto const& [current_og, entry]: m->xref_table) {
1367
638k
            if (entry.getType() == 1) {
1368
586k
                qpdf_offset_t obj_offset = entry.getOffset();
1369
586k
                if (found_offset < obj_offset && obj_offset < end) {
1370
157k
                    found_offset = obj_offset;
1371
157k
                    found_og = current_og;
1372
157k
                }
1373
586k
            }
1374
638k
        }
1375
23.6k
        if (!found_offset || found_og == og) {
1376
            // If we are trying to recover an XRef stream the xref table will not contain and
1377
            // won't contain any entries, therefore we cannot check the found length. Otherwise we
1378
            // found endstream\nendobj within the space allowed for this object, so we're probably
1379
            // in good shape.
1380
22.3k
        } else {
1381
1.20k
            QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1382
1.20k
            length = 0;
1383
1.20k
        }
1384
23.6k
    }
1385
1386
24.8k
    if (length == 0) {
1387
2.48k
        warn(damagedPDF(
1388
2.48k
            *input, stream_offset, "unable to recover stream data; treating stream as empty"));
1389
22.4k
    } else {
1390
22.4k
        warn(damagedPDF(
1391
22.4k
            *input, stream_offset, "recovered stream length: " + std::to_string(length)));
1392
22.4k
    }
1393
1394
24.8k
    QTC::TC("qpdf", "QPDF recovered stream length");
1395
24.8k
    return length;
1396
24.8k
}
1397
1398
QPDFTokenizer::Token
1399
QPDF::readToken(InputSource& input, size_t max_len)
1400
3.10M
{
1401
3.10M
    return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
1402
3.10M
}
1403
1404
QPDFObjGen
1405
QPDF::read_object_start(qpdf_offset_t offset)
1406
130k
{
1407
130k
    m->file->seek(offset, SEEK_SET);
1408
130k
    QPDFTokenizer::Token tobjid = readToken(*m->file);
1409
130k
    bool objidok = tobjid.isInteger();
1410
130k
    QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1411
130k
    if (!objidok) {
1412
3.94k
        QTC::TC("qpdf", "QPDF expected n n obj");
1413
3.94k
        throw damagedPDF(offset, "expected n n obj");
1414
3.94k
    }
1415
126k
    QPDFTokenizer::Token tgen = readToken(*m->file);
1416
126k
    bool genok = tgen.isInteger();
1417
126k
    QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1418
126k
    if (!genok) {
1419
557
        throw damagedPDF(offset, "expected n n obj");
1420
557
    }
1421
126k
    QPDFTokenizer::Token tobj = readToken(*m->file);
1422
1423
126k
    bool objok = tobj.isWord("obj");
1424
126k
    QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1425
1426
126k
    if (!objok) {
1427
503
        throw damagedPDF(offset, "expected n n obj");
1428
503
    }
1429
125k
    int objid = QUtil::string_to_int(tobjid.getValue().c_str());
1430
125k
    int generation = QUtil::string_to_int(tgen.getValue().c_str());
1431
125k
    if (objid == 0) {
1432
121
        QTC::TC("qpdf", "QPDF object id 0");
1433
121
        throw damagedPDF(offset, "object with ID 0");
1434
121
    }
1435
125k
    return {objid, generation};
1436
125k
}
1437
1438
void
1439
QPDF::readObjectAtOffset(
1440
    bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og)
1441
122k
{
1442
122k
    QPDFObjGen og;
1443
122k
    setLastObjectDescription(description, exp_og);
1444
1445
122k
    if (!m->attempt_recovery) {
1446
0
        try_recovery = false;
1447
0
    }
1448
1449
    // Special case: if offset is 0, just return null.  Some PDF writers, in particular
1450
    // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
1451
    // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
1452
    // these.
1453
122k
    if (offset == 0) {
1454
354
        QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1455
354
        warn(damagedPDF(-1, "object has offset 0"));
1456
354
        return;
1457
354
    }
1458
1459
122k
    try {
1460
122k
        og = read_object_start(offset);
1461
122k
        if (exp_og != og) {
1462
171
            QTC::TC("qpdf", "QPDF err wrong objid/generation");
1463
171
            QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1464
171
            if (try_recovery) {
1465
                // Will be retried below
1466
171
                throw e;
1467
171
            } else {
1468
                // We can try reading the object anyway even if the ID doesn't match.
1469
0
                warn(e);
1470
0
            }
1471
171
        }
1472
122k
    } catch (QPDFExc& e) {
1473
3.89k
        if (!try_recovery) {
1474
0
            throw;
1475
0
        }
1476
        // Try again after reconstructing xref table
1477
3.89k
        reconstruct_xref(e);
1478
3.89k
        if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
1479
71
            qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1480
71
            readObjectAtOffset(false, new_offset, description, exp_og);
1481
71
            QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1482
71
            return;
1483
71
        }
1484
3.82k
        QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1485
3.82k
        warn(damagedPDF(
1486
3.82k
            "",
1487
3.82k
            -1,
1488
3.82k
            ("object " + exp_og.unparse(' ') +
1489
3.82k
             " not found in file after regenerating cross reference table")));
1490
3.82k
        return;
1491
3.89k
    }
1492
1493
117k
    QPDFObjectHandle oh = readObject(description, og);
1494
1495
    // Determine the end offset of this object before and after white space.  We use these
1496
    // numbers to validate linearization hint tables.  Offsets and lengths of objects may imply
1497
    // the end of an object to be anywhere between these values.
1498
117k
    qpdf_offset_t end_before_space = m->file->tell();
1499
1500
    // skip over spaces
1501
261k
    while (true) {
1502
256k
        char ch;
1503
256k
        if (!m->file->read(&ch, 1)) {
1504
3.88k
            throw damagedPDF(m->file->tell(), "EOF after endobj");
1505
3.88k
        }
1506
252k
        if (!isspace(static_cast<unsigned char>(ch))) {
1507
108k
            m->file->seek(-1, SEEK_CUR);
1508
108k
            break;
1509
108k
        }
1510
252k
    }
1511
114k
    updateCache(og, oh.getObj(), end_before_space, m->file->tell());
1512
114k
}
1513
1514
// Read the object that starts at the given file offset and return a handle to it. Unlike the
// overload that takes an expected QPDFObjGen, this one accepts whatever object id/generation is
// found at the offset. The object is stored in the object cache unless it is already resolved
// there, or skip_cache_if_in_xref is set and the xref table already has an entry for it.
//
// Throws a damaged-PDF exception if the end of file is reached while skipping the white space
// that follows the object.
QPDFObjectHandle
QPDF::readObjectAtOffset(
    qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref)
{
    auto const og = read_object_start(offset);
    auto oh = readObject(description, og);

    // Somebody already resolved this object; hand back what we read without touching the cache.
    if (!isUnresolved(og)) {
        return oh;
    }

    if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
        // For the xref stream and the linearization hint tables, the offset comes from another
        // source than the xref table. The xref stream in particular is read and loaded into the
        // object cache very early in parsing. Ordinarily, when a file is updated by appending,
        // entries inserted into the xref table by later updates take precedence over earlier
        // ones. If the object number previously used by the xref stream is reused, the order of
        // events is:
        //
        // * reused object gets loaded into the xref table
        // * old object is read here while reading xref streams
        // * original xref entry is ignored (since already in xref table)
        //
        // The second step is the problem: although the xref table is correct, the old object
        // would already be in the cache and would effectively prevail over the reused object.
        // The skip_cache_if_in_xref flag is the workaround: if the object is already in the xref
        // table, what was read here is not cached.
        //
        // The same bug likely exists for linearization hint tables, but the existing code uses
        // end_before_space and end_after_space from the cache, so fixing that would require more
        // significant rework. A linearization hint stream being reused seems less likely, since
        // the xref stream is probably the highest object in the file while the hint stream sits
        // somewhere in the middle, so that bug is left unfixed for now. If it were fixed,
        // !check_og could be used in place of skip_cache_if_in_xref.
        QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
        return oh;
    }

    // Record the end offset of the object before and after trailing white space. These numbers
    // are used to validate linearization hint tables: offsets and lengths of objects may imply
    // the end of an object to be anywhere between the two values.
    qpdf_offset_t const end_before_space = m->file->tell();

    // Consume white space; the first non-space character is pushed back by seeking one byte
    // backwards. The loop is primed with a space so that at least one character is read.
    char ch = ' ';
    while (isspace(static_cast<unsigned char>(ch))) {
        if (m->file->read(&ch, 1) == 0) {
            throw damagedPDF(m->file->tell(), "EOF after endobj");
        }
    }
    m->file->seek(-1, SEEK_CUR);

    updateCache(og, oh.getObj(), end_before_space, m->file->tell());
    return oh;
}
1574
1575
std::shared_ptr<QPDFObject> const&
1576
QPDF::resolve(QPDFObjGen og)
1577
385k
{
1578
385k
    if (!isUnresolved(og)) {
1579
0
        return m->obj_cache[og].object;
1580
0
    }
1581
1582
385k
    if (m->resolving.contains(og)) {
1583
        // This can happen if an object references itself directly or indirectly in some key that
1584
        // has to be resolved during object parsing, such as stream length.
1585
260
        QTC::TC("qpdf", "QPDF recursion loop in resolve");
1586
260
        warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
1587
260
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1588
260
        return m->obj_cache[og].object;
1589
260
    }
1590
385k
    ResolveRecorder rr(this, og);
1591
1592
385k
    if (m->xref_table.contains(og)) {
1593
315k
        QPDFXRefEntry const& entry = m->xref_table[og];
1594
315k
        try {
1595
315k
            switch (entry.getType()) {
1596
122k
            case 1:
1597
                // Object stored in cache by readObjectAtOffset
1598
122k
                readObjectAtOffset(true, entry.getOffset(), "", og);
1599
122k
                break;
1600
1601
192k
            case 2:
1602
192k
                resolveObjectsInStream(entry.getObjStreamNumber());
1603
192k
                break;
1604
1605
9
            default:
1606
9
                throw damagedPDF(
1607
9
                    "", -1, ("object " + og.unparse('/') + " has unexpected xref entry type"));
1608
315k
            }
1609
315k
        } catch (QPDFExc& e) {
1610
39.8k
            warn(e);
1611
39.8k
        } catch (std::exception& e) {
1612
1.06k
            warn(damagedPDF(
1613
1.06k
                "", -1, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
1614
1.06k
        }
1615
315k
    }
1616
1617
375k
    if (isUnresolved(og)) {
1618
        // PDF spec says unknown objects resolve to the null object.
1619
265k
        QTC::TC("qpdf", "QPDF resolve failure to null");
1620
265k
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1621
265k
    }
1622
1623
375k
    auto& result(m->obj_cache[og].object);
1624
375k
    result->setDefaultDescription(this, og);
1625
375k
    return result;
1626
385k
}
1627
1628
void
1629
QPDF::resolveObjectsInStream(int obj_stream_number)
1630
192k
{
1631
192k
    auto damaged =
1632
192k
        [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
1633
18.8k
        return {
1634
18.8k
            qpdf_e_damaged_pdf,
1635
18.8k
            m->file->getName() + " object stream " + std::to_string(obj_stream_number),
1636
18.8k
            +"object " + std::to_string(id) + " 0",
1637
18.8k
            offset,
1638
18.8k
            msg,
1639
18.8k
            true};
1640
18.8k
    };
1641
1642
192k
    if (m->resolved_object_streams.contains(obj_stream_number)) {
1643
164k
        return;
1644
164k
    }
1645
28.4k
    m->resolved_object_streams.insert(obj_stream_number);
1646
    // Force resolution of object stream
1647
28.4k
    auto obj_stream = getObject(obj_stream_number, 0).as_stream();
1648
28.4k
    if (!obj_stream) {
1649
23.2k
        throw damagedPDF(
1650
23.2k
            "object " + std::to_string(obj_stream_number) + " 0",
1651
23.2k
            "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1652
23.2k
    }
1653
1654
    // For linearization data in the object, use the data from the object stream for the objects in
1655
    // the stream.
1656
5.19k
    QPDFObjGen stream_og(obj_stream_number, 0);
1657
5.19k
    qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1658
5.19k
    qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1659
1660
5.19k
    QPDFObjectHandle dict = obj_stream.getDict();
1661
5.19k
    if (!dict.isDictionaryOfType("/ObjStm")) {
1662
556
        QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
1663
556
        warn(damagedPDF(
1664
556
            "object " + std::to_string(obj_stream_number) + " 0",
1665
556
            "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1666
556
    }
1667
1668
5.19k
    unsigned int n{0};
1669
5.19k
    int first{0};
1670
5.19k
    if (!(dict.getKey("/N").getValueAsUInt(n) && dict.getKey("/First").getValueAsInt(first))) {
1671
202
        throw damagedPDF(
1672
202
            "object " + std::to_string(obj_stream_number) + " 0",
1673
202
            "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1674
202
    }
1675
1676
    // id, offset, size
1677
4.99k
    std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1678
1679
4.99k
    auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
1680
1681
4.99k
    BufferInputSource input("", bp.get());
1682
1683
4.99k
    const auto b_size = bp->getSize();
1684
4.99k
    const auto end_offset = static_cast<qpdf_offset_t>(b_size);
1685
4.99k
    auto b_start = bp->getBuffer();
1686
1687
4.99k
    if (first >= end_offset) {
1688
48
        throw damagedPDF(
1689
48
            "object " + std::to_string(obj_stream_number) + " 0",
1690
48
            "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
1691
48
    }
1692
1693
4.94k
    int id = 0;
1694
4.94k
    long long last_offset = -1;
1695
4.94k
    bool is_first = true;
1696
80.3k
    for (unsigned int i = 0; i < n; ++i) {
1697
75.5k
        auto tnum = readToken(input);
1698
75.5k
        auto id_offset = input.getLastOffset();
1699
75.5k
        auto toffset = readToken(input);
1700
75.5k
        if (!(tnum.isInteger() && toffset.isInteger())) {
1701
115
            throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1702
115
        }
1703
1704
75.4k
        int num = QUtil::string_to_int(tnum.getValue().c_str());
1705
75.4k
        long long offset = QUtil::string_to_int(toffset.getValue().c_str());
1706
1707
75.4k
        if (num == obj_stream_number) {
1708
358
            QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1709
358
            warn(damaged(num, id_offset, "object stream claims to contain itself"));
1710
358
            continue;
1711
358
        }
1712
1713
75.0k
        if (num < 1) {
1714
316
            QTC::TC("qpdf", "QPDF object stream contains id < 1");
1715
316
            warn(damaged(num, id_offset, "object id is invalid"s));
1716
316
            continue;
1717
316
        }
1718
1719
74.7k
        if (offset <= last_offset) {
1720
9.36k
            QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1721
9.36k
            warn(damaged(
1722
9.36k
                num,
1723
9.36k
                input.getLastOffset(),
1724
9.36k
                "offset " + std::to_string(offset) +
1725
9.36k
                    " is invalid (must be larger than previous offset " +
1726
9.36k
                    std::to_string(last_offset) + ")"));
1727
9.36k
            continue;
1728
9.36k
        }
1729
1730
65.3k
        if (num > m->xref_table_max_id) {
1731
2.02k
            continue;
1732
2.02k
        }
1733
1734
63.3k
        if (first + offset >= end_offset) {
1735
8.66k
            warn(damaged(
1736
8.66k
                num, input.getLastOffset(), "offset " + std::to_string(offset) + " is too large"));
1737
8.66k
            continue;
1738
8.66k
        }
1739
1740
54.6k
        if (is_first) {
1741
1.21k
            is_first = false;
1742
53.4k
        } else {
1743
53.4k
            offsets.emplace_back(
1744
53.4k
                id, last_offset + first, static_cast<size_t>(offset - last_offset));
1745
53.4k
        }
1746
1747
54.6k
        last_offset = offset;
1748
54.6k
        id = num;
1749
54.6k
    }
1750
1751
4.82k
    if (!is_first) {
1752
        // We found at least one valid entry.
1753
1.04k
        offsets.emplace_back(
1754
1.04k
            id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1755
1.04k
    }
1756
1757
    // To avoid having to read the object stream multiple times, store all objects that would be
1758
    // found here in the cache.  Remember that some objects stored here might have been overridden
1759
    // by new objects appended to the file, so it is necessary to recheck the xref table and only
1760
    // cache what would actually be resolved here.
1761
46.1k
    for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
1762
46.1k
        QPDFObjGen og(obj_id, 0);
1763
46.1k
        auto entry = m->xref_table.find(og);
1764
46.1k
        if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1765
46.1k
            entry->second.getObjStreamNumber() == obj_stream_number) {
1766
43.3k
            Buffer obj_buffer{b_start + obj_offset, obj_size};
1767
43.3k
            is::OffsetBuffer in("", &obj_buffer, obj_offset);
1768
43.3k
            auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1769
43.3k
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
1770
43.3k
        } else {
1771
2.81k
            QTC::TC("qpdf", "QPDF not caching overridden objstm object");
1772
2.81k
        }
1773
46.1k
    }
1774
4.82k
}
1775
1776
// Wrap obj in an object handle after giving it the default description for the indirect object
// og in this QPDF.
QPDFObjectHandle
QPDF::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
{
    obj->setDefaultDescription(this, og);
    return {obj};
}
1782
1783
void
1784
QPDF::updateCache(
1785
    QPDFObjGen og,
1786
    std::shared_ptr<QPDFObject> const& object,
1787
    qpdf_offset_t end_before_space,
1788
    qpdf_offset_t end_after_space,
1789
    bool destroy)
1790
420k
{
1791
420k
    object->setObjGen(this, og);
1792
420k
    if (isCached(og)) {
1793
239k
        auto& cache = m->obj_cache[og];
1794
239k
        object->move_to(cache.object, destroy);
1795
239k
        cache.end_before_space = end_before_space;
1796
239k
        cache.end_after_space = end_after_space;
1797
239k
    } else {
1798
180k
        m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
1799
180k
    }
1800
420k
}
1801
1802
bool
1803
QPDF::isCached(QPDFObjGen og)
1804
1.47M
{
1805
1.47M
    return m->obj_cache.contains(og);
1806
1.47M
}
1807
1808
bool
1809
QPDF::isUnresolved(QPDFObjGen og)
1810
1.05M
{
1811
1.05M
    return !isCached(og) || m->obj_cache[og].object->isUnresolved();
1812
1.05M
}
1813
1814
// Return the object id/generation to use for a newly created object: one past the current
// object count, with generation 0. Throws std::range_error if the id would not fit in an int.
QPDFObjGen
QPDF::nextObjGen()
{
    int const current_max = toI(getObjectCount());
    if (current_max == std::numeric_limits<int>::max()) {
        throw std::range_error("max object id is too high to create new objects");
    }
    return {current_max + 1, 0};
}
1823
1824
// Place obj in the object cache under the next free object id and return an indirect handle to
// the cached object. The cache entry's end offsets are set to -1.
QPDFObjectHandle
QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
{
    QPDFObjGen const new_og{nextObjGen()};
    auto& slot = m->obj_cache[new_og];
    slot = ObjCache(obj, -1, -1);
    return newIndirect(new_og, slot.object);
}
1831
1832
// Turn the object held by oh into a new indirect object of this QPDF and return a handle to it.
// Throws std::logic_error if oh is uninitialized.
QPDFObjectHandle
QPDF::makeIndirectObject(QPDFObjectHandle oh)
{
    if (oh) {
        return makeIndirectFromQPDFObject(oh.getObj());
    }
    throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
}
1840
1841
std::shared_ptr<QPDFObject>
1842
QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
1843
408k
{
1844
    // This method is called by the parser and therefore must not resolve any objects.
1845
408k
    auto og = QPDFObjGen(id, gen);
1846
408k
    if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
1847
165k
        return iter->second.object;
1848
165k
    }
1849
242k
    if (m->xref_table.contains(og) || (!m->parsed && og.getObj() < m->xref_table_max_id)) {
1850
190k
        return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(this, og)})
1851
190k
            .first->second.object;
1852
190k
    }
1853
52.7k
    if (parse_pdf) {
1854
52.7k
        return QPDFObject::create<QPDF_Null>();
1855
52.7k
    }
1856
0
    return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(this, og)}).first->second.object;
1857
52.7k
}
1858
1859
std::shared_ptr<QPDFObject>
1860
QPDF::getObjectForJSON(int id, int gen)
1861
0
{
1862
0
    auto og = QPDFObjGen(id, gen);
1863
0
    auto [it, inserted] = m->obj_cache.try_emplace(og);
1864
0
    auto& obj = it->second.object;
1865
0
    if (inserted) {
1866
0
        obj = (m->parsed && !m->xref_table.contains(og))
1867
0
            ? QPDFObject::create<QPDF_Null>(this, og)
1868
0
            : QPDFObject::create<QPDF_Unresolved>(this, og);
1869
0
    }
1870
0
    return obj;
1871
0
}
1872
1873
// Return a handle for the indirect object og without reading it from the file.
//
// A cached object is returned directly. If the file has been parsed and the xref table has no
// entry for og, a null object is returned and nothing is cached. Otherwise an unresolved
// placeholder is added to the cache and returned.
QPDFObjectHandle
QPDF::getObject(QPDFObjGen og)
{
    if (auto it = m->obj_cache.find(og); it != m->obj_cache.end()) {
        return {it->second.object};
    }

    if (m->parsed && !m->xref_table.contains(og)) {
        return QPDFObject::create<QPDF_Null>();
    }

    auto placed =
        m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
    return {placed.first->second.object};
}
1886
1887
void
1888
QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
1889
0
{
1890
0
    replaceObject(QPDFObjGen(objid, generation), oh);
1891
0
}
1892
1893
void
1894
QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
1895
0
{
1896
0
    if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
1897
0
        QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
1898
0
        throw std::logic_error("QPDF::replaceObject called with indirect object handle");
1899
0
    }
1900
0
    updateCache(og, oh.getObj(), -1, -1, false);
1901
0
}
1902
1903
void
1904
QPDF::removeObject(QPDFObjGen og)
1905
1.58k
{
1906
1.58k
    m->xref_table.erase(og);
1907
1.58k
    if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
1908
        // Take care of any object handles that may be floating around.
1909
1.47k
        cached->second.object->assign_null();
1910
1.47k
        cached->second.object->setObjGen(nullptr, QPDFObjGen());
1911
1.47k
        m->obj_cache.erase(cached);
1912
1.47k
    }
1913
1.58k
}
1914
1915
void
1916
QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
1917
0
{
1918
0
    QTC::TC("qpdf", "QPDF replaceReserved");
1919
0
    auto tc = reserved.getTypeCode();
1920
0
    if (!(tc == ::ot_reserved || tc == ::ot_null)) {
1921
0
        throw std::logic_error("replaceReserved called with non-reserved object");
1922
0
    }
1923
0
    replaceObject(reserved.getObjGen(), replacement);
1924
0
}
1925
1926
void
1927
QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
1928
0
{
1929
0
    swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
1930
0
}
1931
1932
void
1933
QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
1934
0
{
1935
    // Force objects to be read from the input source if needed, then swap them in the cache.
1936
0
    resolve(og1);
1937
0
    resolve(og2);
1938
0
    m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
1939
0
}
1940
1941
size_t
1942
QPDF::tableSize()
1943
8.07k
{
1944
    // If obj_cache is dense, accommodate all object in tables,else accommodate only original
1945
    // objects.
1946
8.07k
    auto max_xref = !m->xref_table.empty() ? m->xref_table.crbegin()->first.getObj() : 0;
1947
8.07k
    auto max_obj = !m->obj_cache.empty() ? m->obj_cache.crbegin()->first.getObj() : 0;
1948
8.07k
    auto max_id = std::numeric_limits<int>::max() - 1;
1949
8.07k
    if (max_obj >= max_id || max_xref >= max_id) {
1950
        // Temporary fix. Long-term solution is
1951
        // - QPDFObjGen to enforce objgens are valid and sensible
1952
        // - xref table and obj cache to protect against insertion of impossibly large obj ids
1953
1
        stopOnError("Impossibly large object id encountered.");
1954
1
    }
1955
8.07k
    if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
1956
6.10k
        return toS(++max_obj);
1957
6.10k
    }
1958
1.97k
    return toS(++max_xref);
1959
8.07k
}
1960
1961
std::vector<QPDFObjGen>
1962
QPDF::getCompressibleObjVector()
1963
0
{
1964
0
    return getCompressibleObjGens<QPDFObjGen>();
1965
0
}
1966
1967
std::vector<bool>
1968
QPDF::getCompressibleObjSet()
1969
767
{
1970
767
    return getCompressibleObjGens<bool>();
1971
767
}
1972
1973
template <typename T>
1974
std::vector<T>
1975
QPDF::getCompressibleObjGens()
1976
767
{
1977
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
1978
    // by traversing the document from the root, including a traversal of the pages tree.  This
1979
    // makes that objects that are on the same page are more likely to be in the same object stream,
1980
    // which is slightly more efficient, particularly with linearized files.  This is better than
1981
    // iterating through the xref table since it avoids preserving orphaned items.
1982
1983
    // Exclude encryption dictionary, if any
1984
767
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1985
767
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1986
1987
767
    const size_t max_obj = getObjectCount();
1988
767
    std::vector<bool> visited(max_obj, false);
1989
767
    std::vector<QPDFObjectHandle> queue;
1990
767
    queue.reserve(512);
1991
767
    queue.push_back(m->trailer);
1992
767
    std::vector<T> result;
1993
767
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
1994
0
        result.reserve(m->obj_cache.size());
1995
767
    } else if constexpr (std::is_same_v<T, bool>) {
1996
767
        result.resize(max_obj + 1U, false);
1997
    } else {
1998
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1999
    }
2000
2.81M
    while (!queue.empty()) {
2001
2.80M
        auto obj = queue.back();
2002
2.80M
        queue.pop_back();
2003
2.80M
        if (obj.getObjectID() > 0) {
2004
73.3k
            QPDFObjGen og = obj.getObjGen();
2005
73.3k
            const size_t id = toS(og.getObj() - 1);
2006
73.3k
            if (id >= max_obj) {
2007
0
                throw std::logic_error(
2008
0
                    "unexpected object id encountered in getCompressibleObjGens");
2009
0
            }
2010
73.3k
            if (visited[id]) {
2011
26.3k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2012
26.3k
                continue;
2013
26.3k
            }
2014
2015
            // Check whether this is the current object. If not, remove it (which changes it into a
2016
            // direct null and therefore stops us from revisiting it) and move on to the next object
2017
            // in the queue.
2018
46.9k
            auto upper = m->obj_cache.upper_bound(og);
2019
46.9k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2020
1.04k
                removeObject(og);
2021
1.04k
                continue;
2022
1.04k
            }
2023
2024
45.9k
            visited[id] = true;
2025
2026
45.9k
            if (og == encryption_dict_og) {
2027
4
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2028
45.9k
            } else if (!(obj.isStream() ||
2029
45.9k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2030
38.4k
                          obj.hasKey("/Contents")))) {
2031
38.4k
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2032
0
                    result.push_back(og);
2033
38.4k
                } else if constexpr (std::is_same_v<T, bool>) {
2034
38.4k
                    result[id + 1U] = true;
2035
38.4k
                }
2036
38.4k
            }
2037
45.9k
        }
2038
2.78M
        if (obj.isStream()) {
2039
7.48k
            auto dict = obj.getDict().as_dictionary();
2040
7.48k
            auto end = dict.crend();
2041
37.3k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2042
29.9k
                std::string const& key = iter->first;
2043
29.9k
                QPDFObjectHandle const& value = iter->second;
2044
29.9k
                if (!value.null()) {
2045
28.1k
                    if (key == "/Length") {
2046
                        // omit stream lengths
2047
7.24k
                        if (value.isIndirect()) {
2048
126
                            QTC::TC("qpdf", "QPDF exclude indirect length");
2049
126
                        }
2050
20.8k
                    } else {
2051
20.8k
                        queue.emplace_back(value);
2052
20.8k
                    }
2053
28.1k
                }
2054
29.9k
            }
2055
2.77M
        } else if (obj.isDictionary()) {
2056
31.2k
            auto dict = obj.as_dictionary();
2057
31.2k
            auto end = dict.crend();
2058
170k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2059
139k
                if (!iter->second.null()) {
2060
117k
                    queue.emplace_back(iter->second);
2061
117k
                }
2062
139k
            }
2063
2.74M
        } else if (auto items = obj.as_array()) {
2064
2.74M
            queue.insert(queue.end(), items.crbegin(), items.crend());
2065
2.74M
        }
2066
2.78M
    }
2067
2068
767
    return result;
2069
767
}
Unexecuted instantiation: std::__1::vector<QPDFObjGen, std::__1::allocator<QPDFObjGen> > QPDF::getCompressibleObjGens<QPDFObjGen>()
std::__1::vector<bool, std::__1::allocator<bool> > QPDF::getCompressibleObjGens<bool>()
Line
Count
Source
1976
767
{
1977
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
1978
    // by traversing the document from the root, including a traversal of the pages tree.  This
1979
    // makes that objects that are on the same page are more likely to be in the same object stream,
1980
    // which is slightly more efficient, particularly with linearized files.  This is better than
1981
    // iterating through the xref table since it avoids preserving orphaned items.
1982
1983
    // Exclude encryption dictionary, if any
1984
767
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1985
767
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1986
1987
767
    const size_t max_obj = getObjectCount();
1988
767
    std::vector<bool> visited(max_obj, false);
1989
767
    std::vector<QPDFObjectHandle> queue;
1990
767
    queue.reserve(512);
1991
767
    queue.push_back(m->trailer);
1992
767
    std::vector<T> result;
1993
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
1994
        result.reserve(m->obj_cache.size());
1995
767
    } else if constexpr (std::is_same_v<T, bool>) {
1996
767
        result.resize(max_obj + 1U, false);
1997
    } else {
1998
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1999
    }
2000
2.81M
    while (!queue.empty()) {
2001
2.80M
        auto obj = queue.back();
2002
2.80M
        queue.pop_back();
2003
2.80M
        if (obj.getObjectID() > 0) {
2004
73.3k
            QPDFObjGen og = obj.getObjGen();
2005
73.3k
            const size_t id = toS(og.getObj() - 1);
2006
73.3k
            if (id >= max_obj) {
2007
0
                throw std::logic_error(
2008
0
                    "unexpected object id encountered in getCompressibleObjGens");
2009
0
            }
2010
73.3k
            if (visited[id]) {
2011
26.3k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2012
26.3k
                continue;
2013
26.3k
            }
2014
2015
            // Check whether this is the current object. If not, remove it (which changes it into a
2016
            // direct null and therefore stops us from revisiting it) and move on to the next object
2017
            // in the queue.
2018
46.9k
            auto upper = m->obj_cache.upper_bound(og);
2019
46.9k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2020
1.04k
                removeObject(og);
2021
1.04k
                continue;
2022
1.04k
            }
2023
2024
45.9k
            visited[id] = true;
2025
2026
45.9k
            if (og == encryption_dict_og) {
2027
4
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2028
45.9k
            } else if (!(obj.isStream() ||
2029
45.9k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2030
38.4k
                          obj.hasKey("/Contents")))) {
2031
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2032
                    result.push_back(og);
2033
38.4k
                } else if constexpr (std::is_same_v<T, bool>) {
2034
38.4k
                    result[id + 1U] = true;
2035
38.4k
                }
2036
38.4k
            }
2037
45.9k
        }
2038
2.78M
        if (obj.isStream()) {
2039
7.48k
            auto dict = obj.getDict().as_dictionary();
2040
7.48k
            auto end = dict.crend();
2041
37.3k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2042
29.9k
                std::string const& key = iter->first;
2043
29.9k
                QPDFObjectHandle const& value = iter->second;
2044
29.9k
                if (!value.null()) {
2045
28.1k
                    if (key == "/Length") {
2046
                        // omit stream lengths
2047
7.24k
                        if (value.isIndirect()) {
2048
126
                            QTC::TC("qpdf", "QPDF exclude indirect length");
2049
126
                        }
2050
20.8k
                    } else {
2051
20.8k
                        queue.emplace_back(value);
2052
20.8k
                    }
2053
28.1k
                }
2054
29.9k
            }
2055
2.77M
        } else if (obj.isDictionary()) {
2056
31.2k
            auto dict = obj.as_dictionary();
2057
31.2k
            auto end = dict.crend();
2058
170k
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2059
139k
                if (!iter->second.null()) {
2060
117k
                    queue.emplace_back(iter->second);
2061
117k
                }
2062
139k
            }
2063
2.74M
        } else if (auto items = obj.as_array()) {
2064
2.74M
            queue.insert(queue.end(), items.crbegin(), items.crend());
2065
2.74M
        }
2066
2.78M
    }
2067
2068
767
    return result;
2069
767
}