Coverage Report

Created: 2025-10-10 06:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDF_objects.cc
Line
Count
Source
1
#include <qpdf/qpdf-config.h> // include first for large file support
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <qpdf/InputSource_private.hh>
6
#include <qpdf/Pipeline.hh>
7
#include <qpdf/QPDFExc.hh>
8
#include <qpdf/QPDFLogger.hh>
9
#include <qpdf/QPDFObjectHandle_private.hh>
10
#include <qpdf/QPDFObject_private.hh>
11
#include <qpdf/QPDFParser.hh>
12
#include <qpdf/QTC.hh>
13
#include <qpdf/QUtil.hh>
14
#include <qpdf/Util.hh>
15
16
#include <array>
17
#include <atomic>
18
#include <cstring>
19
#include <limits>
20
#include <map>
21
#include <vector>
22
23
using namespace qpdf;
24
using namespace std::literals;
25
26
using Objects = QPDF::Doc::Objects;
27
28
namespace
29
{
30
    class InvalidInputSource: public InputSource
31
    {
32
      public:
33
        ~InvalidInputSource() override = default;
34
        qpdf_offset_t
35
        findAndSkipNextEOL() override
36
0
        {
37
0
            throwException();
38
0
            return 0;
39
0
        }
40
        std::string const&
41
        getName() const override
42
0
        {
43
0
            static std::string name("closed input source");
44
0
            return name;
45
0
        }
46
        qpdf_offset_t
47
        tell() override
48
0
        {
49
0
            throwException();
50
0
            return 0;
51
0
        }
52
        void
53
        seek(qpdf_offset_t offset, int whence) override
54
0
        {
55
0
            throwException();
56
0
        }
57
        void
58
        rewind() override
59
0
        {
60
0
            throwException();
61
0
        }
62
        size_t
63
        read(char* buffer, size_t length) override
64
0
        {
65
0
            throwException();
66
0
            return 0;
67
0
        }
68
        void
69
        unreadCh(char ch) override
70
0
        {
71
0
            throwException();
72
0
        }
73
74
      private:
75
        void
76
        throwException()
77
0
        {
78
0
            throw std::logic_error(
79
0
                "QPDF operation attempted on a QPDF object with no input "
80
0
                "source. QPDF operations are invalid before processFile (or "
81
0
                "another process method) or after closeInputSource");
82
0
        }
83
    };
84
} // namespace
85
86
class QPDF::ResolveRecorder final
87
{
88
  public:
89
    ResolveRecorder(QPDF& qpdf, QPDFObjGen const& og) :
90
0
        qpdf(qpdf),
91
0
        iter(qpdf.m->resolving.insert(og).first)
92
0
    {
93
0
    }
94
    ~ResolveRecorder()
95
0
    {
96
0
        qpdf.m->resolving.erase(iter);
97
0
    }
98
99
  private:
100
    QPDF& qpdf;
101
    std::set<QPDFObjGen>::const_iterator iter;
102
};
103
104
bool
105
QPDF::findStartxref()
106
7.90k
{
107
7.90k
    if (m->objects.readToken(*m->file).isWord("startxref") &&
108
7.90k
        m->objects.readToken(*m->file).isInteger()) {
109
        // Position in front of offset token
110
7.90k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
111
7.90k
        return true;
112
7.90k
    }
113
0
    return false;
114
7.90k
}
115
116
void
117
Objects::parse(char const* password)
118
7.90k
{
119
7.90k
    if (password) {
120
0
        m->encp->provided_password = password;
121
0
    }
122
123
    // Find the header anywhere in the first 1024 bytes of the file.
124
7.90k
    PatternFinder hf(qpdf, &QPDF::findHeader);
125
7.90k
    if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
126
0
        qpdf.warn(qpdf.damagedPDF("", -1, "can't find PDF header"));
127
        // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
128
0
        m->pdf_version = "1.2";
129
0
    }
130
131
    // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file.  We add an extra
132
    // 30 characters to leave room for the startxref stuff.
133
7.90k
    m->file->seek(0, SEEK_END);
134
7.90k
    qpdf_offset_t end_offset = m->file->tell();
135
7.90k
    m->xref_table_max_offset = end_offset;
136
    // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
137
    // scenarios at least 3 bytes are required.
138
7.90k
    if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
139
7.90k
        m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
140
7.90k
    }
141
7.90k
    qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
142
7.90k
    PatternFinder sf(qpdf, &QPDF::findStartxref);
143
7.90k
    qpdf_offset_t xref_offset = 0;
144
7.90k
    if (m->file->findLast("startxref", start_offset, 0, sf)) {
145
7.90k
        xref_offset = QUtil::string_to_ll(readToken(*m->file).getValue().c_str());
146
7.90k
    }
147
148
7.90k
    try {
149
7.90k
        if (xref_offset == 0) {
150
0
            throw qpdf.damagedPDF("", -1, "can't find startxref");
151
0
        }
152
7.90k
        try {
153
7.90k
            read_xref(xref_offset);
154
7.90k
        } catch (QPDFExc&) {
155
0
            throw;
156
0
        } catch (std::exception& e) {
157
0
            throw qpdf.damagedPDF("", -1, std::string("error reading xref: ") + e.what());
158
0
        }
159
7.90k
    } catch (QPDFExc& e) {
160
0
        if (m->attempt_recovery) {
161
0
            reconstruct_xref(e, xref_offset > 0);
162
0
        } else {
163
0
            throw;
164
0
        }
165
0
    }
166
167
7.90k
    qpdf.initializeEncryption();
168
7.90k
    m->parsed = true;
169
7.90k
    if (!m->xref_table.empty() && !qpdf.getRoot().getKey("/Pages").isDictionary()) {
170
        // QPDFs created from JSON have an empty xref table and no root object yet.
171
0
        throw qpdf.damagedPDF("", -1, "unable to find page tree");
172
0
    }
173
7.90k
}
174
175
void
176
Objects::inParse(bool v)
177
15.8k
{
178
15.8k
    if (m->in_parse == v) {
179
        // This happens if QPDFParser::parse tries to resolve an indirect object while it is
180
        // parsing.
181
0
        throw std::logic_error(
182
0
            "QPDF: re-entrant parsing detected. This is a qpdf bug."
183
0
            " Please report at https://github.com/qpdf/qpdf/issues.");
184
0
    }
185
15.8k
    m->in_parse = v;
186
15.8k
}
187
188
void
189
Objects::setTrailer(QPDFObjectHandle obj)
190
7.90k
{
191
7.90k
    if (m->trailer) {
192
0
        return;
193
0
    }
194
7.90k
    m->trailer = obj;
195
7.90k
}
196
197
void
198
Objects::reconstruct_xref(QPDFExc& e, bool found_startxref)
199
0
{
200
0
    if (m->reconstructed_xref) {
201
        // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
202
        // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
203
0
        throw e;
204
0
    }
205
206
    // If recovery generates more than 1000 warnings, the file is so severely damaged that there
207
    // probably is no point trying to continue.
208
0
    const auto max_warnings = m->warnings.size() + 1000U;
209
0
    auto check_warnings = [this, max_warnings]() {
210
0
        if (m->warnings.size() > max_warnings) {
211
0
            throw qpdf.damagedPDF(
212
0
                "", -1, "too many errors while reconstructing cross-reference table");
213
0
        }
214
0
    };
215
216
0
    m->reconstructed_xref = true;
217
    // We may find more objects, which may contain dangling references.
218
0
    m->fixed_dangling_refs = false;
219
220
0
    qpdf.warn(qpdf.damagedPDF("", -1, "file is damaged"));
221
0
    qpdf.warn(e);
222
0
    qpdf.warn(qpdf.damagedPDF("", -1, "Attempting to reconstruct cross-reference table"));
223
224
    // Delete all references to type 1 (uncompressed) objects
225
0
    std::vector<QPDFObjGen> to_delete;
226
0
    for (auto const& iter: m->xref_table) {
227
0
        if (iter.second.getType() == 1) {
228
0
            to_delete.emplace_back(iter.first);
229
0
        }
230
0
    }
231
0
    for (auto const& iter: to_delete) {
232
0
        m->xref_table.erase(iter);
233
0
    }
234
235
0
    std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
236
0
    std::vector<qpdf_offset_t> trailers;
237
0
    std::vector<qpdf_offset_t> startxrefs;
238
239
0
    m->file->seek(0, SEEK_END);
240
0
    qpdf_offset_t eof = m->file->tell();
241
0
    m->file->seek(0, SEEK_SET);
242
    // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
243
0
    static size_t const MAX_LEN = 10;
244
0
    while (m->file->tell() < eof) {
245
0
        QPDFTokenizer::Token t1 = m->objects.readToken(*m->file, MAX_LEN);
246
0
        qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
247
0
        if (t1.isInteger()) {
248
0
            auto pos = m->file->tell();
249
0
            auto t2 = m->objects.readToken(*m->file, MAX_LEN);
250
0
            if (t2.isInteger() && m->objects.readToken(*m->file, MAX_LEN).isWord("obj")) {
251
0
                int obj = QUtil::string_to_int(t1.getValue().c_str());
252
0
                int gen = QUtil::string_to_int(t2.getValue().c_str());
253
0
                if (obj <= m->xref_table_max_id) {
254
0
                    found_objects.emplace_back(obj, gen, token_start);
255
0
                } else {
256
0
                    qpdf.warn(qpdf.damagedPDF(
257
0
                        "", -1, "ignoring object with impossibly large id " + std::to_string(obj)));
258
0
                }
259
0
            }
260
0
            m->file->seek(pos, SEEK_SET);
261
0
        } else if (!m->trailer && t1.isWord("trailer")) {
262
0
            trailers.emplace_back(m->file->tell());
263
0
        } else if (!found_startxref && t1.isWord("startxref")) {
264
0
            startxrefs.emplace_back(m->file->tell());
265
0
        }
266
0
        check_warnings();
267
0
        m->file->findAndSkipNextEOL();
268
0
    }
269
270
0
    if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
271
0
        startxrefs.back() > std::get<2>(found_objects.back())) {
272
0
        auto xref_backup{m->xref_table};
273
0
        try {
274
0
            m->file->seek(startxrefs.back(), SEEK_SET);
275
0
            if (auto offset =
276
0
                    QUtil::string_to_ll(m->objects.readToken(*m->file).getValue().data())) {
277
0
                m->objects.read_xref(offset);
278
279
0
                if (qpdf.getRoot().getKey("/Pages").isDictionary()) {
280
0
                    QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
281
0
                    qpdf.warn(qpdf.damagedPDF(
282
0
                        "", -1, "startxref was more than 1024 bytes before end of file"));
283
0
                    qpdf.initializeEncryption();
284
0
                    m->parsed = true;
285
0
                    m->reconstructed_xref = false;
286
0
                    return;
287
0
                }
288
0
            }
289
0
        } catch (...) {
290
            // ok, bad luck. Do recovery.
291
0
        }
292
0
        m->xref_table = std::move(xref_backup);
293
0
    }
294
295
0
    auto rend = found_objects.rend();
296
0
    for (auto it = found_objects.rbegin(); it != rend; it++) {
297
0
        auto [obj, gen, token_start] = *it;
298
0
        insertXrefEntry(obj, 1, token_start, gen);
299
0
        check_warnings();
300
0
    }
301
0
    m->deleted_objects.clear();
302
303
    // Search at most the last 100 trailer candidates. If none of them are valid, odds are this file
304
    // is deliberately broken.
305
0
    int end_index = trailers.size() > 100 ? static_cast<int>(trailers.size()) - 100 : 0;
306
0
    for (auto it = trailers.rbegin(); it != std::prev(trailers.rend(), end_index); it++) {
307
0
        m->file->seek(*it, SEEK_SET);
308
0
        auto t = readTrailer();
309
0
        if (!t.isDictionary()) {
310
            // Oh well.  It was worth a try.
311
0
        } else {
312
0
            if (t.hasKey("/Root")) {
313
0
                m->trailer = t;
314
0
                break;
315
0
            }
316
0
            qpdf.warn(qpdf.damagedPDF("trailer", *it, "recovered trailer has no /Root entry"));
317
0
        }
318
0
        check_warnings();
319
0
    }
320
321
0
    if (!m->trailer) {
322
0
        qpdf_offset_t max_offset{0};
323
0
        size_t max_size{0};
324
        // If there are any xref streams, take the last one to appear.
325
0
        for (auto const& iter: m->xref_table) {
326
0
            auto entry = iter.second;
327
0
            if (entry.getType() != 1) {
328
0
                continue;
329
0
            }
330
0
            auto oh = qpdf.getObject(iter.first);
331
0
            try {
332
0
                if (!oh.isStreamOfType("/XRef")) {
333
0
                    continue;
334
0
                }
335
0
            } catch (std::exception&) {
336
0
                continue;
337
0
            }
338
0
            auto offset = entry.getOffset();
339
0
            auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
340
0
            if (size > max_size || (size == max_size && offset > max_offset)) {
341
0
                max_offset = offset;
342
0
                setTrailer(oh.getDict());
343
0
            }
344
0
            check_warnings();
345
0
        }
346
0
        if (max_offset > 0) {
347
0
            try {
348
0
                read_xref(max_offset, true);
349
0
            } catch (std::exception&) {
350
0
                qpdf.warn(qpdf.damagedPDF(
351
0
                    "", -1, "error decoding candidate xref stream while recovering damaged file"));
352
0
            }
353
0
            QTC::TC("qpdf", "QPDF recover xref stream");
354
0
        }
355
0
    }
356
357
0
    if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
358
        // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
359
0
        QPDFObjectHandle root;
360
0
        for (auto const& iter: m->obj_cache) {
361
0
            try {
362
0
                if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
363
0
                    root = iter.second.object;
364
0
                }
365
0
            } catch (std::exception&) {
366
0
                continue;
367
0
            }
368
0
        }
369
0
        if (root) {
370
0
            if (!m->trailer) {
371
0
                qpdf.warn(qpdf.damagedPDF(
372
0
                    "", -1, "unable to find trailer dictionary while recovering damaged file"));
373
0
                m->trailer = QPDFObjectHandle::newDictionary();
374
0
            }
375
0
            m->trailer.replaceKey("/Root", root);
376
0
        }
377
0
    }
378
379
0
    if (!m->trailer) {
380
        // We could check the last encountered object to see if it was an xref stream.  If so, we
381
        // could try to get the trailer from there.  This may make it possible to recover files with
382
        // bad startxref pointers even when they have object streams.
383
384
0
        throw qpdf.damagedPDF(
385
0
            "", -1, "unable to find trailer dictionary while recovering damaged file");
386
0
    }
387
0
    if (m->xref_table.empty()) {
388
        // We cannot check for an empty xref table in parse because empty tables are valid when
389
        // creating QPDF objects from JSON.
390
0
        throw qpdf.damagedPDF("", -1, "unable to find objects while recovering damaged file");
391
0
    }
392
0
    check_warnings();
393
0
    if (!m->parsed) {
394
0
        m->parsed = true;
395
0
        qpdf.getAllPages();
396
0
        check_warnings();
397
0
        if (m->all_pages.empty()) {
398
0
            m->parsed = false;
399
0
            throw qpdf.damagedPDF("", -1, "unable to find any pages while recovering damaged file");
400
0
        }
401
0
    }
402
403
    // We could iterate through the objects looking for streams and try to find objects inside of
404
    // them, but it's probably not worth the trouble.  Acrobat can't recover files with any errors
405
    // in an xref stream, and this would be a real long shot anyway.  If we wanted to do anything
406
    // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
407
    // It's safe to call it more than once.
408
0
}
409
410
void
411
Objects::read_xref(qpdf_offset_t xref_offset, bool in_stream_recovery)
412
7.90k
{
413
7.90k
    std::map<int, int> free_table;
414
7.90k
    std::set<qpdf_offset_t> visited;
415
15.8k
    while (xref_offset) {
416
7.90k
        visited.insert(xref_offset);
417
7.90k
        char buf[7];
418
7.90k
        memset(buf, 0, sizeof(buf));
419
7.90k
        m->file->seek(xref_offset, SEEK_SET);
420
        // Some files miss the mark a little with startxref. We could do a better job of searching
421
        // in the neighborhood for something that looks like either an xref table or stream, but the
422
        // simple heuristic of skipping whitespace can help with the xref table case and is harmless
423
        // with the stream case.
424
7.90k
        bool done = false;
425
7.90k
        bool skipped_space = false;
426
15.8k
        while (!done) {
427
7.90k
            char ch;
428
7.90k
            if (1 == m->file->read(&ch, 1)) {
429
7.90k
                if (util::is_space(ch)) {
430
0
                    skipped_space = true;
431
7.90k
                } else {
432
7.90k
                    m->file->unreadCh(ch);
433
7.90k
                    done = true;
434
7.90k
                }
435
7.90k
            } else {
436
0
                QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
437
0
                done = true;
438
0
            }
439
7.90k
        }
440
441
7.90k
        m->file->read(buf, sizeof(buf) - 1);
442
        // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
443
        // where it is terminated by arbitrary whitespace.
444
7.90k
        if ((strncmp(buf, "xref", 4) == 0) && util::is_space(buf[4])) {
445
7.90k
            if (skipped_space) {
446
0
                qpdf.warn(qpdf.damagedPDF("", -1, "extraneous whitespace seen before xref"));
447
0
            }
448
7.90k
            QTC::TC(
449
7.90k
                "qpdf",
450
7.90k
                "QPDF xref space",
451
7.90k
                ((buf[4] == '\n')       ? 0
452
7.90k
                     : (buf[4] == '\r') ? 1
453
0
                     : (buf[4] == ' ')  ? 2
454
0
                                        : 9999));
455
7.90k
            int skip = 4;
456
            // buf is null-terminated, and util::is_space('\0') is false, so this won't overrun.
457
15.8k
            while (util::is_space(buf[skip])) {
458
7.90k
                ++skip;
459
7.90k
            }
460
7.90k
            xref_offset = read_xrefTable(xref_offset + skip);
461
7.90k
        } else {
462
0
            xref_offset = read_xrefStream(xref_offset, in_stream_recovery);
463
0
        }
464
7.90k
        if (visited.contains(xref_offset)) {
465
0
            throw qpdf.damagedPDF("", -1, "loop detected following xref tables");
466
0
        }
467
7.90k
    }
468
469
7.90k
    if (!m->trailer) {
470
0
        throw qpdf.damagedPDF("", -1, "unable to find trailer while reading xref");
471
0
    }
472
7.90k
    int size = m->trailer.getKey("/Size").getIntValueAsInt();
473
7.90k
    int max_obj = 0;
474
7.90k
    if (!m->xref_table.empty()) {
475
0
        max_obj = m->xref_table.rbegin()->first.getObj();
476
0
    }
477
7.90k
    if (!m->deleted_objects.empty()) {
478
7.90k
        max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
479
7.90k
    }
480
7.90k
    if ((size < 1) || (size - 1 != max_obj)) {
481
0
        qpdf.warn(qpdf.damagedPDF(
482
0
            "",
483
0
            -1,
484
0
            ("reported number of objects (" + std::to_string(size) +
485
0
             ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
486
0
    }
487
488
    // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
489
    // never depend on its being set.
490
7.90k
    m->deleted_objects.clear();
491
492
    // Make sure we keep only the highest generation for any object.
493
7.90k
    QPDFObjGen last_og{-1, 0};
494
7.90k
    for (auto const& item: m->xref_table) {
495
0
        auto id = item.first.getObj();
496
0
        if (id == last_og.getObj() && id > 0) {
497
0
            qpdf.removeObject(last_og);
498
0
        }
499
0
        last_og = item.first;
500
0
    }
501
7.90k
}
502
503
bool
504
Objects::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
505
7.90k
{
506
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
507
    // buffer.
508
7.90k
    char const* p = line.c_str();
509
7.90k
    char const* start = line.c_str();
510
511
    // Skip zero or more spaces
512
7.90k
    while (util::is_space(*p)) {
513
0
        ++p;
514
0
    }
515
    // Require digit
516
7.90k
    if (!util::is_digit(*p)) {
517
0
        return false;
518
0
    }
519
    // Gather digits
520
7.90k
    std::string obj_str;
521
15.8k
    while (util::is_digit(*p)) {
522
7.90k
        obj_str.append(1, *p++);
523
7.90k
    }
524
    // Require space
525
7.90k
    if (!util::is_space(*p)) {
526
0
        return false;
527
0
    }
528
    // Skip spaces
529
15.8k
    while (util::is_space(*p)) {
530
7.90k
        ++p;
531
7.90k
    }
532
    // Require digit
533
7.90k
    if (!util::is_digit(*p)) {
534
0
        return false;
535
0
    }
536
    // Gather digits
537
7.90k
    std::string num_str;
538
15.8k
    while (util::is_digit(*p)) {
539
7.90k
        num_str.append(1, *p++);
540
7.90k
    }
541
    // Skip any space including line terminators
542
15.8k
    while (util::is_space(*p)) {
543
7.90k
        ++p;
544
7.90k
    }
545
7.90k
    bytes = toI(p - start);
546
7.90k
    obj = QUtil::string_to_int(obj_str.c_str());
547
7.90k
    num = QUtil::string_to_int(num_str.c_str());
548
7.90k
    return true;
549
7.90k
}
550
551
bool
552
Objects::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
553
0
{
554
    // Reposition after initial read attempt and reread.
555
0
    m->file->seek(m->file->getLastOffset(), SEEK_SET);
556
0
    auto line = m->file->readLine(30);
557
558
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
559
    // buffer.
560
0
    char const* p = line.data();
561
562
    // Skip zero or more spaces. There aren't supposed to be any.
563
0
    bool invalid = false;
564
0
    while (util::is_space(*p)) {
565
0
        ++p;
566
0
        invalid = true;
567
0
    }
568
    // Require digit
569
0
    if (!util::is_digit(*p)) {
570
0
        return false;
571
0
    }
572
    // Gather digits
573
0
    std::string f1_str;
574
0
    while (util::is_digit(*p)) {
575
0
        f1_str.append(1, *p++);
576
0
    }
577
    // Require space
578
0
    if (!util::is_space(*p)) {
579
0
        return false;
580
0
    }
581
0
    if (util::is_space(*(p + 1))) {
582
0
        invalid = true;
583
0
    }
584
    // Skip spaces
585
0
    while (util::is_space(*p)) {
586
0
        ++p;
587
0
    }
588
    // Require digit
589
0
    if (!util::is_digit(*p)) {
590
0
        return false;
591
0
    }
592
    // Gather digits
593
0
    std::string f2_str;
594
0
    while (util::is_digit(*p)) {
595
0
        f2_str.append(1, *p++);
596
0
    }
597
    // Require space
598
0
    if (!util::is_space(*p)) {
599
0
        return false;
600
0
    }
601
0
    if (util::is_space(*(p + 1))) {
602
0
        invalid = true;
603
0
    }
604
    // Skip spaces
605
0
    while (util::is_space(*p)) {
606
0
        ++p;
607
0
    }
608
0
    if ((*p == 'f') || (*p == 'n')) {
609
0
        type = *p;
610
0
    } else {
611
0
        return false;
612
0
    }
613
0
    if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
614
0
        invalid = true;
615
0
    }
616
617
0
    if (invalid) {
618
0
        qpdf.warn(qpdf.damagedPDF("xref table", "accepting invalid xref table entry"));
619
0
    }
620
621
0
    f1 = QUtil::string_to_ll(f1_str.c_str());
622
0
    f2 = QUtil::string_to_int(f2_str.c_str());
623
624
0
    return true;
625
0
}
626
627
// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
628
// result.
629
bool
630
Objects::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
631
7.90k
{
632
7.90k
    std::array<char, 21> line;
633
7.90k
    if (m->file->read(line.data(), 20) != 20) {
634
        // C++20: [[unlikely]]
635
0
        return false;
636
0
    }
637
7.90k
    line[20] = '\0';
638
7.90k
    char const* p = line.data();
639
640
7.90k
    int f1_len = 0;
641
7.90k
    int f2_len = 0;
642
643
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
644
    // buffer.
645
646
    // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
647
86.9k
    while (*p == '0') {
648
79.0k
        ++f1_len;
649
79.0k
        ++p;
650
79.0k
    }
651
7.90k
    while (util::is_digit(*p) && f1_len++ < 10) {
652
0
        f1 *= 10;
653
0
        f1 += *p++ - '0';
654
0
    }
655
    // Require space
656
7.90k
    if (!util::is_space(*p++)) {
657
        // Entry doesn't start with space or digit.
658
        // C++20: [[unlikely]]
659
0
        return false;
660
0
    }
661
    // Gather digits. NB No risk of overflow as 99'999 < max int.
662
7.90k
    while (*p == '0') {
663
0
        ++f2_len;
664
0
        ++p;
665
0
    }
666
47.4k
    while (util::is_digit(*p) && f2_len++ < 5) {
667
39.5k
        f2 *= 10;
668
39.5k
        f2 += static_cast<int>(*p++ - '0');
669
39.5k
    }
670
7.90k
    if (util::is_space(*p++) && (*p == 'f' || *p == 'n')) {
671
        // C++20: [[likely]]
672
7.90k
        type = *p;
673
        // No test for valid line[19].
674
7.90k
        if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
675
            // C++20: [[likely]]
676
7.90k
            return true;
677
7.90k
        }
678
7.90k
    }
679
0
    return read_bad_xrefEntry(f1, f2, type);
680
7.90k
}
681
682
// Read a single cross-reference table section and associated trailer.
683
qpdf_offset_t
684
Objects::read_xrefTable(qpdf_offset_t xref_offset)
685
7.90k
{
686
7.90k
    m->file->seek(xref_offset, SEEK_SET);
687
7.90k
    std::string line;
688
7.90k
    while (true) {
689
7.90k
        line.assign(50, '\0');
690
7.90k
        m->file->read(line.data(), line.size());
691
7.90k
        int obj = 0;
692
7.90k
        int num = 0;
693
7.90k
        int bytes = 0;
694
7.90k
        if (!parse_xrefFirst(line, obj, num, bytes)) {
695
0
            throw qpdf.damagedPDF("xref table", "xref syntax invalid");
696
0
        }
697
7.90k
        m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
698
15.8k
        for (qpdf_offset_t i = obj; i - num < obj; ++i) {
699
7.90k
            if (i == 0) {
700
                // This is needed by checkLinearization()
701
7.90k
                m->first_xref_item_offset = m->file->tell();
702
7.90k
            }
703
            // For xref_table, these will always be small enough to be ints
704
7.90k
            qpdf_offset_t f1 = 0;
705
7.90k
            int f2 = 0;
706
7.90k
            char type = '\0';
707
7.90k
            if (!read_xrefEntry(f1, f2, type)) {
708
0
                throw qpdf.damagedPDF(
709
0
                    "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
710
0
            }
711
7.90k
            if (type == 'f') {
712
7.90k
                insertFreeXrefEntry(QPDFObjGen(toI(i), f2));
713
7.90k
            } else {
714
0
                insertXrefEntry(toI(i), 1, f1, f2);
715
0
            }
716
7.90k
        }
717
7.90k
        qpdf_offset_t pos = m->file->tell();
718
7.90k
        if (readToken(*m->file).isWord("trailer")) {
719
7.90k
            break;
720
7.90k
        } else {
721
0
            m->file->seek(pos, SEEK_SET);
722
0
        }
723
7.90k
    }
724
725
    // Set offset to previous xref table if any
726
7.90k
    QPDFObjectHandle cur_trailer = m->objects.readTrailer();
727
7.90k
    if (!cur_trailer.isDictionary()) {
728
0
        throw qpdf.damagedPDF("", "expected trailer dictionary");
729
0
    }
730
731
7.90k
    if (!m->trailer) {
732
7.90k
        setTrailer(cur_trailer);
733
734
7.90k
        if (!m->trailer.hasKey("/Size")) {
735
0
            throw qpdf.damagedPDF("trailer", "trailer dictionary lacks /Size key");
736
0
        }
737
7.90k
        if (!m->trailer.getKey("/Size").isInteger()) {
738
0
            throw qpdf.damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
739
0
        }
740
7.90k
    }
741
742
7.90k
    if (cur_trailer.hasKey("/XRefStm")) {
743
0
        if (m->ignore_xref_streams) {
744
0
            QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
745
0
        } else {
746
0
            if (cur_trailer.getKey("/XRefStm").isInteger()) {
747
                // Read the xref stream but disregard any return value -- we'll use our trailer's
748
                // /Prev key instead of the xref stream's.
749
0
                (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
750
0
            } else {
751
0
                throw qpdf.damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
752
0
            }
753
0
        }
754
0
    }
755
756
7.90k
    if (cur_trailer.hasKey("/Prev")) {
757
0
        if (!cur_trailer.getKey("/Prev").isInteger()) {
758
0
            throw qpdf.damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
759
0
        }
760
0
        return cur_trailer.getKey("/Prev").getIntValue();
761
0
    }
762
763
7.90k
    return 0;
764
7.90k
}
765
766
// Read a single cross-reference stream.
767
qpdf_offset_t
768
Objects::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
769
0
{
770
0
    if (!m->ignore_xref_streams) {
771
0
        QPDFObjectHandle xref_obj;
772
0
        try {
773
0
            m->in_read_xref_stream = true;
774
0
            xref_obj = readObjectAtOffset(xref_offset, "xref stream", true);
775
0
        } catch (QPDFExc&) {
776
            // ignore -- report error below
777
0
        }
778
0
        m->in_read_xref_stream = false;
779
0
        if (xref_obj.isStreamOfType("/XRef")) {
780
0
            return processXRefStream(xref_offset, xref_obj, in_stream_recovery);
781
0
        }
782
0
    }
783
784
0
    throw qpdf.damagedPDF("", xref_offset, "xref not found");
785
0
    return 0; // unreachable
786
0
}
787
788
// Return the entry size of the xref stream and the processed W array.
789
std::pair<int, std::array<int, 3>>
790
Objects::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
791
0
{
792
0
    auto W_obj = dict.getKey("/W");
793
0
    if (!(W_obj.size() >= 3 && W_obj.getArrayItem(0).isInteger() &&
794
0
          W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
795
0
        throw damaged("Cross-reference stream does not have a proper /W key");
796
0
    }
797
798
0
    std::array<int, 3> W;
799
0
    int entry_size = 0;
800
0
    auto w_vector = W_obj.getArrayAsVector();
801
0
    int max_bytes = sizeof(qpdf_offset_t);
802
0
    for (size_t i = 0; i < 3; ++i) {
803
0
        W[i] = w_vector[i].getIntValueAsInt();
804
0
        if (W[i] > max_bytes) {
805
0
            throw damaged("Cross-reference stream's /W contains impossibly large values");
806
0
        }
807
0
        if (W[i] < 0) {
808
0
            throw damaged("Cross-reference stream's /W contains negative values");
809
0
        }
810
0
        entry_size += W[i];
811
0
    }
812
0
    if (entry_size == 0) {
813
0
        throw damaged("Cross-reference stream's /W indicates entry size of 0");
814
0
    }
815
0
    return {entry_size, W};
816
0
}
817
818
// Validate Size key and return the maximum number of entries that the xref stream can contain.
819
int
820
Objects::processXRefSize(
821
    QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
822
0
{
823
    // Number of entries is limited by the highest possible object id and stream size.
824
0
    auto max_num_entries = std::numeric_limits<int>::max();
825
0
    if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
826
0
        max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
827
0
    }
828
829
0
    auto Size_obj = dict.getKey("/Size");
830
0
    long long size;
831
0
    if (!dict.getKey("/Size").getValueAsInt(size)) {
832
0
        throw damaged("Cross-reference stream does not have a proper /Size key");
833
0
    } else if (size < 0) {
834
0
        throw damaged("Cross-reference stream has a negative /Size key");
835
0
    } else if (size >= max_num_entries) {
836
0
        throw damaged("Cross-reference stream has an impossibly large /Size key");
837
0
    }
838
    // We are not validating that Size <= (Size key of parent xref / trailer).
839
0
    return max_num_entries;
840
0
}
841
842
// Return the number of entries of the xref stream and the processed Index array.
843
std::pair<int, std::vector<std::pair<int, int>>>
844
Objects::processXRefIndex(
845
    QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
846
0
{
847
0
    auto size = dict.getKey("/Size").getIntValueAsInt();
848
0
    auto Index_obj = dict.getKey("/Index");
849
850
0
    if (Index_obj.isArray()) {
851
0
        std::vector<std::pair<int, int>> indx;
852
0
        int num_entries = 0;
853
0
        auto index_vec = Index_obj.getArrayAsVector();
854
0
        if ((index_vec.size() % 2) || index_vec.size() < 2) {
855
0
            throw damaged("Cross-reference stream's /Index has an invalid number of values");
856
0
        }
857
858
0
        int i = 0;
859
0
        long long first = 0;
860
0
        for (auto& val: index_vec) {
861
0
            if (val.isInteger()) {
862
0
                if (i % 2) {
863
0
                    auto count = val.getIntValue();
864
0
                    if (count <= 0) {
865
0
                        throw damaged(
866
0
                            "Cross-reference stream section claims to contain " +
867
0
                            std::to_string(count) + " entries");
868
0
                    }
869
                    // We are guarding against the possibility of num_entries * entry_size
870
                    // overflowing. We are not checking that entries are in ascending order as
871
                    // required by the spec, which probably should generate a warning. We are also
872
                    // not checking that for each subsection first object number + number of entries
873
                    // <= /Size. The spec requires us to ignore object number > /Size.
874
0
                    if (first > (max_num_entries - count) ||
875
0
                        count > (max_num_entries - num_entries)) {
876
0
                        throw damaged(
877
0
                            "Cross-reference stream claims to contain too many entries: " +
878
0
                            std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
879
0
                            std::to_string(num_entries));
880
0
                    }
881
0
                    indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
882
0
                    num_entries += static_cast<int>(count);
883
0
                } else {
884
0
                    first = val.getIntValue();
885
0
                    if (first < 0) {
886
0
                        throw damaged(
887
0
                            "Cross-reference stream's /Index contains a negative object id");
888
0
                    } else if (first > max_num_entries) {
889
0
                        throw damaged(
890
0
                            "Cross-reference stream's /Index contains an impossibly "
891
0
                            "large object id");
892
0
                    }
893
0
                }
894
0
            } else {
895
0
                throw damaged(
896
0
                    "Cross-reference stream's /Index's item " + std::to_string(i) +
897
0
                    " is not an integer");
898
0
            }
899
0
            i++;
900
0
        }
901
0
        QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
902
0
        return {num_entries, indx};
903
0
    } else if (Index_obj.null()) {
904
0
        return {size, {{0, size}}};
905
0
    } else {
906
0
        throw damaged("Cross-reference stream does not have a proper /Index key");
907
0
    }
908
0
}
909
910
// Process the cross-reference stream xref_obj located at xref_offset: validate its /W, /Size and
// /Index keys, decode every entry from the stream data and record it via insertXrefEntry /
// insertFreeXrefEntry. The stream dictionary becomes the trailer if no trailer has been set yet.
//
// When in_stream_recovery is true only type 2 (compressed) entries are inserted.
//
// Returns the value of the /Prev key, or 0 if the dictionary has none. Throws the damaged-PDF
// exception for malformed keys, for stream data shorter than expected and for a non-integer
// /Prev; stream data longer than expected only produces a warning.
qpdf_offset_t
Objects::processXRefStream(
    qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj, bool in_stream_recovery)
{
    // Copy the message into an owning string: a std::string_view is not guaranteed to be
    // null-terminated, so passing msg.data() where a std::string is constructed could read past
    // the end of the view.
    auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
        return qpdf.damagedPDF("xref stream", xref_offset, std::string(msg));
    };

    auto dict = xref_obj.getDict();

    auto [entry_size, W] = processXRefW(dict, damaged);
    int max_num_entries = processXRefSize(dict, entry_size, damaged);
    auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);

    std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
    size_t actual_size = bp->getSize();
    auto expected_size = toS(entry_size) * toS(num_entries);

    if (expected_size != actual_size) {
        QPDFExc x = damaged(
            "Cross-reference stream data has the wrong size; expected = " +
            std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
        if (expected_size > actual_size) {
            // Too little data: decoding would run off the end of the buffer.
            throw x;
        } else {
            // Surplus data is tolerated; only the expected number of bytes is decoded.
            qpdf.warn(x);
        }
    }

    bool saw_first_compressed_object = false;

    // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
    // We know that entry_size * num_entries is less or equal to the size of the buffer.
    auto p = bp->getBuffer();
    for (auto [obj, sec_entries]: indx) {
        // Process a subsection.
        for (int i = 0; i < sec_entries; ++i) {
            // Read this entry. Each field is W[j] bytes wide, most significant byte first.
            std::array<qpdf_offset_t, 3> fields{};
            if (W[0] == 0) {
                // A zero-width type field means the entry type defaults to 1.
                fields[0] = 1;
            }
            for (size_t j = 0; j < 3; ++j) {
                for (int k = 0; k < W[j]; ++k) {
                    fields[j] <<= 8;
                    fields[j] |= *p++;
                }
            }

            // Record whether an entry that is not type 2 follows a type 2 (compressed) entry.
            if (saw_first_compressed_object) {
                if (fields[0] != 2) {
                    m->uncompressed_after_compressed = true;
                }
            } else if (fields[0] == 2) {
                saw_first_compressed_object = true;
            }
            if (obj == 0) {
                // This is needed by checkLinearization()
                m->first_xref_item_offset = xref_offset;
            } else if (fields[0] == 0) {
                // Ignore fields[2], which we don't care about in this case. This works around the
                // issue of some PDF files that put invalid values, like -1, here for deleted
                // objects.
                insertFreeXrefEntry(QPDFObjGen(obj, 0));
            } else {
                auto typ = toI(fields[0]);
                if (!in_stream_recovery || typ == 2) {
                    // If we are in xref stream recovery all actual uncompressed objects have
                    // already been inserted into the xref table. Avoid adding junk data into the
                    // xref table.
                    insertXrefEntry(obj, typ, fields[1], toI(fields[2]));
                }
            }
            ++obj;
        }
    }

    if (!m->trailer) {
        setTrailer(dict);
    }

    if (dict.hasKey("/Prev")) {
        auto prev = dict.getKey("/Prev");
        if (!prev.isInteger()) {
            throw qpdf.damagedPDF(
                "xref stream", "/Prev key in xref stream dictionary is not an integer");
        }
        return prev.getIntValue();
    }
    return 0;
}
1003
1004
void
1005
Objects::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
1006
0
{
1007
    // Populate the xref table in such a way that the first reference to an object that we see,
1008
    // which is the one in the latest xref table in which it appears, is the one that gets stored.
1009
    // This works because we are reading more recent appends before older ones.
1010
1011
    // If there is already an entry for this object and generation in the table, it means that a
1012
    // later xref table has registered this object.  Disregard this one.
1013
0
    int new_gen = f0 == 2 ? 0 : f2;
1014
1015
0
    if (!(f0 == 1 || f0 == 2)) {
1016
0
        return;
1017
0
    }
1018
1019
0
    if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && new_gen < 65535)) {
1020
        // We are ignoring invalid objgens. Most will arrive here from xref reconstruction. There
1021
        // is probably no point having another warning but we could count invalid items in order to
1022
        // decide when to give up.
1023
        // ignore impossibly large object ids or object ids > Size.
1024
0
        return;
1025
0
    }
1026
1027
0
    if (m->deleted_objects.contains(obj)) {
1028
0
        return;
1029
0
    }
1030
1031
0
    if (f0 == 2) {
1032
0
        if (f1 == obj) {
1033
0
            qpdf.warn(qpdf.damagedPDF(
1034
0
                "xref stream", "self-referential object stream " + std::to_string(obj)));
1035
0
            return;
1036
0
        }
1037
0
        if (f1 > m->xref_table_max_id) {
1038
            // ignore impossibly large object stream ids
1039
0
            qpdf.warn(qpdf.damagedPDF(
1040
0
                "xref stream",
1041
0
                "object stream id " + std::to_string(f1) + " for object " + std::to_string(obj) +
1042
0
                    " is impossibly large"));
1043
0
            return;
1044
0
        }
1045
0
    }
1046
1047
0
    auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1048
0
    if (!created) {
1049
0
        return;
1050
0
    }
1051
1052
0
    switch (f0) {
1053
0
    case 1:
1054
        // f2 is generation
1055
0
        QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1056
0
        iter->second = QPDFXRefEntry(f1);
1057
0
        break;
1058
1059
0
    case 2:
1060
0
        iter->second = QPDFXRefEntry(toI(f1), f2);
1061
0
        break;
1062
1063
0
    default:
1064
0
        throw qpdf.damagedPDF(
1065
0
            "xref stream", "unknown xref stream entry type " + std::to_string(f0));
1066
0
        break;
1067
0
    }
1068
0
}
1069
1070
void
1071
Objects::insertFreeXrefEntry(QPDFObjGen og)
1072
7.90k
{
1073
7.90k
    if (!m->xref_table.contains(og)) {
1074
7.90k
        m->deleted_objects.insert(og.getObj());
1075
7.90k
    }
1076
7.90k
}
1077
1078
void
1079
QPDF::showXRefTable()
1080
0
{
1081
0
    auto& cout = *m->log->getInfo();
1082
0
    for (auto const& iter: m->xref_table) {
1083
0
        QPDFObjGen const& og = iter.first;
1084
0
        QPDFXRefEntry const& entry = iter.second;
1085
0
        cout << og.unparse('/') << ": ";
1086
0
        switch (entry.getType()) {
1087
0
        case 1:
1088
0
            cout << "uncompressed; offset = " << entry.getOffset();
1089
0
            break;
1090
1091
0
        case 2:
1092
0
            *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1093
0
                               << ", index = " << entry.getObjStreamIndex();
1094
0
            break;
1095
1096
0
        default:
1097
0
            throw std::logic_error("unknown cross-reference table type while showing xref_table");
1098
0
            break;
1099
0
        }
1100
0
        m->log->info("\n");
1101
0
    }
1102
0
}
1103
1104
// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1105
// return false. Otherwise return true.
1106
bool
1107
Objects::resolveXRefTable()
1108
0
{
1109
0
    bool may_change = !m->reconstructed_xref;
1110
0
    for (auto& iter: m->xref_table) {
1111
0
        if (isUnresolved(iter.first)) {
1112
0
            resolve(iter.first);
1113
0
            if (may_change && m->reconstructed_xref) {
1114
0
                return false;
1115
0
            }
1116
0
        }
1117
0
    }
1118
0
    return true;
1119
0
}
1120
1121
// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1122
// cache.
1123
void
1124
QPDF::fixDanglingReferences(bool force)
1125
0
{
1126
0
    if (m->fixed_dangling_refs) {
1127
0
        return;
1128
0
    }
1129
0
    if (!m->objects.resolveXRefTable()) {
1130
0
        m->objects.resolveXRefTable();
1131
0
    }
1132
0
    m->fixed_dangling_refs = true;
1133
0
}
1134
1135
size_t
1136
QPDF::getObjectCount()
1137
0
{
1138
    // This method returns the next available indirect object number. makeIndirectObject uses it for
1139
    // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1140
    // be in obj_cache.
1141
0
    fixDanglingReferences();
1142
0
    QPDFObjGen og;
1143
0
    if (!m->obj_cache.empty()) {
1144
0
        og = (*(m->obj_cache.rbegin())).first;
1145
0
    }
1146
0
    return toS(og.getObj());
1147
0
}
1148
1149
std::vector<QPDFObjectHandle>
1150
QPDF::getAllObjects()
1151
0
{
1152
    // After fixDanglingReferences is called, all objects are in the object cache.
1153
0
    fixDanglingReferences();
1154
0
    std::vector<QPDFObjectHandle> result;
1155
0
    for (auto const& iter: m->obj_cache) {
1156
0
        result.emplace_back(m->objects.newIndirect(iter.first, iter.second.object));
1157
0
    }
1158
0
    return result;
1159
0
}
1160
1161
void
1162
Objects::setLastObjectDescription(std::string const& description, QPDFObjGen og)
1163
0
{
1164
0
    m->last_object_description.clear();
1165
0
    if (!description.empty()) {
1166
0
        m->last_object_description += description;
1167
0
        if (og.isIndirect()) {
1168
0
            m->last_object_description += ": ";
1169
0
        }
1170
0
    }
1171
0
    if (og.isIndirect()) {
1172
0
        m->last_object_description += "object " + og.unparse(' ');
1173
0
    }
1174
0
}
1175
1176
// Parse the trailer object at the current position of the input file and return it. Warns when
// the parser reports an empty object, or when a trailer dictionary is followed by the word
// "stream".
QPDFObjectHandle
Objects::readTrailer()
{
    qpdf_offset_t const start = m->file->tell();
    auto [trailer, is_empty] =
        QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref);

    if (is_empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        qpdf.warn(qpdf.damagedPDF("trailer", "empty object treated as null"));
    } else if (trailer.isDictionary()) {
        // Only a dictionary makes us look at the next token.
        if (m->objects.readToken(*m->file).isWord("stream")) {
            qpdf.warn(
                qpdf.damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
        }
    }

    // Override last_offset so that it points to the beginning of the object we just read
    m->file->setLastOffset(start);
    return trailer;
}
1193
1194
// Parse the object body at the current position of the input file and return it. description and
// og are used to build the object description handed to the parser. If the parsed object is a
// dictionary followed by the word "stream", the stream is read as well. A missing "endobj" only
// produces a warning.
QPDFObjectHandle
Objects::readObject(std::string const& description, QPDFObjGen og)
{
    setLastObjectDescription(description, og);
    qpdf_offset_t const start = m->file->tell();

    // Strings are only routed through a decrypter when the file is encrypted.
    StringDecrypter decrypter{&qpdf, og};
    StringDecrypter* active_decrypter = nullptr;
    if (m->encp->encrypted) {
        active_decrypter = &decrypter;
    }

    bool const parser_flag = m->reconstructed_xref || m->in_read_xref_stream;
    auto [object, empty] = QPDFParser::parse(
        *m->file,
        m->last_object_description,
        m->tokenizer,
        active_decrypter,
        qpdf,
        parser_flag);

    if (empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        qpdf.warn(
            qpdf.damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null"));
        return object;
    }

    auto next = readToken(*m->file);
    if (object.isDictionary() && next.isWord("stream")) {
        readStream(object, og, start);
        next = readToken(*m->file);
    }
    if (!next.isWord("endobj")) {
        qpdf.warn(qpdf.damagedPDF("expected endobj"));
    }
    return object;
}
1227
1228
// After reading stream dictionary and stream keyword, read rest of stream.
1229
void
1230
Objects::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1231
0
{
1232
0
    validateStreamLineEnd(object, og, offset);
1233
1234
    // Must get offset before accessing any additional objects since resolving a previously
1235
    // unresolved indirect object will change file position.
1236
0
    qpdf_offset_t stream_offset = m->file->tell();
1237
0
    size_t length = 0;
1238
1239
0
    try {
1240
0
        auto length_obj = object.getKey("/Length");
1241
1242
0
        if (!length_obj.isInteger()) {
1243
0
            if (length_obj.null()) {
1244
0
                throw qpdf.damagedPDF(offset, "stream dictionary lacks /Length key");
1245
0
            }
1246
0
            throw qpdf.damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1247
0
        }
1248
1249
0
        length = toS(length_obj.getUIntValue());
1250
        // Seek in two steps to avoid potential integer overflow
1251
0
        m->file->seek(stream_offset, SEEK_SET);
1252
0
        m->file->seek(toO(length), SEEK_CUR);
1253
0
        if (!readToken(*m->file).isWord("endstream")) {
1254
0
            throw qpdf.damagedPDF("expected endstream");
1255
0
        }
1256
0
    } catch (QPDFExc& e) {
1257
0
        if (m->attempt_recovery) {
1258
0
            qpdf.warn(e);
1259
0
            length = recoverStreamLength(m->file, og, stream_offset);
1260
0
        } else {
1261
0
            throw;
1262
0
        }
1263
0
    }
1264
0
    object = QPDFObjectHandle(qpdf::Stream(qpdf, og, object, stream_offset, length));
1265
0
}
1266
1267
void
1268
Objects::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1269
0
{
1270
    // The PDF specification states that the word "stream" should be followed by either a carriage
1271
    // return and a newline or by a newline alone.  It specifically disallowed following it by a
1272
    // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1273
    // CR NL sequence was part of the stream data.  However, some readers, including Adobe reader,
1274
    // accept a carriage return by itself when followed by a non-newline character, so that's what
1275
    // we do here. We have also seen files that have extraneous whitespace between the stream
1276
    // keyword and the newline.
1277
0
    while (true) {
1278
0
        char ch;
1279
0
        if (m->file->read(&ch, 1) == 0) {
1280
            // A premature EOF here will result in some other problem that will get reported at
1281
            // another time.
1282
0
            return;
1283
0
        }
1284
0
        if (ch == '\n') {
1285
            // ready to read stream data
1286
0
            return;
1287
0
        }
1288
0
        if (ch == '\r') {
1289
            // Read another character
1290
0
            if (m->file->read(&ch, 1) != 0) {
1291
0
                if (ch == '\n') {
1292
                    // Ready to read stream data
1293
0
                    QTC::TC("qpdf", "QPDF stream with CRNL");
1294
0
                } else {
1295
                    // Treat the \r by itself as the whitespace after endstream and start reading
1296
                    // stream data in spite of not having seen a newline.
1297
0
                    m->file->unreadCh(ch);
1298
0
                    qpdf.warn(qpdf.damagedPDF(
1299
0
                        m->file->tell(), "stream keyword followed by carriage return only"));
1300
0
                }
1301
0
            }
1302
0
            return;
1303
0
        }
1304
0
        if (!util::is_space(ch)) {
1305
0
            m->file->unreadCh(ch);
1306
0
            qpdf.warn(qpdf.damagedPDF(
1307
0
                m->file->tell(), "stream keyword not followed by proper line terminator"));
1308
0
            return;
1309
0
        }
1310
0
        qpdf.warn(
1311
0
            qpdf.damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1312
0
    }
1313
0
}
1314
1315
// Parse object obj_id of object stream stream_id from input and return it. Warns when the parser
// reports an empty object.
QPDFObjectHandle
Objects::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
{
    auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, qpdf);
    if (!empty) {
        return object;
    }

    // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
    // actual PDF files and Adobe Reader appears to ignore them.
    std::string const where = m->file->getName() + " object stream " + std::to_string(stream_id);
    std::string const what = "object " + std::to_string(obj_id) + " 0, offset " +
        std::to_string(input.getLastOffset());
    qpdf.warn(QPDFExc(qpdf_e_damaged_pdf, where, what, 0, "empty object treated as null"));
    return object;
}
1332
1333
bool
1334
QPDF::findEndstream()
1335
0
{
1336
    // Find endstream or endobj. Position the input at that token.
1337
0
    auto t = m->objects.readToken(*m->file, 20);
1338
0
    if (t.isWord("endobj") || t.isWord("endstream")) {
1339
0
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
1340
0
        return true;
1341
0
    }
1342
0
    return false;
1343
0
}
1344
1345
size_t
1346
Objects::recoverStreamLength(
1347
    std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset)
1348
0
{
1349
    // Try to reconstruct stream length by looking for endstream or endobj
1350
0
    qpdf.warn(qpdf.damagedPDF(*input, stream_offset, "attempting to recover stream length"));
1351
1352
0
    PatternFinder ef(qpdf, &QPDF::findEndstream);
1353
0
    size_t length = 0;
1354
0
    if (m->file->findFirst("end", stream_offset, 0, ef)) {
1355
0
        length = toS(m->file->tell() - stream_offset);
1356
        // Reread endstream but, if it was endobj, don't skip that.
1357
0
        QPDFTokenizer::Token t = readToken(*m->file);
1358
0
        if (t.getValue() == "endobj") {
1359
0
            m->file->seek(m->file->getLastOffset(), SEEK_SET);
1360
0
        }
1361
0
    }
1362
1363
0
    if (length) {
1364
0
        auto end = stream_offset + toO(length);
1365
0
        qpdf_offset_t found_offset = 0;
1366
0
        QPDFObjGen found_og;
1367
1368
        // Make sure this is inside this object
1369
0
        for (auto const& [current_og, entry]: m->xref_table) {
1370
0
            if (entry.getType() == 1) {
1371
0
                qpdf_offset_t obj_offset = entry.getOffset();
1372
0
                if (found_offset < obj_offset && obj_offset < end) {
1373
0
                    found_offset = obj_offset;
1374
0
                    found_og = current_og;
1375
0
                }
1376
0
            }
1377
0
        }
1378
0
        if (!found_offset || found_og == og) {
1379
            // If we are trying to recover an XRef stream the xref table will not contain and
1380
            // won't contain any entries, therefore we cannot check the found length. Otherwise we
1381
            // found endstream\nendobj within the space allowed for this object, so we're probably
1382
            // in good shape.
1383
0
        } else {
1384
0
            length = 0;
1385
0
        }
1386
0
    }
1387
1388
0
    if (length == 0) {
1389
0
        qpdf.warn(qpdf.damagedPDF(
1390
0
            *input, stream_offset, "unable to recover stream data; treating stream as empty"));
1391
0
    } else {
1392
0
        qpdf.warn(qpdf.damagedPDF(
1393
0
            *input, stream_offset, "recovered stream length: " + std::to_string(length)));
1394
0
    }
1395
1396
0
    return length;
1397
0
}
1398
1399
// Read the next token from input with the shared tokenizer, passing the last object description
// as context and max_len as the length limit.
QPDFTokenizer::Token
Objects::readToken(InputSource& input, size_t max_len)
{
    auto const& context = m->last_object_description;
    return m->tokenizer.readToken(input, context, true, max_len);
}
1404
1405
// Seek to offset and read the "<id> <generation> obj" header found there. Returns the object id
// and generation. Throws a damaged-PDF exception if the three tokens do not have that form or if
// the object id is 0.
QPDFObjGen
Objects::read_object_start(qpdf_offset_t offset)
{
    auto bad_header = [this, offset]() { return qpdf.damagedPDF(offset, "expected n n obj"); };

    m->file->seek(offset, SEEK_SET);

    // The tokens are read and checked one at a time so that reading stops at the first mismatch.
    QPDFTokenizer::Token id_token = readToken(*m->file);
    if (!id_token.isInteger()) {
        throw bad_header();
    }
    QPDFTokenizer::Token gen_token = readToken(*m->file);
    if (!gen_token.isInteger()) {
        throw bad_header();
    }
    QPDFTokenizer::Token keyword = readToken(*m->file);
    if (!keyword.isWord("obj")) {
        throw bad_header();
    }

    int const objid = QUtil::string_to_int(id_token.getValue().c_str());
    int const generation = QUtil::string_to_int(gen_token.getValue().c_str());
    if (objid == 0) {
        throw qpdf.damagedPDF(offset, "object with ID 0");
    }
    return {objid, generation};
}
1433
1434
void
1435
Objects::readObjectAtOffset(
1436
    bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og)
1437
0
{
1438
0
    QPDFObjGen og;
1439
0
    setLastObjectDescription(description, exp_og);
1440
1441
0
    if (!m->attempt_recovery) {
1442
0
        try_recovery = false;
1443
0
    }
1444
1445
    // Special case: if offset is 0, just return null.  Some PDF writers, in particular
1446
    // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
1447
    // "0000000000 00000 n", which is not correct, but it won't hurt anything for us to ignore
1448
    // these.
1449
0
    if (offset == 0) {
1450
0
        qpdf.warn(qpdf.damagedPDF(-1, "object has offset 0"));
1451
0
        return;
1452
0
    }
1453
1454
0
    try {
1455
0
        og = read_object_start(offset);
1456
0
        if (exp_og != og) {
1457
0
            QPDFExc e = qpdf.damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1458
0
            if (try_recovery) {
1459
                // Will be retried below
1460
0
                throw e;
1461
0
            } else {
1462
                // We can try reading the object anyway even if the ID doesn't match.
1463
0
                qpdf.warn(e);
1464
0
            }
1465
0
        }
1466
0
    } catch (QPDFExc& e) {
1467
0
        if (!try_recovery) {
1468
0
            throw;
1469
0
        }
1470
        // Try again after reconstructing xref table
1471
0
        reconstruct_xref(e);
1472
0
        if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
1473
0
            qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1474
0
            readObjectAtOffset(false, new_offset, description, exp_og);
1475
0
            return;
1476
0
        }
1477
0
        qpdf.warn(qpdf.damagedPDF(
1478
0
            "",
1479
0
            -1,
1480
0
            ("object " + exp_og.unparse(' ') +
1481
0
             " not found in file after regenerating cross reference table")));
1482
0
        return;
1483
0
    }
1484
1485
0
    QPDFObjectHandle oh = readObject(description, og);
1486
1487
    // Determine the end offset of this object before and after white space.  We use these
1488
    // numbers to validate linearization hint tables.  Offsets and lengths of objects may imply
1489
    // the end of an object to be anywhere between these values.
1490
0
    qpdf_offset_t end_before_space = m->file->tell();
1491
1492
    // skip over spaces
1493
0
    while (true) {
1494
0
        char ch;
1495
0
        if (!m->file->read(&ch, 1)) {
1496
0
            throw qpdf.damagedPDF(m->file->tell(), "EOF after endobj");
1497
0
        }
1498
0
        if (!isspace(static_cast<unsigned char>(ch))) {
1499
0
            m->file->seek(-1, SEEK_CUR);
1500
0
            break;
1501
0
        }
1502
0
    }
1503
0
    m->objects.updateCache(og, oh.getObj(), end_before_space, m->file->tell());
1504
0
}
1505
1506
// Read whatever object starts at offset and return it. The object is also stored in the object
// cache, unless it is already resolved or skip_cache_if_in_xref is set and the xref table
// already has an entry for it.
QPDFObjectHandle
Objects::readObjectAtOffset(
    qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref)
{
    auto og = read_object_start(offset);
    auto oh = readObject(description, og);

    if (!m->objects.isUnresolved(og)) {
        return oh;
    }

    if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
        // In the special case of the xref stream and linearization hint tables, the offset comes
        // from another source. For the specific case of xref streams, the xref stream is read and
        // loaded into the object cache very early in parsing. Ordinarily, when a file is updated
        // by appending, items inserted into the xref table in later updates take precedence over
        // earlier items. In the special case of reusing the object number previously used as the
        // xref stream, we have the following order of events:
        //
        // * reused object gets loaded into the xref table
        // * old object is read here while reading xref streams
        // * original xref entry is ignored (since already in xref table)
        //
        // It is the second step that causes a problem. Even though the xref table is correct in
        // this case, the old object is already in the cache and so effectively prevails over the
        // reused object. To work around this issue, we have a special case for the xref stream
        // (via skip_cache_if_in_xref): if the object is already in the xref table, don't cache
        // what we read here.
        //
        // It is likely that the same bug may exist for linearization hint tables, but the existing
        // code uses end_before_space and end_after_space from the cache, so fixing that would
        // require more significant rework. The chances of a linearization hint stream being
        // reused seems smaller because the xref stream is probably the highest object in the file
        // and the linearization hint stream would be some random place in the middle, so that bug
        // is left unfixed for now.
        QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
        return oh;
    }

    // Determine the end offset of this object before and after white space.  We use these
    // numbers to validate linearization hint tables.  Offsets and lengths of objects may imply
    // the end of an object to be anywhere between these values.
    qpdf_offset_t const end_before_space = m->file->tell();

    // Skip over spaces, then step back onto the first non-space character.
    char ch = 0;
    do {
        if (!m->file->read(&ch, 1)) {
            throw qpdf.damagedPDF(m->file->tell(), "EOF after endobj");
        }
    } while (isspace(static_cast<unsigned char>(ch)));
    m->file->seek(-1, SEEK_CUR);

    m->objects.updateCache(og, oh.getObj(), end_before_space, m->file->tell());

    return oh;
}
1566
1567
std::shared_ptr<QPDFObject> const&
1568
Objects::resolve(QPDFObjGen og)
1569
0
{
1570
0
    if (!isUnresolved(og)) {
1571
0
        return m->obj_cache[og].object;
1572
0
    }
1573
1574
0
    if (m->resolving.contains(og)) {
1575
        // This can happen if an object references itself directly or indirectly in some key that
1576
        // has to be resolved during object parsing, such as stream length.
1577
0
        qpdf.warn(qpdf.damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
1578
0
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1579
0
        return m->obj_cache[og].object;
1580
0
    }
1581
0
    ResolveRecorder rr(qpdf, og);
1582
1583
0
    if (m->xref_table.contains(og)) {
1584
0
        QPDFXRefEntry const& entry = m->xref_table[og];
1585
0
        try {
1586
0
            switch (entry.getType()) {
1587
0
            case 1:
1588
                // Object stored in cache by readObjectAtOffset
1589
0
                readObjectAtOffset(true, entry.getOffset(), "", og);
1590
0
                break;
1591
1592
0
            case 2:
1593
0
                resolveObjectsInStream(entry.getObjStreamNumber());
1594
0
                break;
1595
1596
0
            default:
1597
0
                throw qpdf.damagedPDF(
1598
0
                    "", -1, ("object " + og.unparse('/') + " has unexpected xref entry type"));
1599
0
            }
1600
0
        } catch (QPDFExc& e) {
1601
0
            qpdf.warn(e);
1602
0
        } catch (std::exception& e) {
1603
0
            qpdf.warn(qpdf.damagedPDF(
1604
0
                "", -1, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
1605
0
        }
1606
0
    }
1607
1608
0
    if (isUnresolved(og)) {
1609
        // PDF spec says unknown objects resolve to the null object.
1610
0
        updateCache(og, QPDFObject::create<QPDF_Null>(), -1, -1);
1611
0
    }
1612
1613
0
    auto& result(m->obj_cache[og].object);
1614
0
    result->setDefaultDescription(&qpdf, og);
1615
0
    return result;
1616
0
}
1617
1618
void
1619
Objects::resolveObjectsInStream(int obj_stream_number)
1620
0
{
1621
0
    auto damaged =
1622
0
        [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
1623
0
        return {
1624
0
            qpdf_e_damaged_pdf,
1625
0
            m->file->getName() + " object stream " + std::to_string(obj_stream_number),
1626
0
            +"object " + std::to_string(id) + " 0",
1627
0
            offset,
1628
0
            msg,
1629
0
            true};
1630
0
    };
1631
1632
0
    if (m->resolved_object_streams.contains(obj_stream_number)) {
1633
0
        return;
1634
0
    }
1635
0
    m->resolved_object_streams.insert(obj_stream_number);
1636
    // Force resolution of object stream
1637
0
    Stream obj_stream = qpdf.getObject(obj_stream_number, 0);
1638
0
    if (!obj_stream) {
1639
0
        throw qpdf.damagedPDF(
1640
0
            "object " + std::to_string(obj_stream_number) + " 0",
1641
0
            "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1642
0
    }
1643
1644
    // For linearization data in the object, use the data from the object stream for the objects in
1645
    // the stream.
1646
0
    QPDFObjGen stream_og(obj_stream_number, 0);
1647
0
    qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1648
0
    qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1649
1650
0
    QPDFObjectHandle dict = obj_stream.getDict();
1651
0
    if (!dict.isDictionaryOfType("/ObjStm")) {
1652
0
        qpdf.warn(qpdf.damagedPDF(
1653
0
            "object " + std::to_string(obj_stream_number) + " 0",
1654
0
            "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1655
0
    }
1656
1657
0
    unsigned int n{0};
1658
0
    int first{0};
1659
0
    if (!(dict.getKey("/N").getValueAsUInt(n) && dict.getKey("/First").getValueAsInt(first))) {
1660
0
        throw qpdf.damagedPDF(
1661
0
            "object " + std::to_string(obj_stream_number) + " 0",
1662
0
            "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1663
0
    }
1664
1665
    // id, offset, size
1666
0
    std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1667
1668
0
    auto stream_data = obj_stream.getStreamData(qpdf_dl_specialized);
1669
1670
0
    is::OffsetBuffer input("", stream_data);
1671
1672
0
    const auto b_size = stream_data.size();
1673
0
    const auto end_offset = static_cast<qpdf_offset_t>(b_size);
1674
0
    auto b_start = stream_data.data();
1675
1676
0
    if (first >= end_offset) {
1677
0
        throw qpdf.damagedPDF(
1678
0
            "object " + std::to_string(obj_stream_number) + " 0",
1679
0
            "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
1680
0
    }
1681
1682
0
    int id = 0;
1683
0
    long long last_offset = -1;
1684
0
    bool is_first = true;
1685
0
    for (unsigned int i = 0; i < n; ++i) {
1686
0
        auto tnum = readToken(input);
1687
0
        auto id_offset = input.getLastOffset();
1688
0
        auto toffset = readToken(input);
1689
0
        if (!(tnum.isInteger() && toffset.isInteger())) {
1690
0
            throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1691
0
        }
1692
1693
0
        int num = QUtil::string_to_int(tnum.getValue().c_str());
1694
0
        long long offset = QUtil::string_to_int(toffset.getValue().c_str());
1695
1696
0
        if (num == obj_stream_number) {
1697
0
            qpdf.warn(damaged(num, id_offset, "object stream claims to contain itself"));
1698
0
            continue;
1699
0
        }
1700
1701
0
        if (num < 1) {
1702
0
            qpdf.warn(damaged(num, id_offset, "object id is invalid"s));
1703
0
            continue;
1704
0
        }
1705
1706
0
        if (offset <= last_offset) {
1707
0
            qpdf.warn(damaged(
1708
0
                num,
1709
0
                input.getLastOffset(),
1710
0
                "offset " + std::to_string(offset) +
1711
0
                    " is invalid (must be larger than previous offset " +
1712
0
                    std::to_string(last_offset) + ")"));
1713
0
            continue;
1714
0
        }
1715
1716
0
        if (num > m->xref_table_max_id) {
1717
0
            continue;
1718
0
        }
1719
1720
0
        if (first + offset >= end_offset) {
1721
0
            qpdf.warn(damaged(
1722
0
                num, input.getLastOffset(), "offset " + std::to_string(offset) + " is too large"));
1723
0
            continue;
1724
0
        }
1725
1726
0
        if (is_first) {
1727
0
            is_first = false;
1728
0
        } else {
1729
0
            offsets.emplace_back(
1730
0
                id, last_offset + first, static_cast<size_t>(offset - last_offset));
1731
0
        }
1732
1733
0
        last_offset = offset;
1734
0
        id = num;
1735
0
    }
1736
1737
0
    if (!is_first) {
1738
        // We found at least one valid entry.
1739
0
        offsets.emplace_back(
1740
0
            id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1741
0
    }
1742
1743
    // To avoid having to read the object stream multiple times, store all objects that would be
1744
    // found here in the cache.  Remember that some objects stored here might have been overridden
1745
    // by new objects appended to the file, so it is necessary to recheck the xref table and only
1746
    // cache what would actually be resolved here.
1747
0
    for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
1748
0
        QPDFObjGen og(obj_id, 0);
1749
0
        auto entry = m->xref_table.find(og);
1750
0
        if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1751
0
            entry->second.getObjStreamNumber() == obj_stream_number) {
1752
0
            is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset);
1753
0
            auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1754
0
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
1755
0
        } else {
1756
0
            QTC::TC("qpdf", "QPDF not caching overridden objstm object");
1757
0
        }
1758
0
    }
1759
0
}
1760
1761
// Wrap obj in a QPDFObjectHandle after giving it the default description for og in this QPDF.
QPDFObjectHandle
Objects::newIndirect(QPDFObjGen og, std::shared_ptr<QPDFObject> const& obj)
{
    obj->setDefaultDescription(&qpdf, og);
    return QPDFObjectHandle{obj};
}
1767
1768
void
1769
Objects::updateCache(
1770
    QPDFObjGen og,
1771
    std::shared_ptr<QPDFObject> const& object,
1772
    qpdf_offset_t end_before_space,
1773
    qpdf_offset_t end_after_space,
1774
    bool destroy)
1775
20.6k
{
1776
20.6k
    object->setObjGen(&qpdf, og);
1777
20.6k
    if (isCached(og)) {
1778
20.6k
        auto& cache = m->obj_cache[og];
1779
20.6k
        object->move_to(cache.object, destroy);
1780
20.6k
        cache.end_before_space = end_before_space;
1781
20.6k
        cache.end_after_space = end_after_space;
1782
20.6k
    } else {
1783
0
        m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
1784
0
    }
1785
20.6k
}
1786
1787
bool
1788
Objects::isCached(QPDFObjGen og)
1789
20.6k
{
1790
20.6k
    return m->obj_cache.contains(og);
1791
20.6k
}
1792
1793
bool
1794
Objects::isUnresolved(QPDFObjGen og)
1795
0
{
1796
0
    return !isCached(og) || m->obj_cache[og].object->isUnresolved();
1797
0
}
1798
1799
// Return the object id one past qpdf.getObjectCount(), with generation 0.
// Throws std::range_error if the current count is already the largest int.
QPDFObjGen
Objects::nextObjGen()
{
    int const highest_id = toI(qpdf.getObjectCount());
    if (highest_id == std::numeric_limits<int>::max()) {
        throw std::range_error("max object id is too high to create new objects");
    }
    return QPDFObjGen(highest_id + 1, 0);
}
1808
1809
// Store obj in the cache under the next free object id and return an indirect handle to it.
// The new cache entry has both end offsets set to -1.
QPDFObjectHandle
Objects::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
{
    QPDFObjGen new_og{nextObjGen()};
    auto& slot = m->obj_cache[new_og];
    slot = ObjCache(obj, -1, -1);
    return newIndirect(new_og, slot.object);
}
1816
1817
// Make the object held by oh an indirect object of this QPDF and return a handle to it.
// Throws std::logic_error if oh is uninitialized.
QPDFObjectHandle
QPDF::makeIndirectObject(QPDFObjectHandle oh)
{
    if (oh) {
        return m->objects.makeIndirectFromQPDFObject(oh.getObj());
    }
    throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
}
1825
1826
std::shared_ptr<QPDFObject>
1827
Objects::getObjectForParser(int id, int gen, bool parse_pdf)
1828
0
{
1829
    // This method is called by the parser and therefore must not resolve any objects.
1830
0
    auto og = QPDFObjGen(id, gen);
1831
0
    if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
1832
0
        return iter->second.object;
1833
0
    }
1834
0
    if (m->xref_table.contains(og) || (!m->parsed && og.getObj() < m->xref_table_max_id)) {
1835
0
        return m->obj_cache.insert({og, QPDFObject::create<QPDF_Unresolved>(&qpdf, og)})
1836
0
            .first->second.object;
1837
0
    }
1838
0
    if (parse_pdf) {
1839
0
        return QPDFObject::create<QPDF_Null>();
1840
0
    }
1841
0
    return m->obj_cache.insert({og, QPDFObject::create<QPDF_Null>(&qpdf, og)}).first->second.object;
1842
0
}
1843
1844
std::shared_ptr<QPDFObject>
1845
Objects::getObjectForJSON(int id, int gen)
1846
650k
{
1847
650k
    auto og = QPDFObjGen(id, gen);
1848
650k
    auto [it, inserted] = m->obj_cache.try_emplace(og);
1849
650k
    auto& obj = it->second.object;
1850
650k
    if (inserted) {
1851
14.2k
        obj = (m->parsed && !m->xref_table.contains(og))
1852
14.2k
            ? QPDFObject::create<QPDF_Null>(&qpdf, og)
1853
14.2k
            : QPDFObject::create<QPDF_Unresolved>(&qpdf, og);
1854
14.2k
    }
1855
650k
    return obj;
1856
650k
}
1857
1858
// Return a handle for og without reading the object from the input.
//
// A cached object is returned as is. If the file has been parsed and og is not in the xref
// table, the result is a null that is not added to the cache. In every other case an unresolved
// placeholder is cached, with both end offsets set to -1, and returned.
QPDFObjectHandle
QPDF::getObject(QPDFObjGen og)
{
    auto const cached = m->obj_cache.find(og);
    if (cached != m->obj_cache.end()) {
        return {cached->second.object};
    }
    if (m->parsed && !m->xref_table.contains(og)) {
        return QPDFObject::create<QPDF_Null>();
    }
    auto const emplaced =
        m->obj_cache.try_emplace(og, QPDFObject::create<QPDF_Unresolved>(this, og), -1, -1);
    return {emplaced.first->second.object};
}
1871
1872
void
1873
QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
1874
0
{
1875
0
    replaceObject(QPDFObjGen(objid, generation), oh);
1876
0
}
1877
1878
void
1879
QPDF::replaceObject(QPDFObjGen og, QPDFObjectHandle oh)
1880
20.6k
{
1881
20.6k
    if (!oh || (oh.isIndirect() && !(oh.isStream() && oh.getObjGen() == og))) {
1882
0
        throw std::logic_error("QPDF::replaceObject called with indirect object handle");
1883
0
    }
1884
20.6k
    m->objects.updateCache(og, oh.getObj(), -1, -1, false);
1885
20.6k
}
1886
1887
void
1888
QPDF::removeObject(QPDFObjGen og)
1889
0
{
1890
0
    m->xref_table.erase(og);
1891
0
    if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
1892
        // Take care of any object handles that may be floating around.
1893
0
        cached->second.object->assign_null();
1894
0
        cached->second.object->setObjGen(nullptr, QPDFObjGen());
1895
0
        m->obj_cache.erase(cached);
1896
0
    }
1897
0
}
1898
1899
void
1900
QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
1901
0
{
1902
0
    QTC::TC("qpdf", "QPDF replaceReserved");
1903
0
    auto tc = reserved.getTypeCode();
1904
0
    if (!(tc == ::ot_reserved || tc == ::ot_null)) {
1905
0
        throw std::logic_error("replaceReserved called with non-reserved object");
1906
0
    }
1907
0
    replaceObject(reserved.getObjGen(), replacement);
1908
0
}
1909
1910
void
1911
QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
1912
0
{
1913
0
    swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
1914
0
}
1915
1916
void
1917
QPDF::swapObjects(QPDFObjGen og1, QPDFObjGen og2)
1918
0
{
1919
    // Force objects to be read from the input source if needed, then swap them in the cache.
1920
0
    m->objects.resolve(og1);
1921
0
    m->objects.resolve(og2);
1922
0
    m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
1923
0
}
1924
1925
size_t
1926
Objects::tableSize()
1927
0
{
1928
    // If obj_cache is dense, accommodate all object in tables,else accommodate only original
1929
    // objects.
1930
0
    auto max_xref = !m->xref_table.empty() ? m->xref_table.crbegin()->first.getObj() : 0;
1931
0
    auto max_obj = !m->obj_cache.empty() ? m->obj_cache.crbegin()->first.getObj() : 0;
1932
0
    auto max_id = std::numeric_limits<int>::max() - 1;
1933
0
    if (max_obj >= max_id || max_xref >= max_id) {
1934
        // Temporary fix. Long-term solution is
1935
        // - QPDFObjGen to enforce objgens are valid and sensible
1936
        // - xref table and obj cache to protect against insertion of impossibly large obj ids
1937
0
        qpdf.stopOnError("Impossibly large object id encountered.");
1938
0
    }
1939
0
    if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
1940
0
        return toS(++max_obj);
1941
0
    }
1942
0
    return toS(++max_xref);
1943
0
}
1944
1945
std::vector<QPDFObjGen>
1946
Objects::getCompressibleObjVector()
1947
0
{
1948
0
    return getCompressibleObjGens<QPDFObjGen>();
1949
0
}
1950
1951
std::vector<bool>
1952
Objects::getCompressibleObjSet()
1953
0
{
1954
0
    return getCompressibleObjGens<bool>();
1955
0
}
1956
1957
template <typename T>
1958
std::vector<T>
1959
Objects::getCompressibleObjGens()
1960
0
{
1961
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
1962
    // by traversing the document from the root, including a traversal of the pages tree.  This
1963
    // makes that objects that are on the same page are more likely to be in the same object stream,
1964
    // which is slightly more efficient, particularly with linearized files.  This is better than
1965
    // iterating through the xref table since it avoids preserving orphaned items.
1966
1967
    // Exclude encryption dictionary, if any
1968
0
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
1969
0
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
1970
1971
0
    const size_t max_obj = qpdf.getObjectCount();
1972
0
    std::vector<bool> visited(max_obj, false);
1973
0
    std::vector<QPDFObjectHandle> queue;
1974
0
    queue.reserve(512);
1975
0
    queue.emplace_back(m->trailer);
1976
0
    std::vector<T> result;
1977
0
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
1978
0
        result.reserve(m->obj_cache.size());
1979
0
    } else if constexpr (std::is_same_v<T, bool>) {
1980
0
        result.resize(max_obj + 1U, false);
1981
    } else {
1982
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
1983
    }
1984
0
    while (!queue.empty()) {
1985
0
        auto obj = queue.back();
1986
0
        queue.pop_back();
1987
0
        if (obj.getObjectID() > 0) {
1988
0
            QPDFObjGen og = obj.getObjGen();
1989
0
            const size_t id = toS(og.getObj() - 1);
1990
0
            if (id >= max_obj) {
1991
0
                throw std::logic_error(
1992
0
                    "unexpected object id encountered in getCompressibleObjGens");
1993
0
            }
1994
0
            if (visited[id]) {
1995
0
                continue;
1996
0
            }
1997
1998
            // Check whether this is the current object. If not, remove it (which changes it into a
1999
            // direct null and therefore stops us from revisiting it) and move on to the next object
2000
            // in the queue.
2001
0
            auto upper = m->obj_cache.upper_bound(og);
2002
0
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2003
0
                qpdf.removeObject(og);
2004
0
                continue;
2005
0
            }
2006
2007
0
            visited[id] = true;
2008
2009
0
            if (og == encryption_dict_og) {
2010
0
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2011
0
            } else if (!(obj.isStream() ||
2012
0
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2013
0
                          obj.hasKey("/Contents")))) {
2014
0
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2015
0
                    result.push_back(og);
2016
0
                } else if constexpr (std::is_same_v<T, bool>) {
2017
0
                    result[id + 1U] = true;
2018
0
                }
2019
0
            }
2020
0
        }
2021
0
        if (obj.isStream()) {
2022
0
            auto dict = obj.getDict().as_dictionary();
2023
0
            auto end = dict.crend();
2024
0
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2025
0
                std::string const& key = iter->first;
2026
0
                QPDFObjectHandle const& value = iter->second;
2027
0
                if (!value.null()) {
2028
0
                    if (key == "/Length") {
2029
                        // omit stream lengths
2030
0
                        if (value.isIndirect()) {
2031
0
                            QTC::TC("qpdf", "QPDF exclude indirect length");
2032
0
                        }
2033
0
                    } else {
2034
0
                        queue.emplace_back(value);
2035
0
                    }
2036
0
                }
2037
0
            }
2038
0
        } else if (obj.isDictionary()) {
2039
0
            auto dict = obj.as_dictionary();
2040
0
            auto end = dict.crend();
2041
0
            for (auto iter = dict.crbegin(); iter != end; ++iter) {
2042
0
                if (!iter->second.null()) {
2043
0
                    queue.emplace_back(iter->second);
2044
0
                }
2045
0
            }
2046
0
        } else if (auto items = obj.as_array()) {
2047
0
            queue.insert(queue.end(), items.crbegin(), items.crend());
2048
0
        }
2049
0
    }
2050
2051
0
    return result;
2052
0
}
Unexecuted instantiation: std::__1::vector<QPDFObjGen, std::__1::allocator<QPDFObjGen> > QPDF::Doc::Objects::getCompressibleObjGens<QPDFObjGen>()
Unexecuted instantiation: std::__1::vector<bool, std::__1::allocator<bool> > QPDF::Doc::Objects::getCompressibleObjGens<bool>()