Coverage Report

Created: 2024-09-08 06:06

/src/qpdf/libqpdf/QPDF.cc
Line
Count
Source (jump to first uncovered line)
1
#include <qpdf/qpdf-config.h> // include first for large file support
2
3
#include <qpdf/QPDF.hh>
4
5
#include <array>
6
#include <atomic>
7
#include <cstring>
8
#include <limits>
9
#include <map>
10
#include <regex>
11
#include <sstream>
12
#include <vector>
13
14
#include <qpdf/BufferInputSource.hh>
15
#include <qpdf/FileInputSource.hh>
16
#include <qpdf/OffsetInputSource.hh>
17
#include <qpdf/Pipeline.hh>
18
#include <qpdf/QPDFExc.hh>
19
#include <qpdf/QPDFLogger.hh>
20
#include <qpdf/QPDFObject_private.hh>
21
#include <qpdf/QPDFParser.hh>
22
#include <qpdf/QPDF_Array.hh>
23
#include <qpdf/QPDF_Dictionary.hh>
24
#include <qpdf/QPDF_Null.hh>
25
#include <qpdf/QPDF_Reserved.hh>
26
#include <qpdf/QPDF_Stream.hh>
27
#include <qpdf/QPDF_Unresolved.hh>
28
#include <qpdf/QTC.hh>
29
#include <qpdf/QUtil.hh>
30
31
// This must be a fixed value. This API returns a const reference to it, and the C API relies on its
32
// being static as well.
33
std::string const QPDF::qpdf_version(QPDF_VERSION);
34
35
// Minimal, well-formed PDF with a catalog and an empty page tree; used by QPDF::emptyPDF(). The
// byte offsets in the xref table (9, 58) and after startxref (110) must match the text exactly, so
// any edit to the literal requires recomputing them. Both the characters and the pointer are const
// so that the template can never be re-pointed at run time.
static char const* const EMPTY_PDF = (
    // force line break
    "%PDF-1.3\n"
    "1 0 obj\n"
    "<< /Type /Catalog /Pages 2 0 R >>\n"
    "endobj\n"
    "2 0 obj\n"
    "<< /Type /Pages /Kids [] /Count 0 >>\n"
    "endobj\n"
    "xref\n"
    "0 3\n"
    "0000000000 65535 f \n"
    "0000000009 00000 n \n"
    "0000000058 00000 n \n"
    "trailer << /Size 3 /Root 1 0 R >>\n"
    "startxref\n"
    "110\n"
    "%%EOF\n");
53
54
namespace
55
{
56
    class InvalidInputSource: public InputSource
57
    {
58
      public:
59
        ~InvalidInputSource() override = default;
60
        qpdf_offset_t
61
        findAndSkipNextEOL() override
62
0
        {
63
0
            throwException();
64
0
            return 0;
65
0
        }
66
        std::string const&
67
        getName() const override
68
0
        {
69
0
            static std::string name("closed input source");
70
0
            return name;
71
0
        }
72
        qpdf_offset_t
73
        tell() override
74
0
        {
75
0
            throwException();
76
0
            return 0;
77
0
        }
78
        void
79
        seek(qpdf_offset_t offset, int whence) override
80
0
        {
81
0
            throwException();
82
0
        }
83
        void
84
        rewind() override
85
0
        {
86
0
            throwException();
87
0
        }
88
        size_t
89
        read(char* buffer, size_t length) override
90
0
        {
91
0
            throwException();
92
0
            return 0;
93
0
        }
94
        void
95
        unreadCh(char ch) override
96
0
        {
97
0
            throwException();
98
0
        }
99
100
      private:
101
        void
102
        throwException()
103
0
        {
104
0
            throw std::logic_error("QPDF operation attempted on a QPDF object with no input "
105
0
                                   "source. QPDF operations are invalid before processFile (or "
106
0
                                   "another process method) or after closeInputSource");
107
0
        }
108
    };
109
} // namespace
110
111
// Plain value holder: stores copies of everything passed in, with no further processing.
QPDF::ForeignStreamData::ForeignStreamData(
    std::shared_ptr<EncryptionParameters> encp,
    std::shared_ptr<InputSource> file,
    QPDFObjGen const& foreign_og,
    qpdf_offset_t offset,
    size_t length,
    QPDFObjectHandle local_dict) :
    encp(encp),             // encryption parameters
    file(file),             // input source
    foreign_og(foreign_og), // object/generation of the foreign stream
    offset(offset),
    length(length),
    local_dict(local_dict)
{
}
126
127
// Binds the provider to the QPDF that receives copied streams. The base class is constructed with
// `true`.
QPDF::CopiedStreamDataProvider::CopiedStreamDataProvider(QPDF& destination_qpdf) :
    QPDFObjectHandle::StreamDataProvider(true),
    destination_qpdf(destination_qpdf)
{
}
132
133
bool
134
QPDF::CopiedStreamDataProvider::provideStreamData(
135
    QPDFObjGen const& og, Pipeline* pipeline, bool suppress_warnings, bool will_retry)
136
0
{
137
0
    std::shared_ptr<ForeignStreamData> foreign_data = foreign_stream_data[og];
138
0
    bool result = false;
139
0
    if (foreign_data.get()) {
140
0
        result = destination_qpdf.pipeForeignStreamData(
141
0
            foreign_data, pipeline, suppress_warnings, will_retry);
142
0
        QTC::TC("qpdf", "QPDF copy foreign with data", result ? 0 : 1);
143
0
    } else {
144
0
        auto foreign_stream = foreign_streams[og];
145
0
        result = foreign_stream.pipeStreamData(
146
0
            pipeline, nullptr, 0, qpdf_dl_none, suppress_warnings, will_retry);
147
0
        QTC::TC("qpdf", "QPDF copy foreign with foreign_stream", result ? 0 : 1);
148
0
    }
149
0
    return result;
150
0
}
151
152
void
153
QPDF::CopiedStreamDataProvider::registerForeignStream(
154
    QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream)
155
0
{
156
0
    this->foreign_streams[local_og] = foreign_stream;
157
0
}
158
159
void
160
QPDF::CopiedStreamDataProvider::registerForeignStream(
161
    QPDFObjGen const& local_og, std::shared_ptr<ForeignStreamData> foreign_stream)
162
0
{
163
0
    this->foreign_stream_data[local_og] = foreign_stream;
164
0
}
165
166
// Remembers which QPDF and which object/generation to use when decryptString() is called.
QPDF::StringDecrypter::StringDecrypter(QPDF* qpdf, QPDFObjGen const& og) :
    qpdf(qpdf),
    og(og)
{
}
171
172
void
173
QPDF::StringDecrypter::decryptString(std::string& val)
174
32.3k
{
175
32.3k
    qpdf->decryptString(val, og);
176
32.3k
}
177
178
std::string const&
179
QPDF::QPDFVersion()
180
0
{
181
    // The C API relies on this being a static value.
182
0
    return QPDF::qpdf_version;
183
0
}
184
185
// Initial state: not encrypted, encryption not yet initialized, V and R zero, metadata encryption
// on, all three crypt filters set to e_none, and neither password matched.
QPDF::EncryptionParameters::EncryptionParameters() :
    encrypted(false),
    encryption_initialized(false),
    encryption_V(0),
    encryption_R(0),
    encrypt_metadata(true),
    cf_stream(e_none),
    cf_string(e_none),
    cf_file(e_none),
    user_password_matched(false),
    owner_password_matched(false)
{
}
198
199
// A fresh Members uses the default logger, an InvalidInputSource as its input (so any attempt to
// use the input throws), and default encryption parameters.
QPDF::Members::Members() :
    log(QPDFLogger::defaultLogger()),
    file(new InvalidInputSource()),
    encp(new EncryptionParameters)
{
}
205
206
// Creates a QPDF with no input source attached and assigns it a process-wide unique id.
QPDF::QPDF() :
    m(new Members())
{
    m->tokenizer.allowEOF();

    // The id only needs to be unique among all QPDF objects allocated during the lifetime of this
    // running application, so an atomic counter shared by all instances is enough.
    static std::atomic<unsigned long long> next_id{0};
    m->unique_id = next_id++;
}
215
216
QPDF::~QPDF()
{
    // Objects that refer to each other (each holding an array or dictionary with an indirect
    // reference to the other) form std::shared_ptr cycles that would never be released. To break
    // them, visit every object in the object cache -- i.e. every object read from the file -- and
    // replace resolved indirect references with an internal "destroyed" object type. This is only
    // legal here, when the QPDF is no longer active. Disconnecting also makes every direct
    // QPDFObjectHandle reachable from an object drop its association with this QPDF; direct
    // objects themselves are not destroyed because they can safely be moved to other QPDF objects.

    // Nobody should still be using this QPDF, but clear the xref table explicitly anyway so that
    // resolve() cannot possibly succeed from here on.
    m->xref_table.clear();

    for (auto const& cached: m->obj_cache) {
        auto const& obj = cached.second.object;
        obj->disconnect();
        if (obj->getTypeCode() == ::ot_null) {
            continue;
        }
        obj->destroy();
    }
}
238
239
std::shared_ptr<QPDF>
240
QPDF::create()
241
89.1k
{
242
89.1k
    return std::make_shared<QPDF>();
243
89.1k
}
244
245
void
246
QPDF::processFile(char const* filename, char const* password)
247
0
{
248
0
    auto* fi = new FileInputSource(filename);
249
0
    processInputSource(std::shared_ptr<InputSource>(fi), password);
250
0
}
251
252
void
253
QPDF::processFile(char const* description, FILE* filep, bool close_file, char const* password)
254
0
{
255
0
    auto* fi = new FileInputSource(description, filep, close_file);
256
0
    processInputSource(std::shared_ptr<InputSource>(fi), password);
257
0
}
258
259
void
260
QPDF::processMemoryFile(
261
    char const* description, char const* buf, size_t length, char const* password)
262
13.2k
{
263
13.2k
    processInputSource(
264
13.2k
        std::shared_ptr<InputSource>(
265
            // line-break
266
13.2k
            new BufferInputSource(
267
13.2k
                description, new Buffer(QUtil::unsigned_char_pointer(buf), length), true)),
268
13.2k
        password);
269
13.2k
}
270
271
void
272
QPDF::processInputSource(std::shared_ptr<InputSource> source, char const* password)
273
102k
{
274
102k
    m->file = source;
275
102k
    parse(password);
276
102k
}
277
278
void
279
QPDF::closeInputSource()
280
0
{
281
0
    m->file = std::shared_ptr<InputSource>(new InvalidInputSource());
282
0
}
283
284
void
285
QPDF::setPasswordIsHexKey(bool val)
286
0
{
287
0
    m->provided_password_is_hex_key = val;
288
0
}
289
290
void
291
QPDF::emptyPDF()
292
0
{
293
0
    processMemoryFile("empty PDF", EMPTY_PDF, strlen(EMPTY_PDF));
294
0
}
295
296
void
297
QPDF::registerStreamFilter(
298
    std::string const& filter_name, std::function<std::shared_ptr<QPDFStreamFilter>()> factory)
299
0
{
300
0
    QPDF_Stream::registerStreamFilter(filter_name, factory);
301
0
}
302
303
void
304
QPDF::setIgnoreXRefStreams(bool val)
305
0
{
306
0
    m->ignore_xref_streams = val;
307
0
}
308
309
std::shared_ptr<QPDFLogger>
310
QPDF::getLogger()
311
0
{
312
0
    return m->log;
313
0
}
314
315
void
316
QPDF::setLogger(std::shared_ptr<QPDFLogger> l)
317
0
{
318
0
    m->log = l;
319
0
}
320
321
void
322
QPDF::setOutputStreams(std::ostream* out, std::ostream* err)
323
0
{
324
0
    setLogger(QPDFLogger::create());
325
0
    m->log->setOutputStreams(out, err);
326
0
}
327
328
void
329
QPDF::setSuppressWarnings(bool val)
330
0
{
331
0
    m->suppress_warnings = val;
332
0
}
333
334
void
335
QPDF::setMaxWarnings(size_t val)
336
102k
{
337
102k
    m->max_warnings = val;
338
102k
}
339
340
void
341
QPDF::setAttemptRecovery(bool val)
342
0
{
343
0
    m->attempt_recovery = val;
344
0
}
345
346
void
347
QPDF::setImmediateCopyFrom(bool val)
348
0
{
349
0
    m->immediate_copy_from = val;
350
0
}
351
352
std::vector<QPDFExc>
353
QPDF::getWarnings()
354
0
{
355
0
    std::vector<QPDFExc> result = m->warnings;
356
0
    m->warnings.clear();
357
0
    return result;
358
0
}
359
360
bool
361
QPDF::anyWarnings() const
362
0
{
363
0
    return !m->warnings.empty();
364
0
}
365
366
size_t
367
QPDF::numWarnings() const
368
0
{
369
0
    return m->warnings.size();
370
0
}
371
372
bool
373
QPDF::validatePDFVersion(char const*& p, std::string& version)
374
85.2k
{
375
85.2k
    bool valid = QUtil::is_digit(*p);
376
85.2k
    if (valid) {
377
174k
        while (QUtil::is_digit(*p)) {
378
101k
            version.append(1, *p++);
379
101k
        }
380
72.9k
        if ((*p == '.') && QUtil::is_digit(*(p + 1))) {
381
63.8k
            version.append(1, *p++);
382
167k
            while (QUtil::is_digit(*p)) {
383
103k
                version.append(1, *p++);
384
103k
            }
385
63.8k
        } else {
386
9.09k
            valid = false;
387
9.09k
        }
388
72.9k
    }
389
85.2k
    return valid;
390
85.2k
}
391
392
bool
393
QPDF::findHeader()
394
84.9k
{
395
84.9k
    qpdf_offset_t global_offset = m->file->tell();
396
84.9k
    std::string line = m->file->readLine(1024);
397
84.9k
    char const* p = line.c_str();
398
84.9k
    if (strncmp(p, "%PDF-", 5) != 0) {
399
0
        throw std::logic_error("findHeader is not looking at %PDF-");
400
0
    }
401
84.9k
    p += 5;
402
84.9k
    std::string version;
403
    // Note: The string returned by line.c_str() is always null-terminated. The code below never
404
    // overruns the buffer because a null character always short-circuits further advancement.
405
84.9k
    bool valid = validatePDFVersion(p, version);
406
84.9k
    if (valid) {
407
63.6k
        m->pdf_version = version;
408
63.6k
        if (global_offset != 0) {
409
            // Empirical evidence strongly suggests that when there is leading material prior to the
410
            // PDF header, all explicit offsets in the file are such that 0 points to the beginning
411
            // of the header.
412
15.5k
            QTC::TC("qpdf", "QPDF global offset");
413
15.5k
            m->file = std::shared_ptr<InputSource>(new OffsetInputSource(m->file, global_offset));
414
15.5k
        }
415
63.6k
    }
416
84.9k
    return valid;
417
84.9k
}
418
419
bool
420
QPDF::findStartxref()
421
71.9k
{
422
71.9k
    if (readToken(m->file).isWord("startxref") && readToken(m->file).isInteger()) {
423
        // Position in front of offset token
424
60.6k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
425
60.6k
        return true;
426
60.6k
    }
427
11.3k
    return false;
428
71.9k
}
429
430
void
431
QPDF::parse(char const* password)
432
102k
{
433
102k
    if (password) {
434
0
        m->encp->provided_password = password;
435
0
    }
436
437
    // Find the header anywhere in the first 1024 bytes of the file.
438
102k
    PatternFinder hf(*this, &QPDF::findHeader);
439
102k
    if (!m->file->findFirst("%PDF-", 0, 1024, hf)) {
440
38.7k
        QTC::TC("qpdf", "QPDF not a pdf file");
441
38.7k
        warn(damagedPDF("", 0, "can't find PDF header"));
442
        // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode
443
38.7k
        m->pdf_version = "1.2";
444
38.7k
    }
445
446
    // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file.  We add an extra
447
    // 30 characters to leave room for the startxref stuff.
448
102k
    m->file->seek(0, SEEK_END);
449
102k
    qpdf_offset_t end_offset = m->file->tell();
450
102k
    m->xref_table_max_offset = end_offset;
451
    // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
452
    // scenarios at least 3 bytes are required.
453
102k
    if (m->xref_table_max_id > m->xref_table_max_offset / 3) {
454
102k
        m->xref_table_max_id = static_cast<int>(m->xref_table_max_offset / 3);
455
102k
    }
456
102k
    qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
457
102k
    PatternFinder sf(*this, &QPDF::findStartxref);
458
102k
    qpdf_offset_t xref_offset = 0;
459
102k
    if (m->file->findLast("startxref", start_offset, 0, sf)) {
460
55.4k
        xref_offset = QUtil::string_to_ll(readToken(m->file).getValue().c_str());
461
55.4k
    }
462
463
102k
    try {
464
102k
        if (xref_offset == 0) {
465
47.5k
            QTC::TC("qpdf", "QPDF can't find startxref");
466
47.5k
            throw damagedPDF("", 0, "can't find startxref");
467
47.5k
        }
468
54.8k
        try {
469
54.8k
            read_xref(xref_offset);
470
54.8k
        } catch (QPDFExc&) {
471
38.2k
            throw;
472
38.2k
        } catch (std::exception& e) {
473
1.14k
            throw damagedPDF("", 0, std::string("error reading xref: ") + e.what());
474
1.14k
        }
475
86.9k
    } catch (QPDFExc& e) {
476
86.9k
        if (m->attempt_recovery) {
477
86.9k
            reconstruct_xref(e);
478
86.9k
            QTC::TC("qpdf", "QPDF reconstructed xref table");
479
86.9k
        } else {
480
0
            throw;
481
0
        }
482
86.9k
    }
483
484
86.9k
    initializeEncryption();
485
56.7k
    m->parsed = true;
486
56.7k
    if (m->xref_table.size() > 0 && !getRoot().getKey("/Pages").isDictionary()) {
487
        // QPDFs created from JSON have an empty xref table and no root object yet.
488
135
        throw damagedPDF("", 0, "unable to find page tree");
489
135
    }
490
56.7k
}
491
492
void
493
QPDF::inParse(bool v)
494
10.2M
{
495
10.2M
    if (m->in_parse == v) {
496
        // This happens if QPDFParser::parse tries to resolve an indirect object while it is
497
        // parsing.
498
0
        throw std::logic_error("QPDF: re-entrant parsing detected. This is a qpdf bug."
499
0
                               " Please report at https://github.com/qpdf/qpdf/issues.");
500
0
    }
501
10.2M
    m->in_parse = v;
502
10.2M
}
503
504
void
505
QPDF::warn(QPDFExc const& e)
506
5.58M
{
507
5.58M
    if (m->max_warnings > 0 && m->warnings.size() >= m->max_warnings) {
508
45.2k
        stopOnError("Too many warnings - file is too badly damaged");
509
45.2k
    }
510
5.58M
    m->warnings.push_back(e);
511
5.58M
    if (!m->suppress_warnings) {
512
5.53M
        *m->log->getWarn() << "WARNING: " << m->warnings.back().what() << "\n";
513
5.53M
    }
514
5.58M
}
515
516
void
517
QPDF::warn(
518
    qpdf_error_code_e error_code,
519
    std::string const& object,
520
    qpdf_offset_t offset,
521
    std::string const& message)
522
139k
{
523
139k
    warn(QPDFExc(error_code, getFilename(), object, offset, message));
524
139k
}
525
526
void
527
QPDF::setTrailer(QPDFObjectHandle obj)
528
74.9k
{
529
74.9k
    if (m->trailer.isInitialized()) {
530
1.53k
        return;
531
1.53k
    }
532
73.4k
    m->trailer = obj;
533
73.4k
}
534
535
void
536
QPDF::reconstruct_xref(QPDFExc& e)
537
102k
{
538
102k
    if (m->reconstructed_xref) {
539
        // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
540
        // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
541
14.7k
        throw e;
542
14.7k
    }
543
544
    // If recovery generates more than 1000 warnings, the file is so severely damaged that there
545
    // probably is no point trying to continue.
546
87.7k
    const auto max_warnings = m->warnings.size() + 1000U;
547
47.6M
    auto check_warnings = [this, max_warnings]() {
548
47.6M
        if (m->warnings.size() > max_warnings) {
549
0
            throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
550
0
        }
551
47.6M
    };
552
553
87.7k
    m->reconstructed_xref = true;
554
    // We may find more objects, which may contain dangling references.
555
87.7k
    m->fixed_dangling_refs = false;
556
557
87.7k
    warn(damagedPDF("", 0, "file is damaged"));
558
87.7k
    warn(e);
559
87.7k
    warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
560
561
    // Delete all references to type 1 (uncompressed) objects
562
87.7k
    std::set<QPDFObjGen> to_delete;
563
259k
    for (auto const& iter: m->xref_table) {
564
259k
        if (iter.second.getType() == 1) {
565
118k
            to_delete.insert(iter.first);
566
118k
        }
567
259k
    }
568
118k
    for (auto const& iter: to_delete) {
569
118k
        m->xref_table.erase(iter);
570
118k
    }
571
572
87.7k
    m->file->seek(0, SEEK_END);
573
87.7k
    qpdf_offset_t eof = m->file->tell();
574
87.7k
    m->file->seek(0, SEEK_SET);
575
    // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
576
87.7k
    static size_t const MAX_LEN = 10;
577
47.6M
    while (m->file->tell() < eof) {
578
47.5M
        QPDFTokenizer::Token t1 = readToken(m->file, MAX_LEN);
579
47.5M
        qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
580
47.5M
        if (t1.isInteger()) {
581
9.32M
            auto pos = m->file->tell();
582
9.32M
            QPDFTokenizer::Token t2 = readToken(m->file, MAX_LEN);
583
9.32M
            if ((t2.isInteger()) && (readToken(m->file, MAX_LEN).isWord("obj"))) {
584
3.24M
                int obj = QUtil::string_to_int(t1.getValue().c_str());
585
3.24M
                int gen = QUtil::string_to_int(t2.getValue().c_str());
586
3.24M
                if (obj <= m->xref_table_max_id) {
587
3.24M
                    insertReconstructedXrefEntry(obj, token_start, gen);
588
3.24M
                } else {
589
3.79k
                    warn(damagedPDF(
590
3.79k
                        "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
591
3.79k
                }
592
3.24M
            }
593
9.32M
            m->file->seek(pos, SEEK_SET);
594
38.2M
        } else if (!m->trailer.isInitialized() && t1.isWord("trailer")) {
595
109k
            auto pos = m->file->tell();
596
109k
            QPDFObjectHandle t = readTrailer();
597
109k
            if (!t.isDictionary()) {
598
                // Oh well.  It was worth a try.
599
66.6k
            } else {
600
42.7k
                setTrailer(t);
601
42.7k
            }
602
109k
            m->file->seek(pos, SEEK_SET);
603
109k
        }
604
47.5M
        check_warnings();
605
47.5M
        m->file->findAndSkipNextEOL();
606
47.5M
    }
607
87.7k
    m->deleted_objects.clear();
608
609
87.7k
    if (!m->trailer.isInitialized()) {
610
42.7k
        qpdf_offset_t max_offset{0};
611
        // If there are any xref streams, take the last one to appear.
612
356k
        for (auto const& iter: m->xref_table) {
613
356k
            auto entry = iter.second;
614
356k
            if (entry.getType() != 1) {
615
5.34k
                continue;
616
5.34k
            }
617
350k
            auto oh = getObjectByObjGen(iter.first);
618
350k
            try {
619
350k
                if (!oh.isStreamOfType("/XRef")) {
620
318k
                    continue;
621
318k
                }
622
350k
            } catch (std::exception&) {
623
14.6k
                continue;
624
14.6k
            }
625
17.3k
            auto offset = entry.getOffset();
626
17.3k
            if (offset > max_offset) {
627
16.1k
                max_offset = offset;
628
16.1k
                setTrailer(oh.getDict());
629
16.1k
            }
630
17.3k
            check_warnings();
631
17.3k
        }
632
42.7k
        if (max_offset > 0) {
633
14.5k
            try {
634
14.5k
                read_xref(max_offset);
635
14.5k
            } catch (std::exception&) {
636
7.33k
                throw damagedPDF(
637
7.33k
                    "", 0, "error decoding candidate xref stream while recovering damaged file");
638
7.33k
            }
639
7.24k
            QTC::TC("qpdf", "QPDF recover xref stream");
640
7.24k
        }
641
42.7k
    }
642
643
80.4k
    if (!m->trailer.isInitialized()) {
644
        // We could check the last encountered object to see if it was an xref stream.  If so, we
645
        // could try to get the trailer from there.  This may make it possible to recover files with
646
        // bad startxref pointers even when they have object streams.
647
648
28.1k
        throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
649
28.1k
    }
650
52.2k
    if (m->xref_table.empty()) {
651
        // We cannot check for an empty xref table in parse because empty tables are valid when
652
        // creating QPDF objects from JSON.
653
952
        throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
654
952
    }
655
51.3k
    check_warnings();
656
51.3k
    if (!m->parsed) {
657
49.7k
        m->parsed = true;
658
49.7k
        getAllPages();
659
49.7k
        check_warnings();
660
49.7k
        if (m->all_pages.empty()) {
661
829
            m->parsed = false;
662
829
            throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
663
829
        }
664
49.7k
    }
665
    // We could iterate through the objects looking for streams and try to find objects inside of
666
    // them, but it's probably not worth the trouble.  Acrobat can't recover files with any errors
667
    // in an xref stream, and this would be a real long shot anyway.  If we wanted to do anything
668
    // that involved looking at stream contents, we'd also have to call initializeEncryption() here.
669
    // It's safe to call it more than once.
670
51.3k
}
671
672
void
673
QPDF::read_xref(qpdf_offset_t xref_offset)
674
69.3k
{
675
69.3k
    std::map<int, int> free_table;
676
69.3k
    std::set<qpdf_offset_t> visited;
677
141k
    while (xref_offset) {
678
72.4k
        visited.insert(xref_offset);
679
72.4k
        char buf[7];
680
72.4k
        memset(buf, 0, sizeof(buf));
681
72.4k
        m->file->seek(xref_offset, SEEK_SET);
682
        // Some files miss the mark a little with startxref. We could do a better job of searching
683
        // in the neighborhood for something that looks like either an xref table or stream, but the
684
        // simple heuristic of skipping whitespace can help with the xref table case and is harmless
685
        // with the stream case.
686
72.4k
        bool done = false;
687
72.4k
        bool skipped_space = false;
688
2.02M
        while (!done) {
689
1.95M
            char ch;
690
1.95M
            if (1 == m->file->read(&ch, 1)) {
691
1.94M
                if (QUtil::is_space(ch)) {
692
1.88M
                    skipped_space = true;
693
1.88M
                } else {
694
59.7k
                    m->file->unreadCh(ch);
695
59.7k
                    done = true;
696
59.7k
                }
697
1.94M
            } else {
698
11.8k
                QTC::TC("qpdf", "QPDF eof skipping spaces before xref", skipped_space ? 0 : 1);
699
11.8k
                done = true;
700
11.8k
            }
701
1.95M
        }
702
703
72.4k
        m->file->read(buf, sizeof(buf) - 1);
704
        // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
705
        // where it is terminated by arbitrary whitespace.
706
72.4k
        if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
707
21.1k
            if (skipped_space) {
708
1.20k
                QTC::TC("qpdf", "QPDF xref skipped space");
709
1.20k
                warn(damagedPDF("", 0, "extraneous whitespace seen before xref"));
710
1.20k
            }
711
21.1k
            QTC::TC(
712
21.1k
                "qpdf",
713
21.1k
                "QPDF xref space",
714
21.1k
                ((buf[4] == '\n')       ? 0
715
21.1k
                     : (buf[4] == '\r') ? 1
716
5.65k
                     : (buf[4] == ' ')  ? 2
717
1.34k
                                        : 9999));
718
21.1k
            int skip = 4;
719
            // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun.
720
43.7k
            while (QUtil::is_space(buf[skip])) {
721
22.6k
                ++skip;
722
22.6k
            }
723
21.1k
            xref_offset = read_xrefTable(xref_offset + skip);
724
51.2k
        } else {
725
51.2k
            xref_offset = read_xrefStream(xref_offset);
726
51.2k
        }
727
72.4k
        if (visited.count(xref_offset) != 0) {
728
212
            QTC::TC("qpdf", "QPDF xref loop");
729
212
            throw damagedPDF("", 0, "loop detected following xref tables");
730
212
        }
731
72.4k
    }
732
733
69.1k
    if (!m->trailer.isInitialized()) {
734
0
        throw damagedPDF("", 0, "unable to find trailer while reading xref");
735
0
    }
736
69.1k
    int size = m->trailer.getKey("/Size").getIntValueAsInt();
737
69.1k
    int max_obj = 0;
738
69.1k
    if (!m->xref_table.empty()) {
739
8.97k
        max_obj = m->xref_table.rbegin()->first.getObj();
740
8.97k
    }
741
69.1k
    if (!m->deleted_objects.empty()) {
742
18.0k
        max_obj = std::max(max_obj, *(m->deleted_objects.rbegin()));
743
18.0k
    }
744
69.1k
    if ((size < 1) || (size - 1 != max_obj)) {
745
4.36k
        QTC::TC("qpdf", "QPDF xref size mismatch");
746
4.36k
        warn(damagedPDF(
747
4.36k
            "",
748
4.36k
            0,
749
4.36k
            ("reported number of objects (" + std::to_string(size) +
750
4.36k
             ") is not one plus the highest object number (" + std::to_string(max_obj) + ")")));
751
4.36k
    }
752
753
    // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we
754
    // never depend on its being set.
755
69.1k
    m->deleted_objects.clear();
756
757
    // Make sure we keep only the highest generation for any object.
758
69.1k
    QPDFObjGen last_og{-1, 0};
759
958k
    for (auto const& item: m->xref_table) {
760
958k
        auto id = item.first.getObj();
761
958k
        if (id == last_og.getObj() && id > 0) {
762
31.7k
            removeObject(last_og);
763
31.7k
        }
764
958k
        last_og = item.first;
765
958k
    }
766
69.1k
}
767
768
bool
769
QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes)
770
34.6k
{
771
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
772
    // buffer.
773
34.6k
    char const* p = line.c_str();
774
34.6k
    char const* start = line.c_str();
775
776
    // Skip zero or more spaces
777
51.7k
    while (QUtil::is_space(*p)) {
778
17.0k
        ++p;
779
17.0k
    }
780
    // Require digit
781
34.6k
    if (!QUtil::is_digit(*p)) {
782
1.13k
        return false;
783
1.13k
    }
784
    // Gather digits
785
33.4k
    std::string obj_str;
786
115k
    while (QUtil::is_digit(*p)) {
787
81.7k
        obj_str.append(1, *p++);
788
81.7k
    }
789
    // Require space
790
33.4k
    if (!QUtil::is_space(*p)) {
791
221
        return false;
792
221
    }
793
    // Skip spaces
794
90.9k
    while (QUtil::is_space(*p)) {
795
57.6k
        ++p;
796
57.6k
    }
797
    // Require digit
798
33.2k
    if (!QUtil::is_digit(*p)) {
799
542
        return false;
800
542
    }
801
    // Gather digits
802
32.7k
    std::string num_str;
803
102k
    while (QUtil::is_digit(*p)) {
804
69.3k
        num_str.append(1, *p++);
805
69.3k
    }
806
    // Skip any space including line terminators
807
100k
    while (QUtil::is_space(*p)) {
808
68.2k
        ++p;
809
68.2k
    }
810
32.7k
    bytes = toI(p - start);
811
32.7k
    obj = QUtil::string_to_int(obj_str.c_str());
812
32.7k
    num = QUtil::string_to_int(num_str.c_str());
813
32.7k
    return true;
814
33.2k
}
815
816
bool
817
QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
818
24.6k
{
819
    // Reposition after initial read attempt and reread.
820
24.6k
    m->file->seek(m->file->getLastOffset(), SEEK_SET);
821
24.6k
    auto line = m->file->readLine(30);
822
823
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
824
    // buffer.
825
24.6k
    char const* p = line.data();
826
827
    // Skip zero or more spaces. There aren't supposed to be any.
828
24.6k
    bool invalid = false;
829
35.7k
    while (QUtil::is_space(*p)) {
830
11.1k
        ++p;
831
11.1k
        QTC::TC("qpdf", "QPDF ignore first space in xref entry");
832
11.1k
        invalid = true;
833
11.1k
    }
834
    // Require digit
835
24.6k
    if (!QUtil::is_digit(*p)) {
836
269
        return false;
837
269
    }
838
    // Gather digits
839
24.3k
    std::string f1_str;
840
116k
    while (QUtil::is_digit(*p)) {
841
91.6k
        f1_str.append(1, *p++);
842
91.6k
    }
843
    // Require space
844
24.3k
    if (!QUtil::is_space(*p)) {
845
205
        return false;
846
205
    }
847
24.1k
    if (QUtil::is_space(*(p + 1))) {
848
7.02k
        QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
849
7.02k
        invalid = true;
850
7.02k
    }
851
    // Skip spaces
852
65.7k
    while (QUtil::is_space(*p)) {
853
41.6k
        ++p;
854
41.6k
    }
855
    // Require digit
856
24.1k
    if (!QUtil::is_digit(*p)) {
857
414
        return false;
858
414
    }
859
    // Gather digits
860
23.7k
    std::string f2_str;
861
75.2k
    while (QUtil::is_digit(*p)) {
862
51.5k
        f2_str.append(1, *p++);
863
51.5k
    }
864
    // Require space
865
23.7k
    if (!QUtil::is_space(*p)) {
866
372
        return false;
867
372
    }
868
23.3k
    if (QUtil::is_space(*(p + 1))) {
869
9.62k
        QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
870
9.62k
        invalid = true;
871
9.62k
    }
872
    // Skip spaces
873
76.0k
    while (QUtil::is_space(*p)) {
874
52.6k
        ++p;
875
52.6k
    }
876
23.3k
    if ((*p == 'f') || (*p == 'n')) {
877
22.7k
        type = *p;
878
22.7k
    } else {
879
554
        return false;
880
554
    }
881
22.7k
    if ((f1_str.length() != 10) || (f2_str.length() != 5)) {
882
19.1k
        QTC::TC("qpdf", "QPDF ignore length error xref entry");
883
19.1k
        invalid = true;
884
19.1k
    }
885
886
22.7k
    if (invalid) {
887
20.0k
        warn(damagedPDF("xref table", "accepting invalid xref table entry"));
888
20.0k
    }
889
890
22.7k
    f1 = QUtil::string_to_ll(f1_str.c_str());
891
22.7k
    f2 = QUtil::string_to_int(f2_str.c_str());
892
893
22.7k
    return true;
894
23.3k
}
895
896
// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
897
// result.
898
bool
899
QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
900
162k
{
901
162k
    std::array<char, 21> line;
902
162k
    if (m->file->read(line.data(), 20) != 20) {
903
        // C++20: [[unlikely]]
904
525
        return false;
905
525
    }
906
161k
    line[20] = '\0';
907
161k
    char const* p = line.data();
908
909
161k
    int f1_len = 0;
910
161k
    int f2_len = 0;
911
912
    // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
913
    // buffer.
914
915
    // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
916
1.03M
    while (*p == '0') {
917
868k
        ++f1_len;
918
868k
        ++p;
919
868k
    }
920
737k
    while (QUtil::is_digit(*p) && f1_len++ < 10) {
921
576k
        f1 *= 10;
922
576k
        f1 += *p++ - '0';
923
576k
    }
924
    // Require space
925
161k
    if (!QUtil::is_space(*p++)) {
926
        // Entry doesn't start with space or digit.
927
        // C++20: [[unlikely]]
928
815
        return false;
929
815
    }
930
    // Gather digits. NB No risk of overflow as 99'999 < max int.
931
737k
    while (*p == '0') {
932
576k
        ++f2_len;
933
576k
        ++p;
934
576k
    }
935
302k
    while (QUtil::is_digit(*p) && f2_len++ < 5) {
936
141k
        f2 *= 10;
937
141k
        f2 += static_cast<int>(*p++ - '0');
938
141k
    }
939
160k
    if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {
940
        // C++20: [[likely]]
941
145k
        type = *p;
942
        // No test for valid line[19].
943
145k
        if (*(++p) && *(++p) && (*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
944
            // C++20: [[likely]]
945
136k
            return true;
946
136k
        }
947
145k
    }
948
24.6k
    return read_bad_xrefEntry(f1, f2, type);
949
160k
}
950
951
// Read a single cross-reference table section and associated trailer.
952
qpdf_offset_t
953
QPDF::read_xrefTable(qpdf_offset_t xref_offset)
954
21.1k
{
955
21.1k
    std::vector<QPDFObjGen> deleted_items;
956
957
21.1k
    m->file->seek(xref_offset, SEEK_SET);
958
21.1k
    std::string line;
959
34.7k
    while (true) {
960
34.6k
        line.assign(50, '\0');
961
34.6k
        m->file->read(line.data(), line.size());
962
34.6k
        int obj = 0;
963
34.6k
        int num = 0;
964
34.6k
        int bytes = 0;
965
34.6k
        if (!parse_xrefFirst(line, obj, num, bytes)) {
966
1.89k
            QTC::TC("qpdf", "QPDF invalid xref");
967
1.89k
            throw damagedPDF("xref table", "xref syntax invalid");
968
1.89k
        }
969
32.7k
        m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
970
191k
        for (qpdf_offset_t i = obj; i - num < obj; ++i) {
971
162k
            if (i == 0) {
972
                // This is needed by checkLinearization()
973
14.8k
                m->first_xref_item_offset = m->file->tell();
974
14.8k
            }
975
            // For xref_table, these will always be small enough to be ints
976
162k
            qpdf_offset_t f1 = 0;
977
162k
            int f2 = 0;
978
162k
            char type = '\0';
979
162k
            if (!read_xrefEntry(f1, f2, type)) {
980
3.15k
                QTC::TC("qpdf", "QPDF invalid xref entry");
981
3.15k
                throw damagedPDF(
982
3.15k
                    "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
983
3.15k
            }
984
158k
            if (type == 'f') {
985
                // Save deleted items until after we've checked the XRefStm, if any.
986
35.5k
                deleted_items.emplace_back(toI(i), f2);
987
123k
            } else {
988
123k
                insertXrefEntry(toI(i), 1, f1, f2);
989
123k
            }
990
158k
        }
991
29.5k
        qpdf_offset_t pos = m->file->tell();
992
29.5k
        if (readToken(m->file).isWord("trailer")) {
993
15.9k
            break;
994
15.9k
        } else {
995
13.6k
            m->file->seek(pos, SEEK_SET);
996
13.6k
        }
997
29.5k
    }
998
999
    // Set offset to previous xref table if any
1000
16.0k
    QPDFObjectHandle cur_trailer = readTrailer();
1001
16.0k
    if (!cur_trailer.isDictionary()) {
1002
141
        QTC::TC("qpdf", "QPDF missing trailer");
1003
141
        throw damagedPDF("", "expected trailer dictionary");
1004
141
    }
1005
1006
15.9k
    if (!m->trailer.isInitialized()) {
1007
15.2k
        setTrailer(cur_trailer);
1008
1009
15.2k
        if (!m->trailer.hasKey("/Size")) {
1010
38
            QTC::TC("qpdf", "QPDF trailer lacks size");
1011
38
            throw damagedPDF("trailer", "trailer dictionary lacks /Size key");
1012
38
        }
1013
15.2k
        if (!m->trailer.getKey("/Size").isInteger()) {
1014
28
            QTC::TC("qpdf", "QPDF trailer size not integer");
1015
28
            throw damagedPDF("trailer", "/Size key in trailer dictionary is not an integer");
1016
28
        }
1017
15.2k
    }
1018
1019
15.8k
    if (cur_trailer.hasKey("/XRefStm")) {
1020
245
        if (m->ignore_xref_streams) {
1021
0
            QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
1022
245
        } else {
1023
245
            if (cur_trailer.getKey("/XRefStm").isInteger()) {
1024
                // Read the xref stream but disregard any return value -- we'll use our trailer's
1025
                // /Prev key instead of the xref stream's.
1026
241
                (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue());
1027
241
            } else {
1028
4
                throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm");
1029
4
            }
1030
245
        }
1031
245
    }
1032
1033
    // Handle any deleted items now that we've read the /XRefStm.
1034
24.3k
    for (auto const& og: deleted_items) {
1035
24.3k
        insertFreeXrefEntry(og);
1036
24.3k
    }
1037
1038
15.8k
    if (cur_trailer.hasKey("/Prev")) {
1039
1.48k
        if (!cur_trailer.getKey("/Prev").isInteger()) {
1040
21
            QTC::TC("qpdf", "QPDF trailer prev not integer");
1041
21
            throw damagedPDF("trailer", "/Prev key in trailer dictionary is not an integer");
1042
21
        }
1043
1.46k
        QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
1044
1.46k
        xref_offset = cur_trailer.getKey("/Prev").getIntValue();
1045
14.3k
    } else {
1046
14.3k
        xref_offset = 0;
1047
14.3k
    }
1048
1049
15.8k
    return xref_offset;
1050
15.8k
}
1051
1052
// Read a single cross-reference stream.
1053
qpdf_offset_t
1054
QPDF::read_xrefStream(qpdf_offset_t xref_offset)
1055
50.7k
{
1056
50.7k
    if (!m->ignore_xref_streams) {
1057
50.7k
        QPDFObjGen x_og;
1058
50.7k
        QPDFObjectHandle xref_obj;
1059
50.7k
        try {
1060
50.7k
            xref_obj =
1061
50.7k
                readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
1062
50.7k
        } catch (QPDFExc&) {
1063
            // ignore -- report error below
1064
28.6k
        }
1065
50.7k
        if (xref_obj.isStreamOfType("/XRef")) {
1066
20.3k
            QTC::TC("qpdf", "QPDF found xref stream");
1067
20.3k
            return processXRefStream(xref_offset, xref_obj);
1068
20.3k
        }
1069
50.5k
    }
1070
1071
30.2k
    QTC::TC("qpdf", "QPDF can't find xref");
1072
30.2k
    throw damagedPDF("", xref_offset, "xref not found");
1073
0
    return 0; // unreachable
1074
50.7k
}
1075
1076
// Return the entry size of the xref stream and the processed W array.
1077
std::pair<int, std::array<int, 3>>
1078
QPDF::processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged)
1079
20.3k
{
1080
20.3k
    auto W_obj = dict.getKey("/W");
1081
20.3k
    if (!(W_obj.isArray() && (W_obj.getArrayNItems() >= 3) && W_obj.getArrayItem(0).isInteger() &&
1082
20.3k
          W_obj.getArrayItem(1).isInteger() && W_obj.getArrayItem(2).isInteger())) {
1083
489
        throw damaged("Cross-reference stream does not have a proper /W key");
1084
489
    }
1085
1086
19.8k
    std::array<int, 3> W;
1087
19.8k
    int entry_size = 0;
1088
19.8k
    auto w_vector = W_obj.getArrayAsVector();
1089
19.8k
    int max_bytes = sizeof(qpdf_offset_t);
1090
78.9k
    for (size_t i = 0; i < 3; ++i) {
1091
59.3k
        W[i] = w_vector[i].getIntValueAsInt();
1092
59.3k
        if (W[i] > max_bytes) {
1093
172
            throw damaged("Cross-reference stream's /W contains impossibly large values");
1094
172
        }
1095
59.1k
        if (W[i] < 0) {
1096
101
            throw damaged("Cross-reference stream's /W contains negative values");
1097
101
        }
1098
59.0k
        entry_size += W[i];
1099
59.0k
    }
1100
19.5k
    if (entry_size == 0) {
1101
43
        throw damaged("Cross-reference stream's /W indicates entry size of 0");
1102
43
    }
1103
19.5k
    return {entry_size, W};
1104
19.5k
}
1105
1106
// Validate Size key and return the maximum number of entries that the xref stream can contain.
1107
int
1108
QPDF::processXRefSize(
1109
    QPDFObjectHandle& dict, int entry_size, std::function<QPDFExc(std::string_view)> damaged)
1110
19.5k
{
1111
    // Number of entries is limited by the highest possible object id and stream size.
1112
19.5k
    auto max_num_entries = std::numeric_limits<int>::max();
1113
19.5k
    if (max_num_entries > (std::numeric_limits<qpdf_offset_t>::max() / entry_size)) {
1114
0
        max_num_entries = toI(std::numeric_limits<qpdf_offset_t>::max() / entry_size);
1115
0
    }
1116
1117
19.5k
    auto Size_obj = dict.getKey("/Size");
1118
19.5k
    long long size;
1119
19.5k
    if (!dict.getKey("/Size").getValueAsInt(size)) {
1120
313
        throw damaged("Cross-reference stream does not have a proper /Size key");
1121
19.2k
    } else if (size < 0) {
1122
49
        throw damaged("Cross-reference stream has a negative /Size key");
1123
19.1k
    } else if (size >= max_num_entries) {
1124
50
        throw damaged("Cross-reference stream has an impossibly large /Size key");
1125
50
    }
1126
    // We are not validating that Size <= (Size key of parent xref / trailer).
1127
19.1k
    return max_num_entries;
1128
19.5k
}
1129
1130
// Return the number of entries of the xref stream and the processed Index array.
1131
std::pair<int, std::vector<std::pair<int, int>>>
1132
QPDF::processXRefIndex(
1133
    QPDFObjectHandle& dict, int max_num_entries, std::function<QPDFExc(std::string_view)> damaged)
1134
19.1k
{
1135
19.1k
    auto size = dict.getKey("/Size").getIntValueAsInt();
1136
19.1k
    auto Index_obj = dict.getKey("/Index");
1137
1138
19.1k
    if (Index_obj.isArray()) {
1139
6.74k
        std::vector<std::pair<int, int>> indx;
1140
6.74k
        int num_entries = 0;
1141
6.74k
        auto index_vec = Index_obj.getArrayAsVector();
1142
6.74k
        if ((index_vec.size() % 2) || index_vec.size() < 2) {
1143
140
            throw damaged("Cross-reference stream's /Index has an invalid number of values");
1144
140
        }
1145
1146
6.60k
        int i = 0;
1147
6.60k
        long long first = 0;
1148
23.5k
        for (auto& val: index_vec) {
1149
23.5k
            if (val.isInteger()) {
1150
23.3k
                if (i % 2) {
1151
11.5k
                    auto count = val.getIntValue();
1152
11.5k
                    if (count <= 0) {
1153
79
                        throw damaged(
1154
79
                            "Cross-reference stream section claims to contain " +
1155
79
                            std::to_string(count) + " entries");
1156
79
                    }
1157
                    // We are guarding against the possibility of num_entries * entry_size
1158
                    // overflowing. We are not checking that entries are in ascending order as
1159
                    // required by the spec, which probably should generate a warning. We are also
1160
                    // not checking that for each subsection first object number + number of entries
1161
                    // <= /Size. The spec requires us to ignore object number > /Size.
1162
11.4k
                    if (first > (max_num_entries - count) ||
1163
11.4k
                        count > (max_num_entries - num_entries)) {
1164
54
                        throw damaged(
1165
54
                            "Cross-reference stream claims to contain too many entries: " +
1166
54
                            std::to_string(first) + " " + std::to_string(max_num_entries) + " " +
1167
54
                            std::to_string(num_entries));
1168
54
                    }
1169
11.4k
                    indx.emplace_back(static_cast<int>(first), static_cast<int>(count));
1170
11.4k
                    num_entries += static_cast<int>(count);
1171
11.8k
                } else {
1172
11.8k
                    first = val.getIntValue();
1173
11.8k
                    if (first < 0) {
1174
57
                        throw damaged(
1175
57
                            "Cross-reference stream's /Index contains a negative object id");
1176
11.7k
                    } else if (first > max_num_entries) {
1177
81
                        throw damaged("Cross-reference stream's /Index contains an impossibly "
1178
81
                                      "large object id");
1179
81
                    }
1180
11.8k
                }
1181
23.3k
            } else {
1182
177
                throw damaged(
1183
177
                    "Cross-reference stream's /Index's item " + std::to_string(i) +
1184
177
                    " is not an integer");
1185
177
            }
1186
23.1k
            i++;
1187
23.1k
        }
1188
6.15k
        QTC::TC("qpdf", "QPDF xref /Index is array", index_vec.size() == 2 ? 0 : 1);
1189
6.15k
        return {num_entries, indx};
1190
12.3k
    } else if (Index_obj.isNull()) {
1191
12.3k
        QTC::TC("qpdf", "QPDF xref /Index is null");
1192
12.3k
        return {size, {{0, size}}};
1193
12.3k
    } else {
1194
44
        throw damaged("Cross-reference stream does not have a proper /Index key");
1195
44
    }
1196
19.1k
}
1197
1198
// Process the cross-reference stream xref_obj that was read from xref_offset.
//
// Validates /W, /Size and /Index, decodes every entry of the stream data and records it through
// insertXrefEntry / insertFreeXrefEntry, and adopts the stream dictionary as the trailer if no
// trailer has been set yet. Returns the integer value of /Prev, or 0 if the dictionary has no /Prev
// key. Throws a damaged-PDF exception if validation fails, if the stream data is shorter than the
// dictionary implies, or if /Prev is not an integer. Stream data that is longer than expected only
// produces a warning.
qpdf_offset_t
QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
{
    // Build the message from an owning std::string. A string_view is not guaranteed to be
    // null-terminated, so its data() pointer must not be used as a C string.
    auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
        return damagedPDF("xref stream", xref_offset, std::string(msg));
    };

    auto dict = xref_obj.getDict();

    auto [entry_size, W] = processXRefW(dict, damaged);
    int max_num_entries = processXRefSize(dict, entry_size, damaged);
    auto [num_entries, indx] = processXRefIndex(dict, max_num_entries, damaged);

    std::shared_ptr<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
    size_t actual_size = bp->getSize();
    auto expected_size = toS(entry_size) * toS(num_entries);

    if (expected_size != actual_size) {
        QPDFExc x = damaged(
            "Cross-reference stream data has the wrong size; expected = " +
            std::to_string(expected_size) + "; actual = " + std::to_string(actual_size));
        if (expected_size > actual_size) {
            // Too little data: decoding would read past the end of the buffer.
            throw x;
        } else {
            // Surplus data is tolerated; only the expected number of bytes is decoded.
            warn(x);
        }
    }

    bool saw_first_compressed_object = false;

    // Actual size vs. expected size check above ensures that we will not overflow any buffers here.
    // We know that entry_size * num_entries is less or equal to the size of the buffer.
    auto p = bp->getBuffer();
    // obj is taken by value on purpose: it is advanced once per decoded entry.
    for (auto [obj, sec_entries]: indx) {
        // Process a subsection.
        for (int i = 0; i < sec_entries; ++i) {
            // Read this entry
            std::array<qpdf_offset_t, 3> fields{};
            if (W[0] == 0) {
                // A zero-width type field means the type defaults to 1.
                QTC::TC("qpdf", "QPDF default for xref stream field 0");
                fields[0] = 1;
            }
            // Each field is a big-endian integer of W[j] bytes.
            for (size_t j = 0; j < 3; ++j) {
                for (int k = 0; k < W[j]; ++k) {
                    fields[j] <<= 8;
                    fields[j] |= *p++;
                }
            }

            // Record whether any entry of a type other than 2 follows the first type 2
            // (compressed object) entry.
            if (saw_first_compressed_object) {
                if (fields[0] != 2) {
                    m->uncompressed_after_compressed = true;
                }
            } else if (fields[0] == 2) {
                saw_first_compressed_object = true;
            }
            if (obj == 0) {
                // This is needed by checkLinearization()
                m->first_xref_item_offset = xref_offset;
            } else if (fields[0] == 0) {
                // Ignore fields[2], which we don't care about in this case. This works around the
                // issue of some PDF files that put invalid values, like -1, here for deleted
                // objects.
                insertFreeXrefEntry(QPDFObjGen(obj, 0));
            } else {
                insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
            }
            ++obj;
        }
    }

    if (!m->trailer.isInitialized()) {
        setTrailer(dict);
    }

    if (dict.hasKey("/Prev")) {
        if (!dict.getKey("/Prev").isInteger()) {
            throw damagedPDF(
                "xref stream", "/Prev key in xref stream dictionary is not an integer");
        }
        QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
        return dict.getKey("/Prev").getIntValue();
    } else {
        return 0;
    }
}
1286
1287
void
1288
QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2)
1289
1.14M
{
1290
    // Populate the xref table in such a way that the first reference to an object that we see,
1291
    // which is the one in the latest xref table in which it appears, is the one that gets stored.
1292
    // This works because we are reading more recent appends before older ones.
1293
1294
    // If there is already an entry for this object and generation in the table, it means that a
1295
    // later xref table has registered this object.  Disregard this one.
1296
1297
1.14M
    if (obj > m->xref_table_max_id) {
1298
        // ignore impossibly large object ids or object ids > Size.
1299
4.61k
        return;
1300
4.61k
    }
1301
1302
1.14M
    if (m->deleted_objects.count(obj)) {
1303
359
        QTC::TC("qpdf", "QPDF xref deleted object");
1304
359
        return;
1305
359
    }
1306
1307
1.14M
    if (f0 == 2 && static_cast<int>(f1) == obj) {
1308
1.21k
        warn(damagedPDF("xref stream", "self-referential object stream " + std::to_string(obj)));
1309
1.21k
        return;
1310
1.21k
    }
1311
1312
1.13M
    auto [iter, created] = m->xref_table.try_emplace(QPDFObjGen(obj, (f0 == 2 ? 0 : f2)));
1313
1.13M
    if (!created) {
1314
137k
        QTC::TC("qpdf", "QPDF xref reused object");
1315
137k
        return;
1316
137k
    }
1317
1318
1.00M
    switch (f0) {
1319
285k
    case 1:
1320
        // f2 is generation
1321
285k
        QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1322
285k
        iter->second = QPDFXRefEntry(f1);
1323
285k
        break;
1324
1325
714k
    case 2:
1326
714k
        iter->second = QPDFXRefEntry(toI(f1), f2);
1327
714k
        break;
1328
1329
2.37k
    default:
1330
2.37k
        throw damagedPDF("xref stream", "unknown xref stream entry type " + std::to_string(f0));
1331
0
        break;
1332
1.00M
    }
1333
1.00M
}
1334
1335
void
1336
QPDF::insertFreeXrefEntry(QPDFObjGen og)
1337
145k
{
1338
145k
    if (!m->xref_table.count(og)) {
1339
132k
        m->deleted_objects.insert(og.getObj());
1340
132k
    }
1341
145k
}
1342
1343
// Replace uncompressed object. This is used in xref recovery mode, which reads the file from
1344
// beginning to end.
1345
void
1346
QPDF::insertReconstructedXrefEntry(int obj, qpdf_offset_t f1, int f2)
1347
3.24M
{
1348
3.24M
    if (!(obj > 0 && obj <= m->xref_table_max_id && 0 <= f2 && f2 < 65535)) {
1349
10.3k
        QTC::TC("qpdf", "QPDF xref overwrite invalid objgen");
1350
10.3k
        return;
1351
10.3k
    }
1352
1353
3.23M
    QPDFObjGen og(obj, f2);
1354
3.23M
    if (!m->deleted_objects.count(obj)) {
1355
        // deleted_objects stores the uncompressed objects removed from the xref table at the start
1356
        // of recovery.
1357
3.23M
        QTC::TC("qpdf", "QPDF xref overwrite object");
1358
3.23M
        m->xref_table[QPDFObjGen(obj, f2)] = QPDFXRefEntry(f1);
1359
3.23M
    }
1360
3.23M
}
1361
1362
void
1363
QPDF::showXRefTable()
1364
0
{
1365
0
    auto& cout = *m->log->getInfo();
1366
0
    for (auto const& iter: m->xref_table) {
1367
0
        QPDFObjGen const& og = iter.first;
1368
0
        QPDFXRefEntry const& entry = iter.second;
1369
0
        cout << og.unparse('/') << ": ";
1370
0
        switch (entry.getType()) {
1371
0
        case 1:
1372
0
            cout << "uncompressed; offset = " << entry.getOffset();
1373
0
            break;
1374
1375
0
        case 2:
1376
0
            *m->log->getInfo() << "compressed; stream = " << entry.getObjStreamNumber()
1377
0
                               << ", index = " << entry.getObjStreamIndex();
1378
0
            break;
1379
1380
0
        default:
1381
0
            throw std::logic_error("unknown cross-reference table type while"
1382
0
                                   " showing xref_table");
1383
0
            break;
1384
0
        }
1385
0
        m->log->info("\n");
1386
0
    }
1387
0
}
1388
1389
// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and
1390
// return false. Otherwise return true.
1391
bool
1392
QPDF::resolveXRefTable()
1393
39.3k
{
1394
39.3k
    bool may_change = !m->reconstructed_xref;
1395
2.06M
    for (auto& iter: m->xref_table) {
1396
2.06M
        if (isUnresolved(iter.first)) {
1397
1.09M
            resolve(iter.first);
1398
1.09M
            if (may_change && m->reconstructed_xref) {
1399
339
                return false;
1400
339
            }
1401
1.09M
        }
1402
2.06M
    }
1403
38.9k
    return true;
1404
39.3k
}
1405
1406
// Ensure all objects in the pdf file, including those in indirect references, appear in the object
1407
// cache.
1408
void
1409
QPDF::fixDanglingReferences(bool force)
1410
132k
{
1411
132k
    if (m->fixed_dangling_refs) {
1412
93.8k
        return;
1413
93.8k
    }
1414
38.9k
    if (!resolveXRefTable()) {
1415
339
        QTC::TC("qpdf", "QPDF fix dangling triggered xref reconstruction");
1416
339
        resolveXRefTable();
1417
339
    }
1418
38.9k
    m->fixed_dangling_refs = true;
1419
38.9k
}
1420
1421
size_t
1422
QPDF::getObjectCount()
1423
100k
{
1424
    // This method returns the next available indirect object number. makeIndirectObject uses it for
1425
    // this purpose. After fixDanglingReferences is called, all objects in the xref table will also
1426
    // be in obj_cache.
1427
100k
    fixDanglingReferences();
1428
100k
    QPDFObjGen og;
1429
100k
    if (!m->obj_cache.empty()) {
1430
99.4k
        og = (*(m->obj_cache.rbegin())).first;
1431
99.4k
    }
1432
100k
    return toS(og.getObj());
1433
100k
}
1434
1435
std::vector<QPDFObjectHandle>
1436
QPDF::getAllObjects()
1437
0
{
1438
    // After fixDanglingReferences is called, all objects are in the object cache.
1439
0
    fixDanglingReferences();
1440
0
    std::vector<QPDFObjectHandle> result;
1441
0
    for (auto const& iter: m->obj_cache) {
1442
0
        result.push_back(newIndirect(iter.first, iter.second.object));
1443
0
    }
1444
0
    return result;
1445
0
}
1446
1447
void
1448
QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen const& og)
1449
4.00M
{
1450
4.00M
    m->last_object_description.clear();
1451
4.00M
    if (!description.empty()) {
1452
80.5k
        m->last_object_description += description;
1453
80.5k
        if (og.isIndirect()) {
1454
29.1k
            m->last_object_description += ": ";
1455
29.1k
        }
1456
80.5k
    }
1457
4.00M
    if (og.isIndirect()) {
1458
3.95M
        m->last_object_description += "object " + og.unparse(' ');
1459
3.95M
    }
1460
4.00M
}
1461
1462
// Parse the object at the current file position as a trailer and return it. Warns if the object is
// empty, or if it is a dictionary followed by the "stream" keyword. On return the file's last
// offset is set to the position where parsing started.
QPDFObjectHandle
QPDF::readTrailer()
{
    qpdf_offset_t const start = m->file->tell();
    bool empty = false;
    // The parser is used as a temporary within a single expression, as in the rest of this file.
    auto object =
        QPDFParser(m->file, "trailer", m->tokenizer, nullptr, this, true).parse(empty, false);
    if (empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        warn(damagedPDF("trailer", "empty object treated as null"));
    } else if (object.isDictionary()) {
        // A token is consumed only when the parsed object is a dictionary.
        if (readToken(m->file).isWord("stream")) {
            warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
        }
    }
    // Override last_offset so that it points to the beginning of the object we just read
    m->file->setLastOffset(start);
    return object;
}
1480
1481
// Parse the object at the current file position and return it. description and og are used to
// build the object description handed to the parser. If the parsed object is a dictionary followed
// by the "stream" keyword, the rest of the stream is read via readStream. A missing "endobj" only
// produces a warning, as does an empty object, which is returned as parsed.
QPDFObjectHandle
QPDF::readObject(std::string const& description, QPDFObjGen og)
{
    setLastObjectDescription(description, og);
    qpdf_offset_t const start = m->file->tell();
    bool empty = false;

    // A string decrypter is handed to the parser only when the file is encrypted.
    StringDecrypter decrypter{this, og};
    StringDecrypter* active_decrypter = nullptr;
    if (m->encp->encrypted) {
        active_decrypter = &decrypter;
    }
    auto object =
        QPDFParser(m->file, m->last_object_description, m->tokenizer, active_decrypter, this, true)
            .parse(empty, false);
    if (empty) {
        // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
        // actual PDF files and Adobe Reader appears to ignore them.
        warn(damagedPDF(m->file, m->file->getLastOffset(), "empty object treated as null"));
        return object;
    }

    auto next = readToken(m->file);
    if (object.isDictionary() && next.isWord("stream")) {
        readStream(object, og, start);
        next = readToken(m->file);
    }
    if (!next.isWord("endobj")) {
        QTC::TC("qpdf", "QPDF err expected endobj");
        warn(damagedPDF("expected endobj"));
    }
    return object;
}
1510
1511
// After reading stream dictionary and stream keyword, read rest of stream.
1512
void
1513
QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1514
619k
{
1515
619k
    validateStreamLineEnd(object, og, offset);
1516
1517
    // Must get offset before accessing any additional objects since resolving a previously
1518
    // unresolved indirect object will change file position.
1519
619k
    qpdf_offset_t stream_offset = m->file->tell();
1520
619k
    size_t length = 0;
1521
1522
619k
    try {
1523
619k
        auto length_obj = object.getKey("/Length");
1524
1525
619k
        if (!length_obj.isInteger()) {
1526
52.8k
            if (length_obj.isNull()) {
1527
45.1k
                QTC::TC("qpdf", "QPDF stream without length");
1528
45.1k
                throw damagedPDF(offset, "stream dictionary lacks /Length key");
1529
45.1k
            }
1530
7.75k
            QTC::TC("qpdf", "QPDF stream length not integer");
1531
7.75k
            throw damagedPDF(offset, "/Length key in stream dictionary is not an integer");
1532
52.8k
        }
1533
1534
566k
        length = toS(length_obj.getUIntValue());
1535
        // Seek in two steps to avoid potential integer overflow
1536
566k
        m->file->seek(stream_offset, SEEK_SET);
1537
566k
        m->file->seek(toO(length), SEEK_CUR);
1538
566k
        if (!readToken(m->file).isWord("endstream")) {
1539
85.3k
            QTC::TC("qpdf", "QPDF missing endstream");
1540
85.3k
            throw damagedPDF("expected endstream");
1541
85.3k
        }
1542
566k
    } catch (QPDFExc& e) {
1543
140k
        if (m->attempt_recovery) {
1544
140k
            warn(e);
1545
140k
            length = recoverStreamLength(m->file, og, stream_offset);
1546
140k
        } else {
1547
0
            throw;
1548
0
        }
1549
140k
    }
1550
614k
    object = newIndirect(og, QPDF_Stream::create(this, og, object, stream_offset, length));
1551
614k
}
1552
1553
void
1554
QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset)
1555
619k
{
1556
    // The PDF specification states that the word "stream" should be followed by either a carriage
1557
    // return and a newline or by a newline alone.  It specifically disallowed following it by a
1558
    // carriage return alone since, in that case, there would be no way to tell whether the NL in a
1559
    // CR NL sequence was part of the stream data.  However, some readers, including Adobe reader,
1560
    // accept a carriage return by itself when followed by a non-newline character, so that's what
1561
    // we do here. We have also seen files that have extraneous whitespace between the stream
1562
    // keyword and the newline.
1563
640k
    while (true) {
1564
640k
        char ch;
1565
640k
        if (m->file->read(&ch, 1) == 0) {
1566
            // A premature EOF here will result in some other problem that will get reported at
1567
            // another time.
1568
95
            return;
1569
95
        }
1570
640k
        if (ch == '\n') {
1571
            // ready to read stream data
1572
241k
            QTC::TC("qpdf", "QPDF stream with NL only");
1573
241k
            return;
1574
241k
        }
1575
399k
        if (ch == '\r') {
1576
            // Read another character
1577
371k
            if (m->file->read(&ch, 1) != 0) {
1578
371k
                if (ch == '\n') {
1579
                    // Ready to read stream data
1580
368k
                    QTC::TC("qpdf", "QPDF stream with CRNL");
1581
368k
                } else {
1582
                    // Treat the \r by itself as the whitespace after endstream and start reading
1583
                    // stream data in spite of not having seen a newline.
1584
2.13k
                    QTC::TC("qpdf", "QPDF stream with CR only");
1585
2.13k
                    m->file->unreadCh(ch);
1586
2.13k
                    warn(damagedPDF(
1587
2.13k
                        m->file->tell(), "stream keyword followed by carriage return only"));
1588
2.13k
                }
1589
371k
            }
1590
371k
            return;
1591
371k
        }
1592
28.2k
        if (!QUtil::is_space(ch)) {
1593
6.94k
            QTC::TC("qpdf", "QPDF stream without newline");
1594
6.94k
            m->file->unreadCh(ch);
1595
6.94k
            warn(damagedPDF(
1596
6.94k
                m->file->tell(), "stream keyword not followed by proper line terminator"));
1597
6.94k
            return;
1598
6.94k
        }
1599
21.3k
        warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace"));
1600
21.3k
    }
1601
619k
}
1602
1603
// Parse one object from an object stream's input source at its current position. No string
// decrypter is passed to the parser. An empty object produces a warning and the parser's result
// is still returned.
QPDFObjectHandle
QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
{
    // The caller leaves last_object_description starting with "object " (7 characters); keep that
    // prefix and append "<obj> 0".
    auto& desc = m->last_object_description;
    desc.erase(7);
    desc += std::to_string(obj);
    desc += " 0";

    bool is_empty = false;
    auto parsed = QPDFParser(input, desc, m->tokenizer, nullptr, this, true).parse(is_empty, false);
    if (is_empty) {
        // The PDF spec does not appear to permit empty objects, but real files contain them and
        // Adobe Reader appears to ignore them.
        warn(damagedPDF(input, input->getLastOffset(), "empty object treated as null"));
    }
    return parsed;
}
1620
1621
bool
1622
QPDF::findEndstream()
1623
181k
{
1624
    // Find endstream or endobj. Position the input at that token.
1625
181k
    auto t = readToken(m->file, 20);
1626
181k
    if (t.isWord("endobj") || t.isWord("endstream")) {
1627
128k
        m->file->seek(m->file->getLastOffset(), SEEK_SET);
1628
128k
        return true;
1629
128k
    }
1630
52.7k
    return false;
1631
181k
}
1632
1633
size_t
1634
QPDF::recoverStreamLength(
1635
    std::shared_ptr<InputSource> input, QPDFObjGen const& og, qpdf_offset_t stream_offset)
1636
135k
{
1637
    // Try to reconstruct stream length by looking for endstream or endobj
1638
135k
    warn(damagedPDF(input, stream_offset, "attempting to recover stream length"));
1639
1640
135k
    PatternFinder ef(*this, &QPDF::findEndstream);
1641
135k
    size_t length = 0;
1642
135k
    if (m->file->findFirst("end", stream_offset, 0, ef)) {
1643
128k
        length = toS(m->file->tell() - stream_offset);
1644
        // Reread endstream but, if it was endobj, don't skip that.
1645
128k
        QPDFTokenizer::Token t = readToken(m->file);
1646
128k
        if (t.getValue() == "endobj") {
1647
42.4k
            m->file->seek(m->file->getLastOffset(), SEEK_SET);
1648
42.4k
        }
1649
128k
    }
1650
1651
135k
    if (length) {
1652
127k
        qpdf_offset_t this_obj_offset = 0;
1653
127k
        QPDFObjGen this_og;
1654
1655
        // Make sure this is inside this object
1656
9.34M
        for (auto const& iter: m->xref_table) {
1657
9.34M
            QPDFXRefEntry const& entry = iter.second;
1658
9.34M
            if (entry.getType() == 1) {
1659
8.90M
                qpdf_offset_t obj_offset = entry.getOffset();
1660
8.90M
                if ((obj_offset > stream_offset) &&
1661
8.90M
                    ((this_obj_offset == 0) || (this_obj_offset > obj_offset))) {
1662
229k
                    this_obj_offset = obj_offset;
1663
229k
                    this_og = iter.first;
1664
229k
                }
1665
8.90M
            }
1666
9.34M
        }
1667
127k
        if (this_obj_offset && (this_og == og)) {
1668
            // Well, we found endstream\nendobj within the space allowed for this object, so we're
1669
            // probably in good shape.
1670
127k
        } else {
1671
127k
            QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1672
127k
        }
1673
127k
    }
1674
1675
135k
    if (length == 0) {
1676
7.15k
        warn(damagedPDF(
1677
7.15k
            input, stream_offset, "unable to recover stream data; treating stream as empty"));
1678
128k
    } else {
1679
128k
        warn(
1680
128k
            damagedPDF(input, stream_offset, "recovered stream length: " + std::to_string(length)));
1681
128k
    }
1682
1683
135k
    QTC::TC("qpdf", "QPDF recovered stream length");
1684
135k
    return length;
1685
135k
}
1686
1687
// Read the next token from input using the shared tokenizer, with the current object description
// as context and max_len passed through to the tokenizer.
QPDFTokenizer::Token
QPDF::readToken(std::shared_ptr<InputSource> input, size_t max_len)
{
    auto& tokenizer = m->tokenizer;
    return tokenizer.readToken(input, m->last_object_description, true, max_len);
}
1692
1693
// Read the indirect object whose "n n obj" header starts at offset in the main input.
//
// try_recovery:          on a damaged header, rebuild the xref table and retry once
// offset:                file offset of the object header; 0 yields a null object with a warning
// description:           context used for warning/error messages
// exp_og:                expected object ID/generation; an ID of 0 disables the check
// og:                    receives the ID/generation actually read from the file
// skip_cache_if_in_xref: do not cache the object if og already appears in the xref table
QPDFObjectHandle
QPDF::readObjectAtOffset(
    bool try_recovery,
    qpdf_offset_t offset,
    std::string const& description,
    QPDFObjGen exp_og,
    QPDFObjGen& og,
    bool skip_cache_if_in_xref)
{
    // An expected object ID of 0 means the caller doesn't know or doesn't care what the actual
    // object ID is at this offset. This is true when reading the xref stream and linearization
    // hint streams. In that case the ID/generation read from the file is not verified, and there
    // is no reason to attempt xref recovery on failure since the read was not triggered by an xref
    // lookup.
    bool const check_og = (exp_og.getObj() != 0);
    setLastObjectDescription(description, exp_og);
    if (!check_og || !m->attempt_recovery) {
        try_recovery = false;
    }

    // Special case: an offset of 0 just yields null. Some PDF writers, in particular
    // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as
    // "0000000000 00000 n", which is not correct, but ignoring these is harmless.
    if (offset == 0) {
        QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
        warn(damagedPDF(0, "object has offset 0"));
        return QPDFObjectHandle::newNull();
    }

    m->file->seek(offset, SEEK_SET);
    try {
        QPDFTokenizer::Token id_token = readToken(m->file);
        bool const id_ok = id_token.isInteger();
        QTC::TC("qpdf", "QPDF check objid", id_ok ? 1 : 0);
        if (!id_ok) {
            QTC::TC("qpdf", "QPDF expected n n obj");
            throw damagedPDF(offset, "expected n n obj");
        }

        QPDFTokenizer::Token gen_token = readToken(m->file);
        bool const gen_ok = gen_token.isInteger();
        QTC::TC("qpdf", "QPDF check generation", gen_ok ? 1 : 0);
        if (!gen_ok) {
            throw damagedPDF(offset, "expected n n obj");
        }

        QPDFTokenizer::Token obj_token = readToken(m->file);
        bool const keyword_ok = obj_token.isWord("obj");
        QTC::TC("qpdf", "QPDF check obj", keyword_ok ? 1 : 0);
        if (!keyword_ok) {
            throw damagedPDF(offset, "expected n n obj");
        }

        int const objid = QUtil::string_to_int(id_token.getValue().c_str());
        int const generation = QUtil::string_to_int(gen_token.getValue().c_str());
        og = QPDFObjGen(objid, generation);
        if (objid == 0) {
            QTC::TC("qpdf", "QPDF object id 0");
            throw damagedPDF(offset, "object with ID 0");
        }
        if (check_og && (exp_og != og)) {
            QTC::TC("qpdf", "QPDF err wrong objid/generation");
            QPDFExc mismatch = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
            if (try_recovery) {
                // Handled by the recovery path in the catch block below.
                throw mismatch;
            }
            // Without recovery, go ahead and read the object even though the ID doesn't match.
            warn(mismatch);
        }
    } catch (QPDFExc& e) {
        if (!try_recovery) {
            throw;
        }
        // Rebuild the xref table and retry once, with recovery disabled for the retry.
        reconstruct_xref(e);
        auto rebuilt = m->xref_table.find(exp_og);
        if ((rebuilt == m->xref_table.end()) || (rebuilt->second.getType() != 1)) {
            QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
            warn(damagedPDF(
                "",
                0,
                ("object " + exp_og.unparse(' ') +
                 " not found in file after regenerating cross reference "
                 "table")));
            return QPDFObjectHandle::newNull();
        }
        qpdf_offset_t const new_offset = rebuilt->second.getOffset();
        QPDFObjectHandle recovered =
            readObjectAtOffset(false, new_offset, description, exp_og, og, false);
        QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
        return recovered;
    }

    QPDFObjectHandle oh = readObject(description, og);
    if (!isUnresolved(og)) {
        return oh;
    }

    // Store the object in the cache here so it gets cached whether the offset was known first or
    // the object ID and generation were known first (in which case we got here through resolve).

    // Determine the end offset of this object before and after white space. These numbers are
    // used to validate linearization hint tables: offsets and lengths of objects may imply the
    // end of an object to be anywhere between these values.
    qpdf_offset_t const end_before_space = m->file->tell();

    // Skip over trailing white space, stopping on the first non-space character.
    for (;;) {
        char ch;
        if (m->file->read(&ch, 1) == 0) {
            throw damagedPDF(m->file->tell(), "EOF after endobj");
        }
        if (!isspace(static_cast<unsigned char>(ch))) {
            m->file->seek(-1, SEEK_CUR);
            break;
        }
    }
    qpdf_offset_t const end_after_space = m->file->tell();

    if (skip_cache_if_in_xref && m->xref_table.count(og)) {
        // Ordinarily, an object gets read here when resolved through the xref table or stream.
        // For xref streams and linearization hint tables, the offset comes from another source.
        // The xref stream in particular is read and loaded into the object cache very early in
        // parsing. Normally, when a file is updated by appending, items inserted into the xref
        // table in later updates take precedence over earlier ones. When the object number
        // previously used for the xref stream is reused, events occur in this order:
        //
        // * reused object gets loaded into the xref table
        // * old object is read here while reading xref streams
        // * original xref entry is ignored (since already in xref table)
        //
        // The second step is the problem: although the xref table is correct, the old object
        // would already be in the cache and so would effectively prevail over the reused object.
        // As a workaround, skip_cache_if_in_xref tells us not to cache what was read here when
        // the object is already in the xref table.
        //
        // The same bug likely exists for linearization hint tables, but existing code uses
        // end_before_space and end_after_space from the cache, so fixing it would need more
        // significant rework. A linearization hint stream being reused seems less likely, since
        // the xref stream is probably the highest object in the file while the hint stream sits
        // somewhere in the middle, so that bug is left unfixed for now. If it were fixed,
        // !check_og could be used in place of skip_cache_if_in_xref.
        QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
    } else {
        updateCache(og, oh.getObj(), end_before_space, end_after_space);
    }

    return oh;
}
1854
1855
// Resolve og to a cached object and return a raw pointer to it. Errors raised while reading are
// downgraded to warnings; if the object is still unresolved afterwards it is cached as null.
QPDFObject*
QPDF::resolve(QPDFObjGen og)
{
    if (!isUnresolved(og)) {
        return m->obj_cache[og].object.get();
    }

    if (m->resolving.count(og)) {
        // This can happen if an object references itself directly or indirectly in some key that
        // has to be resolved during object parsing, such as stream length.
        QTC::TC("qpdf", "QPDF recursion loop in resolve");
        warn(damagedPDF("", "loop detected resolving object " + og.unparse(' ')));
        updateCache(og, QPDF_Null::create(), -1, -1);
        return m->obj_cache[og].object.get();
    }
    ResolveRecorder rr(this, og);

    auto xref = m->xref_table.find(og);
    if (xref != m->xref_table.end()) {
        QPDFXRefEntry const& entry = xref->second;
        try {
            switch (entry.getType()) {
            case 1:
                {
                    // readObjectAtOffset stores the object in the cache; its return value and
                    // the object ID it reports are not needed here.
                    qpdf_offset_t const object_offset = entry.getOffset();
                    QPDFObjGen actual_og;
                    readObjectAtOffset(true, object_offset, "", og, actual_og, false);
                }
                break;

            case 2:
                resolveObjectsInStream(entry.getObjStreamNumber());
                break;

            default:
                throw damagedPDF(
                    "", 0, ("object " + og.unparse('/') + " has unexpected xref entry type"));
            }
        } catch (QPDFExc& e) {
            warn(e);
        } catch (std::exception& e) {
            warn(damagedPDF(
                "", 0, ("object " + og.unparse('/') + ": error reading object: " + e.what())));
        }
    }

    if (isUnresolved(og)) {
        // PDF spec says unknown objects resolve to the null object.
        QTC::TC("qpdf", "QPDF resolve failure to null");
        updateCache(og, QPDF_Null::create(), -1, -1);
    }

    auto resolved(m->obj_cache[og].object);
    resolved->setDefaultDescription(this, og);
    return resolved.get();
}
1911
1912
void
1913
QPDF::resolveObjectsInStream(int obj_stream_number)
1914
183k
{
1915
183k
    if (m->resolved_object_streams.count(obj_stream_number)) {
1916
163k
        return;
1917
163k
    }
1918
20.1k
    m->resolved_object_streams.insert(obj_stream_number);
1919
    // Force resolution of object stream
1920
20.1k
    QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
1921
20.1k
    if (!obj_stream.isStream()) {
1922
3.99k
        throw damagedPDF(
1923
3.99k
            "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
1924
3.99k
    }
1925
1926
    // For linearization data in the object, use the data from the object stream for the objects in
1927
    // the stream.
1928
16.1k
    QPDFObjGen stream_og(obj_stream_number, 0);
1929
16.1k
    qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space;
1930
16.1k
    qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space;
1931
1932
16.1k
    QPDFObjectHandle dict = obj_stream.getDict();
1933
16.1k
    if (!dict.isDictionaryOfType("/ObjStm")) {
1934
910
        QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
1935
910
        warn(damagedPDF(
1936
910
            "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1937
910
    }
1938
1939
16.1k
    if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
1940
540
        throw damagedPDF(
1941
540
            ("object stream " + std::to_string(obj_stream_number) + " has incorrect keys"));
1942
540
    }
1943
1944
15.6k
    int n = dict.getKey("/N").getIntValueAsInt();
1945
15.6k
    int first = dict.getKey("/First").getIntValueAsInt();
1946
1947
15.6k
    std::map<int, int> offsets;
1948
1949
15.6k
    std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
1950
15.6k
    auto input = std::shared_ptr<InputSource>(
1951
        // line-break
1952
15.6k
        new BufferInputSource(
1953
15.6k
            (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
1954
15.6k
            bp.get()));
1955
1956
476k
    for (int i = 0; i < n; ++i) {
1957
461k
        QPDFTokenizer::Token tnum = readToken(input);
1958
461k
        QPDFTokenizer::Token toffset = readToken(input);
1959
461k
        if (!(tnum.isInteger() && toffset.isInteger())) {
1960
792
            throw damagedPDF(
1961
792
                input,
1962
792
                m->last_object_description,
1963
792
                input->getLastOffset(),
1964
792
                "expected integer in object stream header");
1965
792
        }
1966
1967
461k
        int num = QUtil::string_to_int(tnum.getValue().c_str());
1968
461k
        long long offset = QUtil::string_to_int(toffset.getValue().c_str());
1969
461k
        if (num > m->xref_table_max_id) {
1970
10.2k
            continue;
1971
10.2k
        }
1972
450k
        if (num == obj_stream_number) {
1973
69
            QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1974
69
            warn(damagedPDF(
1975
69
                input,
1976
69
                m->last_object_description,
1977
69
                input->getLastOffset(),
1978
69
                "object stream claims to contain itself"));
1979
69
            continue;
1980
69
        }
1981
450k
        offsets[num] = toI(offset + first);
1982
450k
    }
1983
1984
    // To avoid having to read the object stream multiple times, store all objects that would be
1985
    // found here in the cache.  Remember that some objects stored here might have been overridden
1986
    // by new objects appended to the file, so it is necessary to recheck the xref table and only
1987
    // cache what would actually be resolved here.
1988
14.8k
    m->last_object_description.clear();
1989
14.8k
    m->last_object_description += "object ";
1990
359k
    for (auto const& iter: offsets) {
1991
359k
        QPDFObjGen og(iter.first, 0);
1992
359k
        auto entry = m->xref_table.find(og);
1993
359k
        if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1994
359k
            entry->second.getObjStreamNumber() == obj_stream_number) {
1995
317k
            int offset = iter.second;
1996
317k
            input->seek(offset, SEEK_SET);
1997
317k
            QPDFObjectHandle oh = readObjectInStream(input, iter.first);
1998
317k
            updateCache(og, oh.getObj(), end_before_space, end_after_space);
1999
317k
        } else {
2000
42.3k
            QTC::TC("qpdf", "QPDF not caching overridden objstm object");
2001
42.3k
        }
2002
359k
    }
2003
14.8k
}
2004
2005
// Give obj a default description derived from this QPDF and og, then wrap it in a handle.
QPDFObjectHandle
QPDF::newIndirect(QPDFObjGen const& og, std::shared_ptr<QPDFObject> const& obj)
{
    obj->setDefaultDescription(this, og);
    return QPDFObjectHandle(obj);
}
2011
2012
void
2013
QPDF::updateCache(
2014
    QPDFObjGen const& og,
2015
    std::shared_ptr<QPDFObject> const& object,
2016
    qpdf_offset_t end_before_space,
2017
    qpdf_offset_t end_after_space)
2018
2.52M
{
2019
2.52M
    object->setObjGen(this, og);
2020
2.52M
    if (isCached(og)) {
2021
1.93M
        auto& cache = m->obj_cache[og];
2022
1.93M
        cache.object->assign(object);
2023
1.93M
        cache.end_before_space = end_before_space;
2024
1.93M
        cache.end_after_space = end_after_space;
2025
1.93M
    } else {
2026
587k
        m->obj_cache[og] = ObjCache(object, end_before_space, end_after_space);
2027
587k
    }
2028
2.52M
}
2029
2030
bool
2031
QPDF::isCached(QPDFObjGen const& og)
2032
10.9M
{
2033
10.9M
    return m->obj_cache.count(og) != 0;
2034
10.9M
}
2035
2036
bool
2037
QPDF::isUnresolved(QPDFObjGen const& og)
2038
8.41M
{
2039
8.41M
    return !isCached(og) || m->obj_cache[og].object->isUnresolved();
2040
8.41M
}
2041
2042
// Return the object ID/generation to use for a new object: one more than the current object
// count, with generation 0. Throws std::range_error if that ID would not fit in an int.
QPDFObjGen
QPDF::nextObjGen()
{
    int const highest_id = toI(getObjectCount());
    if (highest_id < std::numeric_limits<int>::max()) {
        return QPDFObjGen(highest_id + 1, 0);
    }
    throw std::range_error("max object id is too high to create new objects");
}
2051
2052
// Store obj in the cache under the next available object ID and return an indirect handle to it.
// The end offsets are recorded as -1.
QPDFObjectHandle
QPDF::makeIndirectFromQPDFObject(std::shared_ptr<QPDFObject> const& obj)
{
    QPDFObjGen const next_og = nextObjGen();
    auto& entry = m->obj_cache[next_og];
    entry = ObjCache(obj, -1, -1);
    return newIndirect(next_og, entry.object);
}
2059
2060
// Turn the object held by oh into a new indirect object of this QPDF. Throws std::logic_error if
// oh is uninitialized.
QPDFObjectHandle
QPDF::makeIndirectObject(QPDFObjectHandle oh)
{
    if (oh.isInitialized()) {
        return makeIndirectFromQPDFObject(oh.getObj());
    }
    throw std::logic_error("attempted to make an uninitialized QPDFObjectHandle indirect");
}
2068
2069
// Create a new indirect object holding a reserved placeholder.
QPDFObjectHandle
QPDF::newReserved()
{
    auto placeholder = QPDF_Reserved::create();
    return makeIndirectFromQPDFObject(placeholder);
}
2074
2075
// Create a new indirect object holding a null.
QPDFObjectHandle
QPDF::newIndirectNull()
{
    auto null_object = QPDF_Null::create();
    return makeIndirectFromQPDFObject(null_object);
}
2080
2081
// Create a new indirect stream with an empty dictionary; the stream is created with offset 0 and
// length 0.
QPDFObjectHandle
QPDF::newStream()
{
    // The stream is created with the same object ID that makeIndirectFromQPDFObject will assign.
    auto stream =
        QPDF_Stream::create(this, nextObjGen(), QPDFObjectHandle::newDictionary(), 0, 0);
    return makeIndirectFromQPDFObject(stream);
}
2087
2088
// Create a new indirect stream and set its data from the given buffer, passing null for both the
// filter and decode-parameters arguments of replaceStreamData.
QPDFObjectHandle
QPDF::newStream(std::shared_ptr<Buffer> data)
{
    QPDFObjectHandle stream = newStream();
    stream.replaceStreamData(data, QPDFObjectHandle::newNull(), QPDFObjectHandle::newNull());
    return stream;
}
2095
2096
// Create a new indirect stream and set its data from the given string, passing null for both the
// filter and decode-parameters arguments of replaceStreamData.
QPDFObjectHandle
QPDF::newStream(std::string const& data)
{
    QPDFObjectHandle stream = newStream();
    stream.replaceStreamData(data, QPDFObjectHandle::newNull(), QPDFObjectHandle::newNull());
    return stream;
}
2103
2104
// Build a stream object for og with an empty dictionary, offset 0 and length 0, and return a
// handle to it. This function itself does not add the stream to the object cache.
QPDFObjectHandle
QPDF::reserveStream(QPDFObjGen const& og)
{
    auto stream = QPDF_Stream::create(this, og, QPDFObjectHandle::newDictionary(), 0, 0);
    return {stream};
}
2109
2110
std::shared_ptr<QPDFObject>
2111
QPDF::getObjectForParser(int id, int gen, bool parse_pdf)
2112
6.72M
{
2113
    // This method is called by the parser and therefore must not resolve any objects.
2114
6.72M
    auto og = QPDFObjGen(id, gen);
2115
6.72M
    if (auto iter = m->obj_cache.find(og); iter != m->obj_cache.end()) {
2116
3.50M
        return iter->second.object;
2117
3.50M
    }
2118
3.22M
    if (m->xref_table.count(og) || !m->parsed) {
2119
1.94M
        return m->obj_cache.insert({og, QPDF_Unresolved::create(this, og)}).first->second.object;
2120
1.94M
    }
2121
1.28M
    if (parse_pdf) {
2122
1.28M
        return QPDF_Null::create();
2123
1.28M
    }
2124
0
    return m->obj_cache.insert({og, QPDF_Null::create(this, og)}).first->second.object;
2125
1.28M
}
2126
2127
std::shared_ptr<QPDFObject>
2128
QPDF::getObjectForJSON(int id, int gen)
2129
652k
{
2130
652k
    auto og = QPDFObjGen(id, gen);
2131
652k
    auto [it, inserted] = m->obj_cache.try_emplace(og);
2132
652k
    auto& obj = it->second.object;
2133
652k
    if (inserted) {
2134
34.2k
        obj = (m->parsed && !m->xref_table.count(og)) ? QPDF_Null::create(this, og)
2135
34.2k
                                                      : QPDF_Unresolved::create(this, og);
2136
34.2k
    }
2137
652k
    return obj;
2138
652k
}
2139
2140
// Return a handle for og. A cached object is returned as is. If parsing is complete and the xref
// table lacks og, a null that is not added to the cache is returned. Otherwise an unresolved
// placeholder is cached and returned.
QPDFObjectHandle
QPDF::getObject(QPDFObjGen const& og)
{
    auto cached = m->obj_cache.find(og);
    if (cached != m->obj_cache.end()) {
        return {cached->second.object};
    }
    if (m->parsed && !m->xref_table.count(og)) {
        return QPDF_Null::create();
    }
    auto emplaced = m->obj_cache.try_emplace(og, QPDF_Unresolved::create(this, og), -1, -1);
    return {emplaced.first->second.object};
}
2152
2153
// Convenience overload: look up the object by separate ID and generation.
QPDFObjectHandle
QPDF::getObject(int objid, int generation)
{
    QPDFObjGen const og(objid, generation);
    return getObject(og);
}
2158
2159
// Same as getObject(og); forwards directly to it.
QPDFObjectHandle
QPDF::getObjectByObjGen(QPDFObjGen const& og)
{
    QPDFObjectHandle handle = getObject(og);
    return handle;
}
2164
2165
// Same as getObject(objid, generation); builds the QPDFObjGen and forwards to getObject.
QPDFObjectHandle
QPDF::getObjectByID(int objid, int generation)
{
    QPDFObjGen const og(objid, generation);
    return getObject(og);
}
2170
2171
void
2172
QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
2173
0
{
2174
0
    replaceObject(QPDFObjGen(objid, generation), oh);
2175
0
}
2176
2177
void
2178
QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle oh)
2179
21.7k
{
2180
21.7k
    if (oh.isIndirect() || !oh.isInitialized()) {
2181
0
        QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
2182
0
        throw std::logic_error("QPDF::replaceObject called with indirect object handle");
2183
0
    }
2184
21.7k
    updateCache(og, oh.getObj(), -1, -1);
2185
21.7k
}
2186
2187
void
2188
QPDF::removeObject(QPDFObjGen og)
2189
32.4k
{
2190
32.4k
    m->xref_table.erase(og);
2191
32.4k
    if (auto cached = m->obj_cache.find(og); cached != m->obj_cache.end()) {
2192
        // Take care of any object handles that may be floating around.
2193
31.3k
        cached->second.object->assign(QPDF_Null::create());
2194
31.3k
        cached->second.object->setObjGen(nullptr, QPDFObjGen());
2195
31.3k
        m->obj_cache.erase(cached);
2196
31.3k
    }
2197
32.4k
}
2198
2199
void
2200
QPDF::replaceReserved(QPDFObjectHandle reserved, QPDFObjectHandle replacement)
2201
0
{
2202
0
    QTC::TC("qpdf", "QPDF replaceReserved");
2203
0
    auto tc = reserved.getTypeCode();
2204
0
    if (!(tc == ::ot_reserved || tc == ::ot_null)) {
2205
0
        throw std::logic_error("replaceReserved called with non-reserved object");
2206
0
    }
2207
0
    replaceObject(reserved.getObjGen(), replacement);
2208
0
}
2209
2210
QPDFObjectHandle
QPDF::copyForeignObject(QPDFObjectHandle foreign)
{
    // Overview of the copy mechanism.
    //
    // An indirect QPDFObjectHandle belongs to an owning QPDF, and its object ID and generation
    // identify an object in that QPDF. Copying such a handle from a foreign QPDF into this one
    // therefore requires rewriting every indirect reference so that it points at the
    // corresponding object in the local file.
    //
    // For each foreign QPDF we copy from, an ObjCopier keeps a map from foreign ObjGen to the
    // local QPDFObjectHandle that stands in for it.
    //
    // Phase 1 (reserveObjects): deep-traverse the foreign object with loop detection, stopping at
    // page boundaries. For every indirect object not yet mapped, allocate a local "reserved"
    // object (or, for a stream, a new stream), record the mapping, and remember the foreign
    // object in the list of objects to copy.
    //
    // Phase 2 (replaceForeignIndirectObjects): build a copy of each listed object in which all
    // foreign indirect references have been replaced by their local counterparts, then replace
    // the local reservation with that copy. Because reservations exist before any content is
    // copied, objects with circular references can be copied in any order.
    //
    // Streams are not copied as objects. Their data is set up to be pulled from the original
    // stream through a stream data provider, in a way that doesn't require the original QPDF
    // object but may require the original source of the stream data, with special handling for
    // immediate_copy_from. That logic is also reached through replaceForeignIndirectObjects.
    //
    // Calling copyForeignObject on page objects is explicitly allowed: it is a documented use
    // case to copy pages this way if the intention is to not update the pages tree.

    if (!foreign.isIndirect()) {
        QTC::TC("qpdf", "QPDF copyForeign direct");
        throw std::logic_error("QPDF::copyForeign called with direct object handle");
    }
    QPDF& other = foreign.getQPDF();
    if (&other == this) {
        QTC::TC("qpdf", "QPDF copyForeign not foreign");
        throw std::logic_error("QPDF::copyForeign called with object from this QPDF");
    }

    ObjCopier& copier = m->object_copiers[other.m->unique_id];
    if (!copier.visiting.empty()) {
        throw std::logic_error("obj_copier.visiting is not empty"
                               " at the beginning of copyForeignObject");
    }

    // Ensure that every object referenced from the foreign object has a counterpart in this
    // file. copier.object_map maps foreign QPDFObjGen to local objects; anything newly
    // encountered is a reservation, except streams, which are created as streams right away.
    reserveObjects(foreign, copier, true);

    if (!copier.visiting.empty()) {
        throw std::logic_error("obj_copier.visiting is not empty after reserving objects");
    }

    // Produce the local copies and swap them in for the reservations. Streams are skipped here
    // because their local object is already the final stream.
    for (auto& source: copier.to_copy) {
        QPDFObjectHandle copy = replaceForeignIndirectObjects(source, copier, true);
        if (source.isStream()) {
            continue;
        }
        replaceReserved(copier.object_map[source.getObjGen()], copy);
    }
    copier.to_copy.clear();

    // No mapping for the requested object means reserveObjects declined to copy it.
    auto const mapped = copier.object_map.find(foreign.getObjGen());
    if (mapped == copier.object_map.end()) {
        warn(damagedPDF("unexpected reference to /Pages object while copying foreign object; "
                        "replacing with null"));
        return QPDFObjectHandle::newNull();
    }
    return mapped->second;
}
2291
2292
void
2293
QPDF::reserveObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top)
2294
0
{
2295
0
    auto foreign_tc = foreign.getTypeCode();
2296
0
    if (foreign_tc == ::ot_reserved) {
2297
0
        throw std::logic_error("QPDF: attempting to copy a foreign reserved object");
2298
0
    }
2299
2300
0
    if (foreign.isPagesObject()) {
2301
0
        QTC::TC("qpdf", "QPDF not copying pages object");
2302
0
        return;
2303
0
    }
2304
2305
0
    if (foreign.isIndirect()) {
2306
0
        QPDFObjGen foreign_og(foreign.getObjGen());
2307
0
        if (!obj_copier.visiting.add(foreign_og)) {
2308
0
            QTC::TC("qpdf", "QPDF loop reserving objects");
2309
0
            return;
2310
0
        }
2311
0
        if (obj_copier.object_map.count(foreign_og) > 0) {
2312
0
            QTC::TC("qpdf", "QPDF already reserved object");
2313
0
            if (!(top && foreign.isPageObject() && obj_copier.object_map[foreign_og].isNull())) {
2314
0
                obj_copier.visiting.erase(foreign);
2315
0
                return;
2316
0
            }
2317
0
        } else {
2318
0
            QTC::TC("qpdf", "QPDF copy indirect");
2319
0
            obj_copier.object_map[foreign_og] =
2320
0
                foreign.isStream() ? newStream() : newIndirectNull();
2321
0
            if ((!top) && foreign.isPageObject()) {
2322
0
                QTC::TC("qpdf", "QPDF not crossing page boundary");
2323
0
                obj_copier.visiting.erase(foreign_og);
2324
0
                return;
2325
0
            }
2326
0
        }
2327
0
        obj_copier.to_copy.push_back(foreign);
2328
0
    }
2329
2330
0
    if (foreign_tc == ::ot_array) {
2331
0
        QTC::TC("qpdf", "QPDF reserve array");
2332
0
        int n = foreign.getArrayNItems();
2333
0
        for (int i = 0; i < n; ++i) {
2334
0
            reserveObjects(foreign.getArrayItem(i), obj_copier, false);
2335
0
        }
2336
0
    } else if (foreign_tc == ::ot_dictionary) {
2337
0
        QTC::TC("qpdf", "QPDF reserve dictionary");
2338
0
        for (auto const& key: foreign.getKeys()) {
2339
0
            reserveObjects(foreign.getKey(key), obj_copier, false);
2340
0
        }
2341
0
    } else if (foreign_tc == ::ot_stream) {
2342
0
        QTC::TC("qpdf", "QPDF reserve stream");
2343
0
        reserveObjects(foreign.getDict(), obj_copier, false);
2344
0
    }
2345
2346
0
    obj_copier.visiting.erase(foreign);
2347
0
}
2348
2349
QPDFObjectHandle
QPDF::replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top)
{
    // Build the local equivalent of foreign: containers are rebuilt recursively, nested indirect
    // references are swapped for the local objects recorded in obj_copier.object_map, streams
    // reuse their pre-created local stream, and scalars are copied as direct objects. top is
    // true for the object being copied itself, so its own indirectness is not looked up.
    auto const type_code = foreign.getTypeCode();

    if ((!top) && foreign.isIndirect()) {
        QTC::TC("qpdf", "QPDF replace indirect");
        auto const mapped = obj_copier.object_map.find(foreign.getObjGen());
        if (mapped != obj_copier.object_map.end()) {
            return mapped->second;
        }
        // This case would occur if this is a reference to a Pages object that we didn't
        // traverse into.
        QTC::TC("qpdf", "QPDF replace foreign indirect with null");
        return QPDFObjectHandle::newNull();
    }

    QPDFObjectHandle result;
    switch (type_code) {
    case ::ot_array: {
        QTC::TC("qpdf", "QPDF replace array");
        result = QPDFObjectHandle::newArray();
        int const count = foreign.getArrayNItems();
        for (int idx = 0; idx < count; ++idx) {
            result.appendItem(
                replaceForeignIndirectObjects(foreign.getArrayItem(idx), obj_copier, false));
        }
        break;
    }
    case ::ot_dictionary: {
        QTC::TC("qpdf", "QPDF replace dictionary");
        result = QPDFObjectHandle::newDictionary();
        for (auto const& name: foreign.getKeys()) {
            result.replaceKey(
                name, replaceForeignIndirectObjects(foreign.getKey(name), obj_copier, false));
        }
        break;
    }
    case ::ot_stream: {
        QTC::TC("qpdf", "QPDF replace stream");
        // The local stream was already created while reserving objects; fill in its dictionary
        // and hook up its data.
        result = obj_copier.object_map[foreign.getObjGen()];
        result.assertStream();
        QPDFObjectHandle local_dict = result.getDict();
        QPDFObjectHandle foreign_dict = foreign.getDict();
        for (auto const& name: foreign_dict.getKeys()) {
            local_dict.replaceKey(
                name, replaceForeignIndirectObjects(foreign_dict.getKey(name), obj_copier, false));
        }
        copyStreamData(result, foreign);
        break;
    }
    default: {
        foreign.assertScalar();
        result = foreign;
        result.makeDirect();
        break;
    }
    }

    if (top && (!result.isStream()) && result.isIndirect()) {
        throw std::logic_error("replacement for foreign object is indirect");
    }

    return result;
}
2406
2407
void
2408
QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
2409
0
{
2410
    // This method was originally written for copying foreign streams, but it is used by
2411
    // QPDFObjectHandle to copy streams from the same QPDF object as well.
2412
2413
0
    QPDFObjectHandle dict = result.getDict();
2414
0
    QPDFObjectHandle old_dict = foreign.getDict();
2415
0
    if (m->copied_stream_data_provider == nullptr) {
2416
0
        m->copied_stream_data_provider = new CopiedStreamDataProvider(*this);
2417
0
        m->copied_streams =
2418
0
            std::shared_ptr<QPDFObjectHandle::StreamDataProvider>(m->copied_stream_data_provider);
2419
0
    }
2420
0
    QPDFObjGen local_og(result.getObjGen());
2421
    // Copy information from the foreign stream so we can pipe its data later without keeping the
2422
    // original QPDF object around.
2423
2424
0
    QPDF& foreign_stream_qpdf =
2425
0
        foreign.getQPDF("unable to retrieve owning qpdf from foreign stream");
2426
2427
0
    auto stream = foreign.getObjectPtr()->as<QPDF_Stream>();
2428
0
    if (stream == nullptr) {
2429
0
        throw std::logic_error("unable to retrieve underlying"
2430
0
                               " stream object from foreign stream");
2431
0
    }
2432
0
    std::shared_ptr<Buffer> stream_buffer = stream->getStreamDataBuffer();
2433
0
    if ((foreign_stream_qpdf.m->immediate_copy_from) && (stream_buffer == nullptr)) {
2434
        // Pull the stream data into a buffer before attempting the copy operation. Do it on the
2435
        // source stream so that if the source stream is copied multiple times, we don't have to
2436
        // keep duplicating the memory.
2437
0
        QTC::TC("qpdf", "QPDF immediate copy stream data");
2438
0
        foreign.replaceStreamData(
2439
0
            foreign.getRawStreamData(),
2440
0
            old_dict.getKey("/Filter"),
2441
0
            old_dict.getKey("/DecodeParms"));
2442
0
        stream_buffer = stream->getStreamDataBuffer();
2443
0
    }
2444
0
    std::shared_ptr<QPDFObjectHandle::StreamDataProvider> stream_provider =
2445
0
        stream->getStreamDataProvider();
2446
0
    if (stream_buffer.get()) {
2447
0
        QTC::TC("qpdf", "QPDF copy foreign stream with buffer");
2448
0
        result.replaceStreamData(
2449
0
            stream_buffer, dict.getKey("/Filter"), dict.getKey("/DecodeParms"));
2450
0
    } else if (stream_provider.get()) {
2451
        // In this case, the remote stream's QPDF must stay in scope.
2452
0
        QTC::TC("qpdf", "QPDF copy foreign stream with provider");
2453
0
        m->copied_stream_data_provider->registerForeignStream(local_og, foreign);
2454
0
        result.replaceStreamData(
2455
0
            m->copied_streams, dict.getKey("/Filter"), dict.getKey("/DecodeParms"));
2456
0
    } else {
2457
0
        auto foreign_stream_data = std::make_shared<ForeignStreamData>(
2458
0
            foreign_stream_qpdf.m->encp,
2459
0
            foreign_stream_qpdf.m->file,
2460
0
            foreign.getObjGen(),
2461
0
            stream->getParsedOffset(),
2462
0
            stream->getLength(),
2463
0
            dict);
2464
0
        m->copied_stream_data_provider->registerForeignStream(local_og, foreign_stream_data);
2465
0
        result.replaceStreamData(
2466
0
            m->copied_streams, dict.getKey("/Filter"), dict.getKey("/DecodeParms"));
2467
0
    }
2468
0
}
2469
2470
void
2471
QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
2472
0
{
2473
0
    swapObjects(QPDFObjGen(objid1, generation1), QPDFObjGen(objid2, generation2));
2474
0
}
2475
2476
void
2477
QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2)
2478
0
{
2479
    // Force objects to be read from the input source if needed, then swap them in the cache.
2480
0
    resolve(og1);
2481
0
    resolve(og2);
2482
0
    m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object);
2483
0
}
2484
2485
unsigned long long
2486
QPDF::getUniqueId() const
2487
0
{
2488
0
    return m->unique_id;
2489
0
}
2490
2491
std::string
2492
QPDF::getFilename() const
2493
872k
{
2494
872k
    return m->file->getName();
2495
872k
}
2496
2497
PDFVersion
QPDF::getVersionAsPDFVersion()
{
    // Returns the PDF version as a {major, minor, extension level} triple. The major and minor
    // numbers are parsed from the stored version string; if it does not start (after optional
    // whitespace) with "<digits>.<digits>", the version defaults to 1.3. The extension level
    // comes from getExtensionLevel().

    // Constructing a std::regex compiles the pattern, so build it once and reuse it on every
    // call instead of recompiling it each time. Function-local statics are initialized exactly
    // once, and matching against a const regex does not modify it.
    static std::regex const version_re("^[[:space:]]*([0-9]+)\\.([0-9]+)");

    int major = 1;
    int minor = 3;
    int extension_level = getExtensionLevel();

    std::smatch match;
    if (std::regex_search(m->pdf_version, match, version_re)) {
        major = QUtil::string_to_int(match[1].str().c_str());
        minor = QUtil::string_to_int(match[2].str().c_str());
    }

    return {major, minor, extension_level};
}
2513
2514
std::string
2515
QPDF::getPDFVersion() const
2516
33.1k
{
2517
33.1k
    return m->pdf_version;
2518
33.1k
}
2519
2520
int
2521
QPDF::getExtensionLevel()
2522
33.1k
{
2523
33.1k
    int result = 0;
2524
33.1k
    QPDFObjectHandle obj = getRoot();
2525
33.1k
    if (obj.hasKey("/Extensions")) {
2526
928
        obj = obj.getKey("/Extensions");
2527
928
        if (obj.isDictionary() && obj.hasKey("/ADBE")) {
2528
829
            obj = obj.getKey("/ADBE");
2529
829
            if (obj.isDictionary() && obj.hasKey("/ExtensionLevel")) {
2530
626
                obj = obj.getKey("/ExtensionLevel");
2531
626
                if (obj.isInteger()) {
2532
609
                    result = obj.getIntValueAsInt();
2533
609
                }
2534
626
            }
2535
829
        }
2536
928
    }
2537
33.1k
    return result;
2538
33.1k
}
2539
2540
QPDFObjectHandle
QPDF::getTrailer()
{
    // Returns a handle to the stored trailer object.
    QPDFObjectHandle trailer = m->trailer;
    return trailer;
}
2545
2546
QPDFObjectHandle
QPDF::getRoot()
{
    // Returns the trailer's /Root entry. Throws a damaged-PDF exception if it is not a
    // dictionary.
    QPDFObjectHandle root = m->trailer.getKey("/Root");
    if (!root.isDictionary()) {
        throw damagedPDF("", 0, "unable to find /Root dictionary");
    }

    // Check_mode is an interim solution to request #810 pending a more comprehensive review of
    // the approach to more extensive checks and warning levels. In check mode, a missing or
    // wrong /Type is reported as a warning and repaired in place.
    if (m->check_mode && !root.getKey("/Type").isNameAndEquals("/Catalog")) {
        warn(damagedPDF("", 0, "catalog /Type entry missing or invalid"));
        root.replaceKey("/Type", "/Catalog"_qpdf);
    }
    return root;
}
2561
2562
std::map<QPDFObjGen, QPDFXRefEntry>
2563
QPDF::getXRefTable()
2564
0
{
2565
0
    return getXRefTableInternal();
2566
0
}
2567
2568
std::map<QPDFObjGen, QPDFXRefEntry> const&
2569
QPDF::getXRefTableInternal()
2570
22.0k
{
2571
22.0k
    if (!m->parsed) {
2572
0
        throw std::logic_error("QPDF::getXRefTable called before parsing.");
2573
0
    }
2574
2575
22.0k
    return m->xref_table;
2576
22.0k
}
2577
2578
size_t
2579
QPDF::tableSize()
2580
33.2k
{
2581
    // If obj_cache is dense, accommodate all object in tables,else accommodate only original
2582
    // objects.
2583
33.2k
    auto max_xref = m->xref_table.size() ? m->xref_table.crbegin()->first.getObj() : 0;
2584
33.2k
    auto max_obj = m->obj_cache.size() ? m->obj_cache.crbegin()->first.getObj() : 0;
2585
33.2k
    auto max_id = std::numeric_limits<int>::max() - 1;
2586
33.2k
    if (max_obj >= max_id || max_xref >= max_id) {
2587
        // Temporary fix. Long-term solution is
2588
        // - QPDFObjGen to enforce objgens are valid and sensible
2589
        // - xref table and obj cache to protect against insertion of impossibly large obj ids
2590
23
        stopOnError("Impossibly large object id encountered.");
2591
23
    }
2592
33.2k
    if (max_obj < 1.1 * std::max(toI(m->obj_cache.size()), max_xref)) {
2593
29.0k
        return toS(++max_obj);
2594
29.0k
    }
2595
4.19k
    return toS(++max_xref);
2596
33.2k
}
2597
2598
std::vector<QPDFObjGen>
2599
QPDF::getCompressibleObjVector()
2600
5.27k
{
2601
5.27k
    return getCompressibleObjGens<QPDFObjGen>();
2602
5.27k
}
2603
2604
std::vector<bool>
2605
QPDF::getCompressibleObjSet()
2606
2.63k
{
2607
2.63k
    return getCompressibleObjGens<bool>();
2608
2.63k
}
2609
2610
template <typename T>
2611
std::vector<T>
2612
QPDF::getCompressibleObjGens()
2613
7.91k
{
2614
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
2615
    // by traversing the document from the root, including a traversal of the pages tree.  This
2616
    // makes that objects that are on the same page are more likely to be in the same object stream,
2617
    // which is slightly more efficient, particularly with linearized files.  This is better than
2618
    // iterating through the xref table since it avoids preserving orphaned items.
2619
2620
    // Exclude encryption dictionary, if any
2621
7.91k
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
2622
7.91k
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2623
2624
7.91k
    const size_t max_obj = getObjectCount();
2625
7.91k
    std::vector<bool> visited(max_obj, false);
2626
7.91k
    std::vector<QPDFObjectHandle> queue;
2627
7.91k
    queue.reserve(512);
2628
7.91k
    queue.push_back(m->trailer);
2629
7.91k
    std::vector<T> result;
2630
7.91k
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
2631
2.63k
        result.reserve(m->obj_cache.size());
2632
2.63k
    } else if constexpr (std::is_same_v<T, bool>) {
2633
2.52k
        result.resize(max_obj + 1U, false);
2634
2.52k
    } else {
2635
7.91k
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
2636
7.91k
    }
2637
5.07M
    while (!queue.empty()) {
2638
5.07M
        auto obj = queue.back();
2639
5.07M
        queue.pop_back();
2640
5.07M
        if (obj.getObjectID() > 0) {
2641
828k
            QPDFObjGen og = obj.getObjGen();
2642
828k
            const size_t id = toS(og.getObj() - 1);
2643
828k
            if (id >= max_obj) {
2644
0
                throw std::logic_error(
2645
0
                    "unexpected object id encountered in getCompressibleObjGens");
2646
0
            }
2647
828k
            if (visited[id]) {
2648
445k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2649
445k
                continue;
2650
445k
            }
2651
2652
            // Check whether this is the current object. If not, remove it (which changes it into a
2653
            // direct null and therefore stops us from revisiting it) and move on to the next object
2654
            // in the queue.
2655
383k
            auto upper = m->obj_cache.upper_bound(og);
2656
383k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2657
766
                removeObject(og);
2658
766
                continue;
2659
766
            }
2660
2661
382k
            visited[id] = true;
2662
2663
382k
            if (og == encryption_dict_og) {
2664
161
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2665
382k
            } else if (!(obj.isStream() ||
2666
382k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2667
292k
                          obj.hasKey("/Contents")))) {
2668
292k
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2669
162k
                    result.push_back(og);
2670
162k
                } else if constexpr (std::is_same_v<T, bool>) {
2671
162k
                    result[id + 1U] = true;
2672
162k
                }
2673
292k
            }
2674
382k
        }
2675
4.62M
        if (obj.isStream()) {
2676
89.9k
            QPDFObjectHandle dict = obj.getDict();
2677
89.9k
            std::set<std::string> keys = dict.getKeys();
2678
473k
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2679
383k
                std::string const& key = *iter;
2680
383k
                QPDFObjectHandle value = dict.getKey(key);
2681
383k
                if (key == "/Length") {
2682
                    // omit stream lengths
2683
86.9k
                    if (value.isIndirect()) {
2684
53.3k
                        QTC::TC("qpdf", "QPDF exclude indirect length");
2685
53.3k
                    }
2686
296k
                } else {
2687
296k
                    queue.push_back(value);
2688
296k
                }
2689
383k
            }
2690
4.53M
        } else if (obj.isDictionary()) {
2691
470k
            std::set<std::string> keys = obj.getKeys();
2692
2.13M
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2693
1.66M
                queue.push_back(obj.getKey(*iter));
2694
1.66M
            }
2695
4.06M
        } else if (obj.isArray()) {
2696
293k
            int n = obj.getArrayNItems();
2697
3.39M
            for (int i = 1; i <= n; ++i) {
2698
3.10M
                queue.push_back(obj.getArrayItem(n - i));
2699
3.10M
            }
2700
293k
        }
2701
4.62M
    }
2702
2703
7.91k
    return result;
2704
7.91k
}
std::__1::vector<QPDFObjGen, std::__1::allocator<QPDFObjGen> > QPDF::getCompressibleObjGens<QPDFObjGen>()
Line
Count
Source
2613
5.27k
{
2614
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
2615
    // by traversing the document from the root, including a traversal of the pages tree.  This
2616
    // makes that objects that are on the same page are more likely to be in the same object stream,
2617
    // which is slightly more efficient, particularly with linearized files.  This is better than
2618
    // iterating through the xref table since it avoids preserving orphaned items.
2619
2620
    // Exclude encryption dictionary, if any
2621
5.27k
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
2622
5.27k
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2623
2624
5.27k
    const size_t max_obj = getObjectCount();
2625
5.27k
    std::vector<bool> visited(max_obj, false);
2626
5.27k
    std::vector<QPDFObjectHandle> queue;
2627
5.27k
    queue.reserve(512);
2628
5.27k
    queue.push_back(m->trailer);
2629
5.27k
    std::vector<T> result;
2630
5.27k
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
2631
5.20k
        result.reserve(m->obj_cache.size());
2632
5.20k
    } else if constexpr (std::is_same_v<T, bool>) {
2633
5.27k
        result.resize(max_obj + 1U, false);
2634
5.27k
    } else {
2635
5.27k
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
2636
5.27k
    }
2637
2.26M
    while (!queue.empty()) {
2638
2.26M
        auto obj = queue.back();
2639
2.26M
        queue.pop_back();
2640
2.26M
        if (obj.getObjectID() > 0) {
2641
420k
            QPDFObjGen og = obj.getObjGen();
2642
420k
            const size_t id = toS(og.getObj() - 1);
2643
420k
            if (id >= max_obj) {
2644
0
                throw std::logic_error(
2645
0
                    "unexpected object id encountered in getCompressibleObjGens");
2646
0
            }
2647
420k
            if (visited[id]) {
2648
225k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2649
225k
                continue;
2650
225k
            }
2651
2652
            // Check whether this is the current object. If not, remove it (which changes it into a
2653
            // direct null and therefore stops us from revisiting it) and move on to the next object
2654
            // in the queue.
2655
195k
            auto upper = m->obj_cache.upper_bound(og);
2656
195k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2657
362
                removeObject(og);
2658
362
                continue;
2659
362
            }
2660
2661
194k
            visited[id] = true;
2662
2663
194k
            if (og == encryption_dict_og) {
2664
0
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2665
194k
            } else if (!(obj.isStream() ||
2666
194k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2667
129k
                          obj.hasKey("/Contents")))) {
2668
129k
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2669
129k
                    result.push_back(og);
2670
129k
                } else if constexpr (std::is_same_v<T, bool>) {
2671
129k
                    result[id + 1U] = true;
2672
129k
                }
2673
129k
            }
2674
194k
        }
2675
2.03M
        if (obj.isStream()) {
2676
65.1k
            QPDFObjectHandle dict = obj.getDict();
2677
65.1k
            std::set<std::string> keys = dict.getKeys();
2678
319k
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2679
254k
                std::string const& key = *iter;
2680
254k
                QPDFObjectHandle value = dict.getKey(key);
2681
254k
                if (key == "/Length") {
2682
                    // omit stream lengths
2683
62.7k
                    if (value.isIndirect()) {
2684
49.3k
                        QTC::TC("qpdf", "QPDF exclude indirect length");
2685
49.3k
                    }
2686
191k
                } else {
2687
191k
                    queue.push_back(value);
2688
191k
                }
2689
254k
            }
2690
1.97M
        } else if (obj.isDictionary()) {
2691
251k
            std::set<std::string> keys = obj.getKeys();
2692
1.09M
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2693
847k
                queue.push_back(obj.getKey(*iter));
2694
847k
            }
2695
1.72M
        } else if (obj.isArray()) {
2696
156k
            int n = obj.getArrayNItems();
2697
1.37M
            for (int i = 1; i <= n; ++i) {
2698
1.21M
                queue.push_back(obj.getArrayItem(n - i));
2699
1.21M
            }
2700
156k
        }
2701
2.03M
    }
2702
2703
5.27k
    return result;
2704
5.27k
}
std::__1::vector<bool, std::__1::allocator<bool> > QPDF::getCompressibleObjGens<bool>()
Line
Count
Source
2613
2.63k
{
2614
    // Return a list of objects that are allowed to be in object streams.  Walk through the objects
2615
    // by traversing the document from the root, including a traversal of the pages tree.  This
2616
    // makes that objects that are on the same page are more likely to be in the same object stream,
2617
    // which is slightly more efficient, particularly with linearized files.  This is better than
2618
    // iterating through the xref table since it avoids preserving orphaned items.
2619
2620
    // Exclude encryption dictionary, if any
2621
2.63k
    QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt");
2622
2.63k
    QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2623
2624
2.63k
    const size_t max_obj = getObjectCount();
2625
2.63k
    std::vector<bool> visited(max_obj, false);
2626
2.63k
    std::vector<QPDFObjectHandle> queue;
2627
2.63k
    queue.reserve(512);
2628
2.63k
    queue.push_back(m->trailer);
2629
2.63k
    std::vector<T> result;
2630
2.63k
    if constexpr (std::is_same_v<T, QPDFObjGen>) {
2631
2.63k
        result.reserve(m->obj_cache.size());
2632
2.63k
    } else if constexpr (std::is_same_v<T, bool>) {
2633
2.52k
        result.resize(max_obj + 1U, false);
2634
2.52k
    } else {
2635
2.63k
        throw std::logic_error("Unsupported type in QPDF::getCompressibleObjGens");
2636
2.63k
    }
2637
2.81M
    while (!queue.empty()) {
2638
2.80M
        auto obj = queue.back();
2639
2.80M
        queue.pop_back();
2640
2.80M
        if (obj.getObjectID() > 0) {
2641
407k
            QPDFObjGen og = obj.getObjGen();
2642
407k
            const size_t id = toS(og.getObj() - 1);
2643
407k
            if (id >= max_obj) {
2644
0
                throw std::logic_error(
2645
0
                    "unexpected object id encountered in getCompressibleObjGens");
2646
0
            }
2647
407k
            if (visited[id]) {
2648
219k
                QTC::TC("qpdf", "QPDF loop detected traversing objects");
2649
219k
                continue;
2650
219k
            }
2651
2652
            // Check whether this is the current object. If not, remove it (which changes it into a
2653
            // direct null and therefore stops us from revisiting it) and move on to the next object
2654
            // in the queue.
2655
188k
            auto upper = m->obj_cache.upper_bound(og);
2656
188k
            if (upper != m->obj_cache.end() && upper->first.getObj() == og.getObj()) {
2657
404
                removeObject(og);
2658
404
                continue;
2659
404
            }
2660
2661
187k
            visited[id] = true;
2662
2663
187k
            if (og == encryption_dict_og) {
2664
161
                QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2665
187k
            } else if (!(obj.isStream() ||
2666
187k
                         (obj.isDictionaryOfType("/Sig") && obj.hasKey("/ByteRange") &&
2667
162k
                          obj.hasKey("/Contents")))) {
2668
162k
                if constexpr (std::is_same_v<T, QPDFObjGen>) {
2669
162k
                    result.push_back(og);
2670
162k
                } else if constexpr (std::is_same_v<T, bool>) {
2671
162k
                    result[id + 1U] = true;
2672
162k
                }
2673
162k
            }
2674
187k
        }
2675
2.58M
        if (obj.isStream()) {
2676
24.8k
            QPDFObjectHandle dict = obj.getDict();
2677
24.8k
            std::set<std::string> keys = dict.getKeys();
2678
154k
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2679
129k
                std::string const& key = *iter;
2680
129k
                QPDFObjectHandle value = dict.getKey(key);
2681
129k
                if (key == "/Length") {
2682
                    // omit stream lengths
2683
24.2k
                    if (value.isIndirect()) {
2684
3.96k
                        QTC::TC("qpdf", "QPDF exclude indirect length");
2685
3.96k
                    }
2686
104k
                } else {
2687
104k
                    queue.push_back(value);
2688
104k
                }
2689
129k
            }
2690
2.56M
        } else if (obj.isDictionary()) {
2691
219k
            std::set<std::string> keys = obj.getKeys();
2692
1.03M
            for (auto iter = keys.rbegin(); iter != keys.rend(); ++iter) {
2693
817k
                queue.push_back(obj.getKey(*iter));
2694
817k
            }
2695
2.34M
        } else if (obj.isArray()) {
2696
136k
            int n = obj.getArrayNItems();
2697
2.01M
            for (int i = 1; i <= n; ++i) {
2698
1.88M
                queue.push_back(obj.getArrayItem(n - i));
2699
1.88M
            }
2700
136k
        }
2701
2.58M
    }
2702
2703
2.63k
    return result;
2704
2.63k
}
2705
2706
bool
2707
QPDF::pipeStreamData(
2708
    std::shared_ptr<EncryptionParameters> encp,
2709
    std::shared_ptr<InputSource> file,
2710
    QPDF& qpdf_for_warning,
2711
    QPDFObjGen const& og,
2712
    qpdf_offset_t offset,
2713
    size_t length,
2714
    QPDFObjectHandle stream_dict,
2715
    Pipeline* pipeline,
2716
    bool suppress_warnings,
2717
    bool will_retry)
2718
713k
{
2719
713k
    std::unique_ptr<Pipeline> to_delete;
2720
713k
    if (encp->encrypted) {
2721
12.1k
        decryptStream(encp, file, qpdf_for_warning, pipeline, og, stream_dict, to_delete);
2722
12.1k
    }
2723
2724
713k
    bool attempted_finish = false;
2725
713k
    try {
2726
713k
        file->seek(offset, SEEK_SET);
2727
713k
        auto buf = std::make_unique<char[]>(length);
2728
713k
        if (auto read = file->read(buf.get(), length); read != length) {
2729
0
            throw damagedPDF(file, "", offset + toO(read), "unexpected EOF reading stream data");
2730
0
        }
2731
713k
        pipeline->write(buf.get(), length);
2732
713k
        attempted_finish = true;
2733
713k
        pipeline->finish();
2734
713k
        return true;
2735
713k
    } catch (QPDFExc& e) {
2736
0
        if (!suppress_warnings) {
2737
0
            qpdf_for_warning.warn(e);
2738
0
        }
2739
33.4k
    } catch (std::exception& e) {
2740
33.4k
        if (!suppress_warnings) {
2741
33.4k
            QTC::TC("qpdf", "QPDF decoding error warning");
2742
33.4k
            qpdf_for_warning.warn(
2743
                // line-break
2744
33.4k
                damagedPDF(
2745
33.4k
                    file,
2746
33.4k
                    "",
2747
33.4k
                    file->getLastOffset(),
2748
33.4k
                    ("error decoding stream data for object " + og.unparse(' ') + ": " +
2749
33.4k
                     e.what())));
2750
33.4k
            if (will_retry) {
2751
28.1k
                qpdf_for_warning.warn(
2752
                    // line-break
2753
28.1k
                    damagedPDF(
2754
28.1k
                        file,
2755
28.1k
                        "",
2756
28.1k
                        file->getLastOffset(),
2757
28.1k
                        "stream will be re-processed without filtering to avoid data loss"));
2758
28.1k
            }
2759
33.4k
        }
2760
33.4k
    }
2761
32.9k
    if (!attempted_finish) {
2762
22.8k
        try {
2763
22.8k
            pipeline->finish();
2764
22.8k
        } catch (std::exception&) {
2765
            // ignore
2766
12.2k
        }
2767
22.8k
    }
2768
32.9k
    return false;
2769
32.9k
}
2770
2771
bool
2772
QPDF::pipeStreamData(
2773
    QPDFObjGen const& og,
2774
    qpdf_offset_t offset,
2775
    size_t length,
2776
    QPDFObjectHandle stream_dict,
2777
    Pipeline* pipeline,
2778
    bool suppress_warnings,
2779
    bool will_retry)
2780
713k
{
2781
713k
    return pipeStreamData(
2782
713k
        m->encp,
2783
713k
        m->file,
2784
713k
        *this,
2785
713k
        og,
2786
713k
        offset,
2787
713k
        length,
2788
713k
        stream_dict,
2789
713k
        pipeline,
2790
713k
        suppress_warnings,
2791
713k
        will_retry);
2792
713k
}
2793
2794
bool
2795
QPDF::pipeForeignStreamData(
2796
    std::shared_ptr<ForeignStreamData> foreign,
2797
    Pipeline* pipeline,
2798
    bool suppress_warnings,
2799
    bool will_retry)
2800
0
{
2801
0
    if (foreign->encp->encrypted) {
2802
0
        QTC::TC("qpdf", "QPDF pipe foreign encrypted stream");
2803
0
    }
2804
0
    return pipeStreamData(
2805
0
        foreign->encp,
2806
0
        foreign->file,
2807
0
        *this,
2808
0
        foreign->foreign_og,
2809
0
        foreign->offset,
2810
0
        foreign->length,
2811
0
        foreign->local_dict,
2812
0
        pipeline,
2813
0
        suppress_warnings,
2814
0
        will_retry);
2815
0
}
2816
2817
// Throw a generic exception when we lack context for something more specific. New code should not
2818
// use this. This method exists to improve somewhat from calling assert in very old code.
2819
void
2820
QPDF::stopOnError(std::string const& message)
2821
46.7k
{
2822
46.7k
    throw damagedPDF("", message);
2823
46.7k
}
2824
2825
// Return an exception of type qpdf_e_damaged_pdf.
2826
QPDFExc
2827
QPDF::damagedPDF(
2828
    std::shared_ptr<InputSource> const& input,
2829
    std::string const& object,
2830
    qpdf_offset_t offset,
2831
    std::string const& message)
2832
333k
{
2833
333k
    return {qpdf_e_damaged_pdf, input->getName(), object, offset, message};
2834
333k
}
2835
2836
// Return an exception of type qpdf_e_damaged_pdf.  The object is taken from
2837
// m->last_object_description.
2838
QPDFExc
2839
QPDF::damagedPDF(
2840
    std::shared_ptr<InputSource> const& input, qpdf_offset_t offset, std::string const& message)
2841
271k
{
2842
271k
    return damagedPDF(input, m->last_object_description, offset, message);
2843
271k
}
2844
2845
// Return an exception of type qpdf_e_damaged_pdf.  The filename is taken from m->file.
2846
QPDFExc
2847
QPDF::damagedPDF(std::string const& object, qpdf_offset_t offset, std::string const& message)
2848
884k
{
2849
884k
    return {qpdf_e_damaged_pdf, m->file->getName(), object, offset, message};
2850
884k
}
2851
2852
// Return an exception of type qpdf_e_damaged_pdf.  The filename is taken from m->file and the
2853
// offset from .m->file->getLastOffset().
2854
QPDFExc
2855
QPDF::damagedPDF(std::string const& object, std::string const& message)
2856
79.9k
{
2857
79.9k
    return damagedPDF(object, m->file->getLastOffset(), message);
2858
79.9k
}
2859
2860
// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from m->file and the object
2861
// from .m->last_object_description.
2862
QPDFExc
2863
QPDF::damagedPDF(qpdf_offset_t offset, std::string const& message)
2864
162k
{
2865
162k
    return damagedPDF(m->last_object_description, offset, message);
2866
162k
}
2867
2868
// Return an exception of type qpdf_e_damaged_pdf.  The filename is taken from m->file, the object
2869
// from m->last_object_description and the offset from m->file->getLastOffset().
2870
QPDFExc
2871
QPDF::damagedPDF(std::string const& message)
2872
281k
{
2873
281k
    return damagedPDF(m->last_object_description, m->file->getLastOffset(), message);
2874
281k
}
2875
2876
bool
2877
QPDF::everCalledGetAllPages() const
2878
0
{
2879
0
    return m->ever_called_get_all_pages;
2880
0
}
2881
2882
bool
2883
QPDF::everPushedInheritedAttributesToPages() const
2884
0
{
2885
0
    return m->ever_pushed_inherited_attributes_to_pages;
2886
0
}
2887
2888
void
2889
QPDF::removeSecurityRestrictions()
2890
0
{
2891
0
    auto root = getRoot();
2892
0
    root.removeKey("/Perms");
2893
0
    auto acroform = root.getKey("/AcroForm");
2894
0
    if (acroform.isDictionary() && acroform.hasKey("/SigFlags")) {
2895
0
        acroform.replaceKey("/SigFlags", QPDFObjectHandle::newInteger(0));
2896
0
    }
2897
0
}