Coverage Report

Created: 2025-06-22 06:30

/src/qpdf/libqpdf/QPDF_linearization.cc
Line
Count
Source (jump to first uncovered line)
1
// See doc/linearization.
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <qpdf/BitStream.hh>
6
#include <qpdf/BitWriter.hh>
7
#include <qpdf/InputSource_private.hh>
8
#include <qpdf/Pipeline_private.hh>
9
#include <qpdf/Pl_Buffer.hh>
10
#include <qpdf/Pl_Flate.hh>
11
#include <qpdf/Pl_String.hh>
12
#include <qpdf/QPDFExc.hh>
13
#include <qpdf/QPDFObjectHandle_private.hh>
14
#include <qpdf/QPDFWriter_private.hh>
15
#include <qpdf/QTC.hh>
16
#include <qpdf/QUtil.hh>
17
#include <qpdf/Util.hh>
18
19
#include <algorithm>
20
#include <cmath>
21
#include <cstring>
22
23
using namespace qpdf;
24
using namespace std::literals;
25
26
template <class T, class int_type>
27
static void
28
load_vector_int(
29
    BitStream& bit_stream, int nitems, std::vector<T>& vec, int bits_wanted, int_type T::* field)
30
0
{
31
0
    bool append = vec.empty();
32
    // nitems times, read bits_wanted from the given bit stream, storing results in the ith vector
33
    // entry.
34
35
0
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
36
0
        if (append) {
37
0
            vec.push_back(T());
38
0
        }
39
0
        vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
40
0
    }
41
0
    if (QIntC::to_int(vec.size()) != nitems) {
42
0
        throw std::logic_error("vector has wrong size in load_vector_int");
43
0
    }
44
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
45
    // start on a byte boundary.
46
0
    bit_stream.skipToNextByte();
47
0
}
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, int>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, long long>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HSharedObjectEntry, int>(BitStream&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
48
49
template <class T>
50
static void
51
load_vector_vector(
52
    BitStream& bit_stream,
53
    int nitems1,
54
    std::vector<T>& vec1,
55
    int T::* nitems2,
56
    int bits_wanted,
57
    std::vector<int> T::* vec2)
58
0
{
59
    // nitems1 times, read nitems2 (from the ith element of vec1) items into the vec2 vector field
60
    // of the ith item of vec1.
61
0
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
62
0
        for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2) {
63
0
            (vec1.at(i1).*vec2).push_back(bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
64
0
        }
65
0
    }
66
0
    bit_stream.skipToNextByte();
67
0
}
68
69
void
70
QPDF::linearizationWarning(std::string_view msg)
71
0
{
72
0
    m->linearization_warnings = true;
73
0
    warn(qpdf_e_linearization, "", 0, std::string(msg));
74
0
}
75
76
bool
77
QPDF::checkLinearization()
78
0
{
79
0
    bool result = false;
80
0
    try {
81
0
        readLinearizationData();
82
0
        result = checkLinearizationInternal();
83
0
    } catch (std::runtime_error& e) {
84
0
        linearizationWarning(
85
0
            "error encountered while checking linearization data: " + std::string(e.what()));
86
0
    }
87
0
    return result;
88
0
}
89
90
bool
91
QPDF::isLinearized()
92
0
{
93
    // If the first object in the file is a dictionary with a suitable /Linearized key and has an /L
94
    // key that accurately indicates the file size, initialize m->lindict and return true.
95
96
    // A linearized PDF spec's first object will be contained within the first 1024 bytes of the
97
    // file and will be a dictionary with a valid /Linearized key.  This routine looks for that and
98
    // does no additional validation.
99
100
    // The PDF spec says the linearization dictionary must be completely contained within the first
101
    // 1024 bytes of the file. Add a byte for a null terminator.
102
0
    auto buffer = m->file->read(1024, 0);
103
0
    size_t pos = 0;
104
0
    while (true) {
105
        // Find a digit or end of buffer
106
0
        pos = buffer.find_first_of("0123456789"sv, pos);
107
0
        if (pos == std::string::npos) {
108
0
            return false;
109
0
        }
110
        // Seek to the digit. Then skip over digits for a potential
111
        // next iteration.
112
0
        m->file->seek(toO(pos), SEEK_SET);
113
114
0
        auto t1 = readToken(*m->file, 20);
115
0
        if (!(t1.isInteger() && readToken(*m->file, 6).isInteger() &&
116
0
              readToken(*m->file, 4).isWord("obj"))) {
117
0
            pos = buffer.find_first_not_of("0123456789"sv, pos);
118
0
            if (pos == std::string::npos) {
119
0
                return false;
120
0
            }
121
0
            continue;
122
0
        }
123
124
0
        auto candidate = getObject(toI(QUtil::string_to_ll(t1.getValue().data())), 0);
125
0
        if (!candidate.isDictionary()) {
126
0
            return false;
127
0
        }
128
129
0
        auto linkey = candidate.getKey("/Linearized");
130
0
        if (!(linkey.isNumber() && toI(floor(linkey.getNumericValue())) == 1)) {
131
0
            return false;
132
0
        }
133
134
0
        auto L = candidate.getKey("/L");
135
0
        if (!L.isInteger()) {
136
0
            return false;
137
0
        }
138
0
        qpdf_offset_t Li = L.getIntValue();
139
0
        m->file->seek(0, SEEK_END);
140
0
        if (Li != m->file->tell()) {
141
0
            QTC::TC("qpdf", "QPDF /L mismatch");
142
0
            return false;
143
0
        }
144
0
        m->linp.file_size = Li;
145
0
        m->lindict = candidate;
146
0
        return true;
147
0
    }
148
0
}
149
150
void
151
QPDF::readLinearizationData()
152
0
{
153
    // This function throws an exception (which is trapped by checkLinearization()) for any errors
154
    // that prevent loading.
155
156
0
    if (!isLinearized()) {
157
0
        throw std::logic_error("called readLinearizationData for file that is not linearized");
158
0
    }
159
160
    // /L is read and stored in linp by isLinearized()
161
0
    QPDFObjectHandle H = m->lindict.getKey("/H");
162
0
    QPDFObjectHandle O = m->lindict.getKey("/O");
163
0
    QPDFObjectHandle E = m->lindict.getKey("/E");
164
0
    QPDFObjectHandle N = m->lindict.getKey("/N");
165
0
    QPDFObjectHandle T = m->lindict.getKey("/T");
166
0
    QPDFObjectHandle P = m->lindict.getKey("/P");
167
168
0
    if (!(H.isArray() && O.isInteger() && E.isInteger() && N.isInteger() && T.isInteger() &&
169
0
          (P.isInteger() || P.isNull()))) {
170
0
        throw damagedPDF(
171
0
            "linearization dictionary",
172
0
            "some keys in linearization dictionary are of the wrong type");
173
0
    }
174
175
    // Hint table array: offset length [ offset length ]
176
0
    size_t n_H_items = toS(H.getArrayNItems());
177
0
    if (!((n_H_items == 2) || (n_H_items == 4))) {
178
0
        throw damagedPDF("linearization dictionary", "H has the wrong number of items");
179
0
    }
180
181
0
    std::vector<int> H_items;
182
0
    for (size_t i = 0; i < n_H_items; ++i) {
183
0
        QPDFObjectHandle oh(H.getArrayItem(toI(i)));
184
0
        if (oh.isInteger()) {
185
0
            H_items.push_back(oh.getIntValueAsInt());
186
0
        } else {
187
0
            throw damagedPDF("linearization dictionary", "some H items are of the wrong type");
188
0
        }
189
0
    }
190
191
    // H: hint table offset/length for primary and overflow hint tables
192
0
    int H0_offset = H_items.at(0);
193
0
    int H0_length = H_items.at(1);
194
0
    int H1_offset = 0;
195
0
    int H1_length = 0;
196
0
    if (H_items.size() == 4) {
197
        // Acrobat doesn't read or write these (as PDF 1.4), so we don't have a way to generate a
198
        // test case.
199
        // QTC::TC("qpdf", "QPDF overflow hint table");
200
0
        H1_offset = H_items.at(2);
201
0
        H1_length = H_items.at(3);
202
0
    }
203
204
    // P: first page number
205
0
    int first_page = 0;
206
0
    if (P.isInteger()) {
207
0
        QTC::TC("qpdf", "QPDF P present in lindict");
208
0
        first_page = P.getIntValueAsInt();
209
0
    } else {
210
0
        QTC::TC("qpdf", "QPDF P absent in lindict");
211
0
    }
212
213
    // Store linearization parameter data
214
215
    // Various places in the code use linp.npages, which is initialized from N, to pre-allocate
216
    // memory, so make sure it's accurate and bail right now if it's not.
217
0
    if (N.getIntValue() != static_cast<long long>(getAllPages().size())) {
218
0
        throw damagedPDF("linearization hint table", "/N does not match number of pages");
219
0
    }
220
221
    // file_size initialized by isLinearized()
222
0
    m->linp.first_page_object = O.getIntValueAsInt();
223
0
    m->linp.first_page_end = E.getIntValue();
224
0
    m->linp.npages = N.getIntValueAsInt();
225
0
    m->linp.xref_zero_offset = T.getIntValue();
226
0
    m->linp.first_page = first_page;
227
0
    m->linp.H_offset = H0_offset;
228
0
    m->linp.H_length = H0_length;
229
230
    // Read hint streams
231
232
0
    Pl_Buffer pb("hint buffer");
233
0
    QPDFObjectHandle H0 = readHintStream(pb, H0_offset, toS(H0_length));
234
0
    if (H1_offset) {
235
0
        (void)readHintStream(pb, H1_offset, toS(H1_length));
236
0
    }
237
238
    // PDF 1.4 hint tables that we ignore:
239
240
    //  /T    thumbnail
241
    //  /A    thread information
242
    //  /E    named destination
243
    //  /V    interactive form
244
    //  /I    information dictionary
245
    //  /C    logical structure
246
    //  /L    page label
247
248
    // Individual hint table offsets
249
0
    QPDFObjectHandle HS = H0.getKey("/S"); // shared object
250
0
    QPDFObjectHandle HO = H0.getKey("/O"); // outline
251
252
0
    auto hbp = pb.getBufferSharedPointer();
253
0
    Buffer* hb = hbp.get();
254
0
    unsigned char const* h_buf = hb->getBuffer();
255
0
    size_t h_size = hb->getSize();
256
257
0
    readHPageOffset(BitStream(h_buf, h_size));
258
259
0
    int HSi = HS.getIntValueAsInt();
260
0
    if ((HSi < 0) || (toS(HSi) >= h_size)) {
261
0
        throw damagedPDF("linearization hint table", "/S (shared object) offset is out of bounds");
262
0
    }
263
0
    readHSharedObject(BitStream(h_buf + HSi, h_size - toS(HSi)));
264
265
0
    if (HO.isInteger()) {
266
0
        int HOi = HO.getIntValueAsInt();
267
0
        if ((HOi < 0) || (toS(HOi) >= h_size)) {
268
0
            throw damagedPDF("linearization hint table", "/O (outline) offset is out of bounds");
269
0
        }
270
0
        readHGeneric(BitStream(h_buf + HOi, h_size - toS(HOi)), m->outline_hints);
271
0
    }
272
0
}
273
274
// Read the hint stream object found at the given file offset, check that offset + length lands
// where the object ends, and send the stream's data to pl. Returns the stream dictionary.
//
// Throws the exception built by damagedPDF() if the object is not a stream or if the computed
// end does not fall between the cached "end before space" and "end after space" positions. In
// the latter case a linearization warning giving both values is issued first.
QPDFObjectHandle
QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
{
    QPDFObjGen stream_og;
    QPDFObjectHandle hint_stream = readObjectAtOffset(
        false, offset, "linearization hint stream", QPDFObjGen(0, 0), stream_og, false);
    ObjCache& stream_cache = m->obj_cache[stream_og];
    qpdf_offset_t min_end_offset = stream_cache.end_before_space;
    qpdf_offset_t max_end_offset = stream_cache.end_after_space;
    if (!hint_stream.isStream()) {
        throw damagedPDF("linearization dictionary", "hint table is not a stream");
    }

    QPDFObjectHandle Hdict = hint_stream.getDict();

    // Some versions of Acrobat make /Length indirect and place it immediately after the stream,
    // increasing length to cover it, even though the specification says all objects in the
    // linearization parameter dictionary must be direct.  We have to get the file position of the
    // end of length in this case.
    QPDFObjectHandle length_obj = Hdict.getKey("/Length");
    if (length_obj.isIndirect()) {
        QTC::TC("qpdf", "QPDF hint table length indirect");
        // Force resolution so that the length object's end positions are in the cache.
        (void)length_obj.getIntValue();
        ObjCache& length_cache = m->obj_cache[length_obj.getObjGen()];
        min_end_offset = length_cache.end_before_space;
        max_end_offset = length_cache.end_after_space;
    } else {
        QTC::TC("qpdf", "QPDF hint table length direct");
    }

    qpdf_offset_t const computed_end = offset + toO(length);
    bool const end_in_range =
        (min_end_offset <= computed_end) && (computed_end <= max_end_offset);
    if (!end_in_range) {
        linearizationWarning(
            "expected = " + std::to_string(computed_end) +
            "; actual = " + std::to_string(min_end_offset) + ".." + std::to_string(max_end_offset));
        throw damagedPDF("linearization dictionary", "hint table length mismatch");
    }

    hint_stream.pipeStreamData(&pl, 0, qpdf_dl_specialized);
    return Hdict;
}
314
315
void
316
QPDF::readHPageOffset(BitStream h)
317
0
{
318
    // All comments referring to the PDF spec refer to the spec for version 1.4.
319
320
0
    HPageOffset& t = m->page_offset_hints;
321
322
0
    t.min_nobjects = h.getBitsInt(32);               // 1
323
0
    t.first_page_offset = h.getBitsInt(32);          // 2
324
0
    t.nbits_delta_nobjects = h.getBitsInt(16);       // 3
325
0
    t.min_page_length = h.getBitsInt(32);            // 4
326
0
    t.nbits_delta_page_length = h.getBitsInt(16);    // 5
327
0
    t.min_content_offset = h.getBitsInt(32);         // 6
328
0
    t.nbits_delta_content_offset = h.getBitsInt(16); // 7
329
0
    t.min_content_length = h.getBitsInt(32);         // 8
330
0
    t.nbits_delta_content_length = h.getBitsInt(16); // 9
331
0
    t.nbits_nshared_objects = h.getBitsInt(16);      // 10
332
0
    t.nbits_shared_identifier = h.getBitsInt(16);    // 11
333
0
    t.nbits_shared_numerator = h.getBitsInt(16);     // 12
334
0
    t.shared_denominator = h.getBitsInt(16);         // 13
335
336
0
    std::vector<HPageOffsetEntry>& entries = t.entries;
337
0
    entries.clear();
338
0
    int nitems = m->linp.npages;
339
0
    load_vector_int(h, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
340
0
    load_vector_int(
341
0
        h, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
342
0
    load_vector_int(
343
0
        h, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
344
0
    load_vector_vector(
345
0
        h,
346
0
        nitems,
347
0
        entries,
348
0
        &HPageOffsetEntry::nshared_objects,
349
0
        t.nbits_shared_identifier,
350
0
        &HPageOffsetEntry::shared_identifiers);
351
0
    load_vector_vector(
352
0
        h,
353
0
        nitems,
354
0
        entries,
355
0
        &HPageOffsetEntry::nshared_objects,
356
0
        t.nbits_shared_numerator,
357
0
        &HPageOffsetEntry::shared_numerators);
358
0
    load_vector_int(
359
0
        h, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
360
0
    load_vector_int(
361
0
        h, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
362
0
}
363
364
void
365
QPDF::readHSharedObject(BitStream h)
366
0
{
367
0
    HSharedObject& t = m->shared_object_hints;
368
369
0
    t.first_shared_obj = h.getBitsInt(32);         // 1
370
0
    t.first_shared_offset = h.getBitsInt(32);      // 2
371
0
    t.nshared_first_page = h.getBitsInt(32);       // 3
372
0
    t.nshared_total = h.getBitsInt(32);            // 4
373
0
    t.nbits_nobjects = h.getBitsInt(16);           // 5
374
0
    t.min_group_length = h.getBitsInt(32);         // 6
375
0
    t.nbits_delta_group_length = h.getBitsInt(16); // 7
376
377
0
    QTC::TC(
378
0
        "qpdf",
379
0
        "QPDF lin nshared_total > nshared_first_page",
380
0
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
381
382
0
    std::vector<HSharedObjectEntry>& entries = t.entries;
383
0
    entries.clear();
384
0
    int nitems = t.nshared_total;
385
0
    load_vector_int(
386
0
        h, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
387
0
    load_vector_int(h, nitems, entries, 1, &HSharedObjectEntry::signature_present);
388
0
    for (size_t i = 0; i < toS(nitems); ++i) {
389
0
        if (entries.at(i).signature_present) {
390
            // Skip 128-bit MD5 hash.  These are not supported by acrobat, so they should probably
391
            // never be there.  We have no test case for this.
392
0
            for (int j = 0; j < 4; ++j) {
393
0
                (void)h.getBits(32);
394
0
            }
395
0
        }
396
0
    }
397
0
    load_vector_int(h, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
398
0
}
399
400
void
401
QPDF::readHGeneric(BitStream h, HGeneric& t)
402
0
{
403
0
    t.first_object = h.getBitsInt(32);        // 1
404
0
    t.first_object_offset = h.getBitsInt(32); // 2
405
0
    t.nobjects = h.getBitsInt(32);            // 3
406
0
    t.group_length = h.getBitsInt(32);        // 4
407
0
}
408
409
bool
410
QPDF::checkLinearizationInternal()
411
0
{
412
    // All comments referring to the PDF spec refer to the spec for version 1.4.
413
414
    // Check all values in linearization parameter dictionary
415
416
0
    LinParameters& p = m->linp;
417
418
    // L: file size in bytes -- checked by isLinearized
419
420
    // O: object number of first page
421
0
    std::vector<QPDFObjectHandle> const& pages = getAllPages();
422
0
    if (p.first_page_object != pages.at(0).getObjectID()) {
423
0
        QTC::TC("qpdf", "QPDF err /O mismatch");
424
0
        linearizationWarning("first page object (/O) mismatch");
425
0
    }
426
427
    // N: number of pages
428
0
    int npages = toI(pages.size());
429
0
    if (p.npages != npages) {
430
        // Not tested in the test suite
431
0
        linearizationWarning("page count (/N) mismatch");
432
0
    }
433
434
0
    for (size_t i = 0; i < toS(npages); ++i) {
435
0
        QPDFObjectHandle const& page = pages.at(i);
436
0
        QPDFObjGen og(page.getObjGen());
437
0
        if (m->xref_table[og].getType() == 2) {
438
0
            linearizationWarning(
439
0
                "page dictionary for page " + std::to_string(i) + " is compressed");
440
0
        }
441
0
    }
442
443
    // T: offset of whitespace character preceding xref entry for object 0
444
0
    m->file->seek(p.xref_zero_offset, SEEK_SET);
445
0
    while (true) {
446
0
        char ch;
447
0
        m->file->read(&ch, 1);
448
0
        if (!((ch == ' ') || (ch == '\r') || (ch == '\n'))) {
449
0
            m->file->seek(-1, SEEK_CUR);
450
0
            break;
451
0
        }
452
0
    }
453
0
    if (m->file->tell() != m->first_xref_item_offset) {
454
0
        QTC::TC("qpdf", "QPDF err /T mismatch");
455
0
        linearizationWarning(
456
0
            "space before first xref item (/T) mismatch (computed = " +
457
0
            std::to_string(m->first_xref_item_offset) +
458
0
            "; file = " + std::to_string(m->file->tell()));
459
0
    }
460
461
    // P: first page number -- Implementation note 124 says Acrobat ignores this value, so we will
462
    // too.
463
464
    // Check numbering of compressed objects in each xref section. For linearized files, all
465
    // compressed objects are supposed to be at the end of the containing xref section if any object
466
    // streams are in use.
467
468
0
    if (m->uncompressed_after_compressed) {
469
0
        linearizationWarning(
470
0
            "linearized file contains an uncompressed object after a compressed "
471
0
            "one in a cross-reference stream");
472
0
    }
473
474
    // Further checking requires optimization and order calculation. Don't allow optimization to
475
    // make changes.  If it has to, then the file is not properly linearized.  We use the xref table
476
    // to figure out which objects are compressed and which are uncompressed.
477
0
    { // local scope
478
0
        std::map<int, int> object_stream_data;
479
0
        for (auto const& iter: m->xref_table) {
480
0
            QPDFObjGen const& og = iter.first;
481
0
            QPDFXRefEntry const& entry = iter.second;
482
0
            if (entry.getType() == 2) {
483
0
                object_stream_data[og.getObj()] = entry.getObjStreamNumber();
484
0
            }
485
0
        }
486
0
        optimize_internal(object_stream_data, false, nullptr);
487
0
        calculateLinearizationData(object_stream_data);
488
0
    }
489
490
    // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra
491
    // object here by mistake.  pdlin fails to place thumbnail images in section 9, so when
492
    // thumbnails are present, it also gets the wrong value for /E.  It also doesn't count outlines
493
    // here when it should even though it places them in part 6.  This code fails to put thread
494
    // information dictionaries in part 9, so it actually gets the wrong value for E when threads
495
    // are present.  In that case, it would probably agree with pdlin.  As of this writing, the test
496
    // suite doesn't contain any files with threads.
497
498
0
    if (m->part6.empty()) {
499
0
        stopOnError("linearization part 6 unexpectedly empty");
500
0
    }
501
0
    qpdf_offset_t min_E = -1;
502
0
    qpdf_offset_t max_E = -1;
503
0
    for (auto const& oh: m->part6) {
504
0
        QPDFObjGen og(oh.getObjGen());
505
0
        if (!m->obj_cache.contains(og)) {
506
            // All objects have to have been dereferenced to be classified.
507
0
            throw std::logic_error("linearization part6 object not in cache");
508
0
        }
509
0
        ObjCache const& oc = m->obj_cache[og];
510
0
        min_E = std::max(min_E, oc.end_before_space);
511
0
        max_E = std::max(max_E, oc.end_after_space);
512
0
    }
513
0
    if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) {
514
0
        QTC::TC("qpdf", "QPDF warn /E mismatch");
515
0
        linearizationWarning(
516
0
            "end of first page section (/E) mismatch: /E = " + std::to_string(p.first_page_end) +
517
0
            "; computed = " + std::to_string(min_E) + ".." + std::to_string(max_E));
518
0
    }
519
520
    // Check hint tables
521
522
0
    std::map<int, int> shared_idx_to_obj;
523
0
    checkHSharedObject(pages, shared_idx_to_obj);
524
0
    checkHPageOffset(pages, shared_idx_to_obj);
525
0
    checkHOutlines();
526
527
0
    return !m->linearization_warnings;
528
0
}
529
530
// Return the largest cached "end after space" position among the objects recorded for the given
// object user, or 0 if none is recorded. stopOnError() is called if the user has no entry in the
// object user table or if one of its objects is not in the object cache.
qpdf_offset_t
QPDF::maxEnd(ObjUser const& ou)
{
    if (!m->obj_user_to_objects.contains(ou)) {
        stopOnError("no entry in object user table for requested object user");
    }
    qpdf_offset_t furthest = 0;
    for (auto const& og: m->obj_user_to_objects[ou]) {
        if (!m->obj_cache.contains(og)) {
            stopOnError("unknown object referenced in object user table");
        }
        qpdf_offset_t const candidate = m->obj_cache[og].end_after_space;
        if (furthest < candidate) {
            furthest = candidate;
        }
    }
    return furthest;
}
545
546
// Return the file offset to use for og when checking linearization data. For an uncompressed
// object (xref entry type 1) this is the entry's offset. For a compressed object (type 2) it is
// the offset of the object stream that contains it.
//
// stopOnError() is called if an entry that is reached is missing from the xref table or has any
// other type, or if the object stream references never reach an uncompressed object.
qpdf_offset_t
QPDF::getLinearizationOffset(QPDFObjGen og)
{
    // Entries are looked up with find() so that asking about an unknown object does not add an
    // entry to the xref table. A chain of compressed entries without a repeat cannot be longer
    // than the table itself, so the hop limit stops a damaged file whose object stream numbers
    // refer back to each other from being followed forever.
    size_t hops_left = m->xref_table.size();
    while (true) {
        auto const iter = m->xref_table.find(og);
        int const type = (iter == m->xref_table.end()) ? 0 : iter->second.getType();
        if (type == 1) {
            return iter->second.getOffset();
        }
        if (type != 2) {
            stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2");
            return 0;
        }
        if (hops_left == 0) {
            stopOnError("loop detected while following object stream references");
            return 0;
        }
        --hops_left;
        // For compressed objects, use the offset of the object stream that contains them.
        og = QPDFObjGen(iter->second.getObjStreamNumber(), 0);
    }
}
567
568
// Return the object that stands in for obj among uncompressed objects. If obj's object ID is a
// key of object_stream_data, that is object (mapped value, generation 0), i.e. the object stream
// containing obj. Otherwise, or when obj.null() is true, obj itself is returned.
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
{
    if (obj.null()) {
        return obj;
    }
    auto const found = object_stream_data.find(obj.getObjectID());
    if (found == object_stream_data.end()) {
        return obj;
    }
    return getObject(found->second, 0);
}
578
579
// Variant that takes a QPDFWriter object table. If the table has an entry for oh with a positive
// object_stream value and oh is not null, return object (object_stream, 0). In every other case
// return oh itself.
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj)
{
    if (!obj.contains(oh)) {
        return oh;
    }
    auto const stream_id = obj[oh].object_stream;
    if (!(stream_id > 0)) {
        return oh;
    }
    if (oh.isNull()) {
        return oh;
    }
    return getObject(stream_id, 0);
}
589
590
int
591
QPDF::lengthNextN(int first_object, int n)
592
0
{
593
0
    int length = 0;
594
0
    for (int i = 0; i < n; ++i) {
595
0
        QPDFObjGen og(first_object + i, 0);
596
0
        if (!m->xref_table.contains(og)) {
597
0
            linearizationWarning(
598
0
                "no xref table entry for " + std::to_string(first_object + i) + " 0");
599
0
        } else {
600
0
            if (!m->obj_cache.contains(og)) {
601
0
                stopOnError("found unknown object while calculating length for linearization data");
602
0
            }
603
0
            length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
604
0
        }
605
0
    }
606
0
    return length;
607
0
}
608
609
void
610
QPDF::checkHPageOffset(
611
    std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& shared_idx_to_obj)
612
0
{
613
    // Implementation note 126 says Acrobat always sets delta_content_offset and
614
    // delta_content_length in the page offset header dictionary to 0.  It also states that
615
    // min_content_offset in the per-page information is always 0, which is an incorrect value.
616
617
    // Implementation note 127 explains that Acrobat always sets item 8 (min_content_length) to
618
    // zero, item 9 (nbits_delta_content_length) to the value of item 5 (nbits_delta_page_length),
619
    // and item 7 of each per-page hint table (delta_content_length) to item 2 (delta_page_length)
620
    // of that entry.  Acrobat ignores these values when reading files.
621
622
    // Empirically, it also seems that Acrobat sometimes puts items under a page's /Resources
623
    // dictionary in with shared objects even when they are private.
624
625
0
    int npages = toI(pages.size());
626
0
    qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
627
0
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
628
0
    if (!m->xref_table.contains(first_page_og)) {
629
0
        stopOnError("supposed first page object is not known");
630
0
    }
631
0
    qpdf_offset_t offset = getLinearizationOffset(first_page_og);
632
0
    if (table_offset != offset) {
633
0
        linearizationWarning("first page object offset mismatch");
634
0
    }
635
636
0
    for (int pageno = 0; pageno < npages; ++pageno) {
637
0
        QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
638
0
        int first_object = page_og.getObj();
639
0
        if (!m->xref_table.contains(page_og)) {
640
0
            stopOnError("unknown object in page offset hint table");
641
0
        }
642
0
        offset = getLinearizationOffset(page_og);
643
644
0
        HPageOffsetEntry& he = m->page_offset_hints.entries.at(toS(pageno));
645
0
        CHPageOffsetEntry& ce = m->c_page_offset_data.entries.at(toS(pageno));
646
0
        int h_nobjects = he.delta_nobjects + m->page_offset_hints.min_nobjects;
647
0
        if (h_nobjects != ce.nobjects) {
648
            // This happens with pdlin when there are thumbnails.
649
0
            linearizationWarning(
650
0
                "object count mismatch for page " + std::to_string(pageno) + ": hint table = " +
651
0
                std::to_string(h_nobjects) + "; computed = " + std::to_string(ce.nobjects));
652
0
        }
653
654
        // Use value for number of objects in hint table rather than computed value if there is a
655
        // discrepancy.
656
0
        int length = lengthNextN(first_object, h_nobjects);
657
0
        int h_length = toI(he.delta_page_length + m->page_offset_hints.min_page_length);
658
0
        if (length != h_length) {
659
            // This condition almost certainly indicates a bad hint table or a bug in this code.
660
0
            linearizationWarning(
661
0
                "page length mismatch for page " + std::to_string(pageno) + ": hint table = " +
662
0
                std::to_string(h_length) + "; computed length = " + std::to_string(length) +
663
0
                " (offset = " + std::to_string(offset) + ")");
664
0
        }
665
666
0
        offset += h_length;
667
668
        // Translate shared object indexes to object numbers.
669
0
        std::set<int> hint_shared;
670
0
        std::set<int> computed_shared;
671
672
0
        if ((pageno == 0) && (he.nshared_objects > 0)) {
673
            // pdlin and Acrobat both do this even though the spec states clearly and unambiguously
674
            // that they should not.
675
0
            linearizationWarning("page 0 has shared identifier entries");
676
0
        }
677
678
0
        for (size_t i = 0; i < toS(he.nshared_objects); ++i) {
679
0
            int idx = he.shared_identifiers.at(i);
680
0
            if (!shared_idx_to_obj.contains(idx)) {
681
0
                stopOnError("unable to get object for item in shared objects hint table");
682
0
            }
683
0
            hint_shared.insert(shared_idx_to_obj[idx]);
684
0
        }
685
686
0
        for (size_t i = 0; i < toS(ce.nshared_objects); ++i) {
687
0
            int idx = ce.shared_identifiers.at(i);
688
0
            if (idx >= m->c_shared_object_data.nshared_total) {
689
0
                stopOnError("index out of bounds for shared object hint table");
690
0
            }
691
0
            int obj = m->c_shared_object_data.entries.at(toS(idx)).object;
692
0
            computed_shared.insert(obj);
693
0
        }
694
695
0
        for (int iter: hint_shared) {
696
0
            if (!computed_shared.contains(iter)) {
697
                // pdlin puts thumbnails here even though it shouldn't
698
0
                linearizationWarning(
699
0
                    "page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
700
0
                    ": in hint table but not computed list");
701
0
            }
702
0
        }
703
704
0
        for (int iter: computed_shared) {
705
0
            if (!hint_shared.contains(iter)) {
706
                // Acrobat does not put some things including at least built-in fonts and procsets
707
                // here, at least in some cases.
708
0
                linearizationWarning(
709
0
                    ("page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
710
0
                     ": in computed list but not hint table"));
711
0
            }
712
0
        }
713
0
    }
714
0
}
715
716
void
717
QPDF::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj)
718
0
{
719
    // Implementation note 125 says shared object groups always contain only one object.
720
    // Implementation note 128 says that Acrobat always nbits_nobjects to zero.  Implementation note
721
    // 130 says that Acrobat does not support more than one shared object per group.  These are all
722
    // consistent.
723
724
    // Implementation note 129 states that MD5 signatures are not implemented in Acrobat, so
725
    // signature_present must always be zero.
726
727
    // Implementation note 131 states that first_shared_obj and first_shared_offset have meaningless
728
    // values for single-page files.
729
730
    // Empirically, Acrobat and pdlin generate incorrect values for these whenever there are no
731
    // shared objects not referenced by the first page (i.e., nshared_total == nshared_first_page).
732
733
0
    HSharedObject& so = m->shared_object_hints;
734
0
    if (so.nshared_total < so.nshared_first_page) {
735
0
        linearizationWarning("shared object hint table: ntotal < nfirst_page");
736
0
    } else {
737
        // The first nshared_first_page objects are consecutive objects starting with the first page
738
        // object.  The rest are consecutive starting from the first_shared_obj object.
739
0
        int cur_object = pages.at(0).getObjectID();
740
0
        for (int i = 0; i < so.nshared_total; ++i) {
741
0
            if (i == so.nshared_first_page) {
742
0
                QTC::TC("qpdf", "QPDF lin check shared past first page");
743
0
                if (m->part8.empty()) {
744
0
                    linearizationWarning("part 8 is empty but nshared_total > nshared_first_page");
745
0
                } else {
746
0
                    int obj = m->part8.at(0).getObjectID();
747
0
                    if (obj != so.first_shared_obj) {
748
0
                        linearizationWarning(
749
0
                            "first shared object number mismatch: hint table = " +
750
0
                            std::to_string(so.first_shared_obj) +
751
0
                            "; computed = " + std::to_string(obj));
752
0
                    }
753
0
                }
754
755
0
                cur_object = so.first_shared_obj;
756
757
0
                QPDFObjGen og(cur_object, 0);
758
0
                if (!m->xref_table.contains(og)) {
759
0
                    stopOnError("unknown object in shared object hint table");
760
0
                }
761
0
                qpdf_offset_t offset = getLinearizationOffset(og);
762
0
                qpdf_offset_t h_offset = adjusted_offset(so.first_shared_offset);
763
0
                if (offset != h_offset) {
764
0
                    linearizationWarning(
765
0
                        "first shared object offset mismatch: hint table = " +
766
0
                        std::to_string(h_offset) + "; computed = " + std::to_string(offset));
767
0
                }
768
0
            }
769
770
0
            idx_to_obj[i] = cur_object;
771
0
            HSharedObjectEntry& se = so.entries.at(toS(i));
772
0
            int nobjects = se.nobjects_minus_one + 1;
773
0
            int length = lengthNextN(cur_object, nobjects);
774
0
            int h_length = so.min_group_length + se.delta_group_length;
775
0
            if (length != h_length) {
776
0
                linearizationWarning(
777
0
                    "shared object " + std::to_string(i) + " length mismatch: hint table = " +
778
0
                    std::to_string(h_length) + "; computed = " + std::to_string(length));
779
0
            }
780
0
            cur_object += nobjects;
781
0
        }
782
0
    }
783
0
}
784
785
void
786
QPDF::checkHOutlines()
787
0
{
788
    // Empirically, Acrobat generates the correct value for the object number but incorrectly stores
789
    // the next object number's offset as the offset, at least when outlines appear in part 6.  It
790
    // also generates an incorrect value for length (specifically, the length that would cover the
791
    // correct number of objects from the wrong starting place).  pdlin appears to generate correct
792
    // values in those cases.
793
794
0
    if (m->c_outline_data.nobjects == m->outline_hints.nobjects) {
795
0
        if (m->c_outline_data.nobjects == 0) {
796
0
            return;
797
0
        }
798
799
0
        if (m->c_outline_data.first_object == m->outline_hints.first_object) {
800
            // Check length and offset.  Acrobat gets these wrong.
801
0
            QPDFObjectHandle outlines = getRoot().getKey("/Outlines");
802
0
            if (!outlines.isIndirect()) {
803
                // This case is not exercised in test suite since not permitted by the spec, but if
804
                // this does occur, the code below would fail.
805
0
                linearizationWarning("/Outlines key of root dictionary is not indirect");
806
0
                return;
807
0
            }
808
0
            QPDFObjGen og(outlines.getObjGen());
809
0
            if (!m->xref_table.contains(og)) {
810
0
                stopOnError("unknown object in outlines hint table");
811
0
            }
812
0
            qpdf_offset_t offset = getLinearizationOffset(og);
813
0
            ObjUser ou(ObjUser::ou_root_key, "/Outlines");
814
0
            int length = toI(maxEnd(ou) - offset);
815
0
            qpdf_offset_t table_offset = adjusted_offset(m->outline_hints.first_object_offset);
816
0
            if (offset != table_offset) {
817
0
                linearizationWarning(
818
0
                    "incorrect offset in outlines table: hint table = " +
819
0
                    std::to_string(table_offset) + "; computed = " + std::to_string(offset));
820
0
            }
821
0
            int table_length = m->outline_hints.group_length;
822
0
            if (length != table_length) {
823
0
                linearizationWarning(
824
0
                    "incorrect length in outlines table: hint table = " +
825
0
                    std::to_string(table_length) + "; computed = " + std::to_string(length));
826
0
            }
827
0
        } else {
828
0
            linearizationWarning("incorrect first object number in outline hints table.");
829
0
        }
830
0
    } else {
831
0
        linearizationWarning("incorrect object count in outline hint table");
832
0
    }
833
0
}
834
835
void
836
QPDF::showLinearizationData()
837
0
{
838
0
    try {
839
0
        readLinearizationData();
840
0
        checkLinearizationInternal();
841
0
        dumpLinearizationDataInternal();
842
0
    } catch (QPDFExc& e) {
843
0
        linearizationWarning(e.what());
844
0
    }
845
0
}
846
847
void
848
QPDF::dumpLinearizationDataInternal()
849
0
{
850
0
    *m->log->getInfo() << m->file->getName() << ": linearization data:\n\n";
851
852
0
    *m->log->getInfo() << "file_size: " << m->linp.file_size << "\n"
853
0
                       << "first_page_object: " << m->linp.first_page_object << "\n"
854
0
                       << "first_page_end: " << m->linp.first_page_end << "\n"
855
0
                       << "npages: " << m->linp.npages << "\n"
856
0
                       << "xref_zero_offset: " << m->linp.xref_zero_offset << "\n"
857
0
                       << "first_page: " << m->linp.first_page << "\n"
858
0
                       << "H_offset: " << m->linp.H_offset << "\n"
859
0
                       << "H_length: " << m->linp.H_length << "\n"
860
0
                       << "\n";
861
862
0
    *m->log->getInfo() << "Page Offsets Hint Table\n\n";
863
0
    dumpHPageOffset();
864
0
    *m->log->getInfo() << "\nShared Objects Hint Table\n\n";
865
0
    dumpHSharedObject();
866
867
0
    if (m->outline_hints.nobjects > 0) {
868
0
        *m->log->getInfo() << "\nOutlines Hint Table\n\n";
869
0
        dumpHGeneric(m->outline_hints);
870
0
    }
871
0
}
872
873
qpdf_offset_t
QPDF::adjusted_offset(qpdf_offset_t offset)
{
    // Hint table location values disregard the hint table itself, so every offset at or beyond
    // H_offset has to be increased by H_length.  Smaller offsets are returned unchanged.
    bool const at_or_past_hints = offset >= m->linp.H_offset;
    return at_or_past_hints ? offset + m->linp.H_length : offset;
}
883
884
void
885
QPDF::dumpHPageOffset()
886
0
{
887
0
    HPageOffset& t = m->page_offset_hints;
888
0
    *m->log->getInfo() << "min_nobjects: " << t.min_nobjects << "\n"
889
0
                       << "first_page_offset: " << adjusted_offset(t.first_page_offset) << "\n"
890
0
                       << "nbits_delta_nobjects: " << t.nbits_delta_nobjects << "\n"
891
0
                       << "min_page_length: " << t.min_page_length << "\n"
892
0
                       << "nbits_delta_page_length: " << t.nbits_delta_page_length << "\n"
893
0
                       << "min_content_offset: " << t.min_content_offset << "\n"
894
0
                       << "nbits_delta_content_offset: " << t.nbits_delta_content_offset << "\n"
895
0
                       << "min_content_length: " << t.min_content_length << "\n"
896
0
                       << "nbits_delta_content_length: " << t.nbits_delta_content_length << "\n"
897
0
                       << "nbits_nshared_objects: " << t.nbits_nshared_objects << "\n"
898
0
                       << "nbits_shared_identifier: " << t.nbits_shared_identifier << "\n"
899
0
                       << "nbits_shared_numerator: " << t.nbits_shared_numerator << "\n"
900
0
                       << "shared_denominator: " << t.shared_denominator << "\n";
901
902
0
    for (size_t i1 = 0; i1 < toS(m->linp.npages); ++i1) {
903
0
        HPageOffsetEntry& pe = t.entries.at(i1);
904
0
        *m->log->getInfo() << "Page " << i1 << ":\n"
905
0
                           << "  nobjects: " << pe.delta_nobjects + t.min_nobjects << "\n"
906
0
                           << "  length: " << pe.delta_page_length + t.min_page_length
907
0
                           << "\n"
908
                           // content offset is relative to page, not file
909
0
                           << "  content_offset: " << pe.delta_content_offset + t.min_content_offset
910
0
                           << "\n"
911
0
                           << "  content_length: " << pe.delta_content_length + t.min_content_length
912
0
                           << "\n"
913
0
                           << "  nshared_objects: " << pe.nshared_objects << "\n";
914
0
        for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2) {
915
0
            *m->log->getInfo() << "    identifier " << i2 << ": " << pe.shared_identifiers.at(i2)
916
0
                               << "\n";
917
0
            *m->log->getInfo() << "    numerator " << i2 << ": " << pe.shared_numerators.at(i2)
918
0
                               << "\n";
919
0
        }
920
0
    }
921
0
}
922
923
void
924
QPDF::dumpHSharedObject()
925
0
{
926
0
    HSharedObject& t = m->shared_object_hints;
927
0
    *m->log->getInfo() << "first_shared_obj: " << t.first_shared_obj << "\n"
928
0
                       << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) << "\n"
929
0
                       << "nshared_first_page: " << t.nshared_first_page << "\n"
930
0
                       << "nshared_total: " << t.nshared_total << "\n"
931
0
                       << "nbits_nobjects: " << t.nbits_nobjects << "\n"
932
0
                       << "min_group_length: " << t.min_group_length << "\n"
933
0
                       << "nbits_delta_group_length: " << t.nbits_delta_group_length << "\n";
934
935
0
    for (size_t i = 0; i < toS(t.nshared_total); ++i) {
936
0
        HSharedObjectEntry& se = t.entries.at(i);
937
0
        *m->log->getInfo() << "Shared Object " << i << ":\n"
938
0
                           << "  group length: " << se.delta_group_length + t.min_group_length
939
0
                           << "\n";
940
        // PDF spec says signature present nobjects_minus_one are always 0, so print them only if
941
        // they have a non-zero value.
942
0
        if (se.signature_present) {
943
0
            *m->log->getInfo() << "  signature present\n";
944
0
        }
945
0
        if (se.nobjects_minus_one != 0) {
946
0
            *m->log->getInfo() << "  nobjects: " << se.nobjects_minus_one + 1 << "\n";
947
0
        }
948
0
    }
949
0
}
950
951
void
952
QPDF::dumpHGeneric(HGeneric& t)
953
0
{
954
0
    *m->log->getInfo() << "first_object: " << t.first_object << "\n"
955
0
                       << "first_object_offset: " << adjusted_offset(t.first_object_offset) << "\n"
956
0
                       << "nobjects: " << t.nobjects << "\n"
957
0
                       << "group_length: " << t.group_length << "\n";
958
0
}
959
960
template <typename T>
961
void
962
QPDF::calculateLinearizationData(T const& object_stream_data)
963
7.98k
{
964
    // This function calculates the ordering of objects, divides them into the appropriate parts,
965
    // and computes some values for the linearization parameter dictionary and hint tables.  The
966
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
967
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
968
969
7.98k
    if (m->object_to_obj_users.empty()) {
970
        // Note that we can't call optimize here because we don't know whether it should be called
971
        // with or without allow changes.
972
0
        throw std::logic_error(
973
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()");
974
0
    }
975
976
    // Separate objects into the categories sufficient for us to determine which part of the
977
    // linearized file should contain the object.  This categorization is useful for other purposes
978
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
979
980
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
981
    // trailer dictionary in part 11).
982
983
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
984
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
985
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
986
    // for now.
987
988
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
989
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
990
    // /Outlines, then information from /Outlines also appears here.
991
992
    // Part 7 contains remaining objects private to pages other than the first page.
993
994
    // Part 8 contains all remaining shared objects except those that are shared only within
995
    // thumbnails.
996
997
    // Part 9 contains all remaining objects.
998
999
    // We sort objects into the following categories:
1000
1001
    //   * open_document: part 4
1002
1003
    //   * first_page_private: part 6
1004
1005
    //   * first_page_shared: part 6
1006
1007
    //   * other_page_private: part 7
1008
1009
    //   * other_page_shared: part 8
1010
1011
    //   * thumbnail_private: part 9
1012
1013
    //   * thumbnail_shared: part 9
1014
1015
    //   * other: part 9
1016
1017
    //   * outlines: part 6 or 9
1018
1019
7.98k
    m->part4.clear();
1020
7.98k
    m->part6.clear();
1021
7.98k
    m->part7.clear();
1022
7.98k
    m->part8.clear();
1023
7.98k
    m->part9.clear();
1024
7.98k
    m->c_linp = LinParameters();
1025
7.98k
    m->c_page_offset_data = CHPageOffset();
1026
7.98k
    m->c_shared_object_data = CHSharedObject();
1027
7.98k
    m->c_outline_data = HGeneric();
1028
1029
7.98k
    QPDFObjectHandle root = getRoot();
1030
7.98k
    bool outlines_in_first_page = false;
1031
7.98k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
1032
7.98k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
1033
7.98k
    if (pagemode.isName()) {
1034
521
        if (pagemode.getName() == "/UseOutlines") {
1035
422
            if (root.hasKey("/Outlines")) {
1036
199
                outlines_in_first_page = true;
1037
223
            } else {
1038
223
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1039
223
            }
1040
422
        }
1041
521
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1042
521
    }
1043
1044
7.98k
    std::set<std::string> open_document_keys;
1045
7.98k
    open_document_keys.insert("/ViewerPreferences");
1046
7.98k
    open_document_keys.insert("/PageMode");
1047
7.98k
    open_document_keys.insert("/Threads");
1048
7.98k
    open_document_keys.insert("/OpenAction");
1049
7.98k
    open_document_keys.insert("/AcroForm");
1050
1051
7.98k
    std::set<QPDFObjGen> lc_open_document;
1052
7.98k
    std::set<QPDFObjGen> lc_first_page_private;
1053
7.98k
    std::set<QPDFObjGen> lc_first_page_shared;
1054
7.98k
    std::set<QPDFObjGen> lc_other_page_private;
1055
7.98k
    std::set<QPDFObjGen> lc_other_page_shared;
1056
7.98k
    std::set<QPDFObjGen> lc_thumbnail_private;
1057
7.98k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1058
7.98k
    std::set<QPDFObjGen> lc_other;
1059
7.98k
    std::set<QPDFObjGen> lc_outlines;
1060
7.98k
    std::set<QPDFObjGen> lc_root;
1061
1062
77.6k
    for (auto& oiter: m->object_to_obj_users) {
1063
77.6k
        QPDFObjGen const& og = oiter.first;
1064
77.6k
        std::set<ObjUser>& ous = oiter.second;
1065
1066
77.6k
        bool in_open_document = false;
1067
77.6k
        bool in_first_page = false;
1068
77.6k
        int other_pages = 0;
1069
77.6k
        int thumbs = 0;
1070
77.6k
        int others = 0;
1071
77.6k
        bool in_outlines = false;
1072
77.6k
        bool is_root = false;
1073
1074
147k
        for (auto const& ou: ous) {
1075
147k
            switch (ou.ou_type) {
1076
7.63k
            case ObjUser::ou_trailer_key:
1077
7.63k
                if (ou.key == "/Encrypt") {
1078
0
                    in_open_document = true;
1079
7.63k
                } else {
1080
7.63k
                    ++others;
1081
7.63k
                }
1082
7.63k
                break;
1083
1084
8.85k
            case ObjUser::ou_thumb:
1085
8.85k
                ++thumbs;
1086
8.85k
                break;
1087
1088
40.6k
            case ObjUser::ou_root_key:
1089
40.6k
                if (open_document_keys.contains(ou.key)) {
1090
3.31k
                    in_open_document = true;
1091
37.3k
                } else if (ou.key == "/Outlines") {
1092
866
                    in_outlines = true;
1093
36.4k
                } else {
1094
36.4k
                    ++others;
1095
36.4k
                }
1096
40.6k
                break;
1097
1098
82.3k
            case ObjUser::ou_page:
1099
82.3k
                if (ou.pageno == 0) {
1100
36.2k
                    in_first_page = true;
1101
46.1k
                } else {
1102
46.1k
                    ++other_pages;
1103
46.1k
                }
1104
82.3k
                break;
1105
1106
7.98k
            case ObjUser::ou_root:
1107
7.98k
                is_root = true;
1108
7.98k
                break;
1109
1110
0
            case ObjUser::ou_bad:
1111
0
                stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: invalid user type");
1112
0
                break;
1113
147k
            }
1114
147k
        }
1115
1116
77.6k
        if (is_root) {
1117
7.98k
            lc_root.insert(og);
1118
69.6k
        } else if (in_outlines) {
1119
825
            lc_outlines.insert(og);
1120
68.8k
        } else if (in_open_document) {
1121
3.28k
            lc_open_document.insert(og);
1122
65.5k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1123
19.4k
            lc_first_page_private.insert(og);
1124
46.1k
        } else if (in_first_page) {
1125
14.4k
            lc_first_page_shared.insert(og);
1126
31.6k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1127
9.81k
            lc_other_page_private.insert(og);
1128
21.8k
        } else if (other_pages > 1) {
1129
3.31k
            lc_other_page_shared.insert(og);
1130
18.5k
        } else if ((thumbs == 1) && (others == 0)) {
1131
3.34k
            lc_thumbnail_private.insert(og);
1132
15.2k
        } else if (thumbs > 1) {
1133
1.29k
            lc_thumbnail_shared.insert(og);
1134
13.9k
        } else {
1135
13.9k
            lc_other.insert(og);
1136
13.9k
        }
1137
77.6k
    }
1138
1139
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1140
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1141
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1142
    // and the above section can be considered only with categorization.  Note that sets of
1143
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1144
    // the possible exception of hints tables which we won't see here anyway.  That means that
1145
    // running calculateLinearizationData() on a linearized file should give results identical to
1146
    // the original file ordering.
1147
1148
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1149
    // code optimization if necessary. Premature optimization is the root of all evil.
1150
7.98k
    std::vector<QPDFObjectHandle> pages;
1151
7.98k
    { // local scope
1152
        // Map all page objects to the containing object stream.  This should be a no-op in a
1153
        // properly linearized file.
1154
12.9k
        for (auto oh: getAllPages()) {
1155
12.9k
            pages.push_back(getUncompressedObject(oh, object_stream_data));
1156
12.9k
        }
1157
7.98k
    }
1158
7.98k
    int npages = toI(pages.size());
1159
1160
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1161
    // initialize any items that deal with object numbers or counts but not any items that deal with
1162
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1163
    // during the first pass.  The validation code can compute them relatively easily given the rest
1164
    // of the information.
1165
1166
    // npages is the size of the existing pages vector, which has been created by traversing the
1167
    // pages tree, and as such is a reasonable size.
1168
7.98k
    m->c_linp.npages = npages;
1169
7.98k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(toS(npages));
1170
1171
    // Part 4: open document objects.  We don't care about the order.
1172
1173
7.98k
    if (lc_root.size() != 1) {
1174
0
        stopOnError("found other than one root while calculating linearization data");
1175
0
    }
1176
7.98k
    m->part4.push_back(getObject(*(lc_root.begin())));
1177
7.98k
    for (auto const& og: lc_open_document) {
1178
3.28k
        m->part4.push_back(getObject(og));
1179
3.28k
    }
1180
1181
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1182
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1183
    // any option to set this and also disregards /OpenAction.  We will do the same.
1184
1185
    // First, place the actual first page object itself.
1186
7.98k
    if (pages.empty()) {
1187
36
        stopOnError("no pages found while calculating linearization data");
1188
36
    }
1189
7.98k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1190
7.98k
    if (!lc_first_page_private.contains(first_page_og)) {
1191
997
        stopOnError(
1192
997
            "INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1193
997
            "object not in lc_first_page_private");
1194
997
    }
1195
7.98k
    lc_first_page_private.erase(first_page_og);
1196
7.98k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1197
7.98k
    m->part6.push_back(pages.at(0));
1198
1199
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1200
    // it except to the extent that it groups private and shared objects contiguously for the sake
1201
    // of hint tables.
1202
1203
12.4k
    for (auto const& og: lc_first_page_private) {
1204
12.4k
        m->part6.push_back(getObject(og));
1205
12.4k
    }
1206
1207
12.4k
    for (auto const& og: lc_first_page_shared) {
1208
12.4k
        m->part6.push_back(getObject(og));
1209
12.4k
    }
1210
1211
    // Place the outline dictionary if it goes in the first page section.
1212
7.98k
    if (outlines_in_first_page) {
1213
198
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1214
198
    }
1215
1216
    // Fill in page offset hint table information for the first page. The PDF spec says that
1217
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1218
    // it fills in garbage values for all the shared object identifiers on the first page.
1219
1220
7.98k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1221
1222
    // Part 7: other pages' private objects
1223
1224
    // For each page in order:
1225
12.8k
    for (size_t i = 1; i < toS(npages); ++i) {
1226
        // Place this page's page object
1227
1228
4.85k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1229
4.85k
        if (!lc_other_page_private.contains(page_og)) {
1230
26
            stopOnError(
1231
26
                "INTERNAL ERROR: QPDF::calculateLinearizationData: page object for page " +
1232
26
                std::to_string(i) + " not in lc_other_page_private");
1233
26
        }
1234
4.85k
        lc_other_page_private.erase(page_og);
1235
4.85k
        m->part7.push_back(pages.at(i));
1236
1237
        // Place all non-shared objects referenced by this page, updating the page object count for
1238
        // the hint table.
1239
1240
4.85k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1241
1242
4.85k
        ObjUser ou(ObjUser::ou_page, toI(i));
1243
4.85k
        if (!m->obj_user_to_objects.contains(ou)) {
1244
0
            stopOnError("found unreferenced page while calculating linearization data");
1245
0
        }
1246
45.7k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1247
45.7k
            if (lc_other_page_private.contains(og)) {
1248
4.78k
                lc_other_page_private.erase(og);
1249
4.78k
                m->part7.push_back(getObject(og));
1250
4.78k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1251
4.78k
            }
1252
45.7k
        }
1253
4.85k
    }
1254
    // That should have covered all part7 objects.
1255
7.98k
    if (!lc_other_page_private.empty()) {
1256
0
        stopOnError(
1257
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData:"
1258
0
            " lc_other_page_private is not empty after generation of part7");
1259
0
    }
1260
1261
    // Part 8: other pages' shared objects
1262
1263
    // Order is unimportant.
1264
7.98k
    for (auto const& og: lc_other_page_shared) {
1265
3.30k
        m->part8.push_back(getObject(og));
1266
3.30k
    }
1267
1268
    // Part 9: other objects
1269
1270
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1271
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1272
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1273
    // we throw all remaining objects in arbitrary order.
1274
1275
    // Place the pages tree.
1276
7.98k
    std::set<QPDFObjGen> pages_ogs =
1277
7.98k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1278
7.98k
    if (pages_ogs.empty()) {
1279
187
        stopOnError("found empty pages tree while calculating linearization data");
1280
187
    }
1281
15.7k
    for (auto const& og: pages_ogs) {
1282
15.7k
        if (lc_other.contains(og)) {
1283
7.65k
            lc_other.erase(og);
1284
7.65k
            m->part9.push_back(getObject(og));
1285
7.65k
        }
1286
15.7k
    }
1287
1288
    // Place private thumbnail images in page order.  Slightly more information would be required if
1289
    // we were going to bother with thumbnail hint tables.
1290
19.3k
    for (size_t i = 0; i < toS(npages); ++i) {
1291
11.3k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1292
11.3k
        thumb = getUncompressedObject(thumb, object_stream_data);
1293
11.3k
        if (!thumb.isNull()) {
1294
            // Output the thumbnail itself
1295
1.72k
            QPDFObjGen thumb_og(thumb.getObjGen());
1296
1.72k
            if (lc_thumbnail_private.contains(thumb_og)) {
1297
1.44k
                lc_thumbnail_private.erase(thumb_og);
1298
1.44k
                m->part9.push_back(thumb);
1299
1.44k
            } else {
1300
                // No internal error this time...there's nothing to stop this object from having
1301
                // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1302
                // there's nothing to prevent it from having been in some set other than
1303
                // lc_thumbnail_private.
1304
276
            }
1305
1.72k
            std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, toI(i))];
1306
7.05k
            for (auto const& og: ogs) {
1307
7.05k
                if (lc_thumbnail_private.contains(og)) {
1308
1.40k
                    lc_thumbnail_private.erase(og);
1309
1.40k
                    m->part9.push_back(getObject(og));
1310
1.40k
                }
1311
7.05k
            }
1312
1.72k
        }
1313
11.3k
    }
1314
7.98k
    if (!lc_thumbnail_private.empty()) {
1315
15
        stopOnError(
1316
15
            "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1317
15
            "empty after placing thumbnails");
1318
15
    }
1319
1320
    // Place shared thumbnail objects
1321
7.98k
    for (auto const& og: lc_thumbnail_shared) {
1322
1.06k
        m->part9.push_back(getObject(og));
1323
1.06k
    }
1324
1325
    // Place outlines unless in first page
1326
7.98k
    if (!outlines_in_first_page) {
1327
6.52k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1328
6.52k
    }
1329
1330
    // Place all remaining objects
1331
7.98k
    for (auto const& og: lc_other) {
1332
6.05k
        m->part9.push_back(getObject(og));
1333
6.05k
    }
1334
1335
    // Make sure we got everything exactly once.
1336
1337
7.98k
    size_t num_placed =
1338
7.98k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1339
7.98k
    size_t num_wanted = m->object_to_obj_users.size();
1340
7.98k
    if (num_placed != num_wanted) {
1341
22
        stopOnError(
1342
22
            "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1343
22
            "number of objects placed (num_placed = " +
1344
22
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted));
1345
22
    }
1346
1347
    // Calculate shared object hint table information including references to shared objects from
1348
    // page offset hint data.
1349
1350
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1351
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1352
    // object number to index.  Then populate the shared object information for the pages.
1353
1354
    // Note that two objects never have the same object number, so we can map from object number
1355
    // only without regards to generation.
1356
7.98k
    std::map<int, int> obj_to_index;
1357
1358
7.98k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1359
7.98k
    m->c_shared_object_data.nshared_total =
1360
7.98k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1361
1362
7.98k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1363
29.6k
    for (auto& oh: m->part6) {
1364
29.6k
        int obj = oh.getObjectID();
1365
29.6k
        obj_to_index[obj] = toI(shared.size());
1366
29.6k
        shared.emplace_back(obj);
1367
29.6k
    }
1368
7.98k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1369
7.98k
    if (!m->part8.empty()) {
1370
128
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1371
2.97k
        for (auto& oh: m->part8) {
1372
2.97k
            int obj = oh.getObjectID();
1373
2.97k
            obj_to_index[obj] = toI(shared.size());
1374
2.97k
            shared.emplace_back(obj);
1375
2.97k
        }
1376
128
    }
1377
7.98k
    if (static_cast<size_t>(m->c_shared_object_data.nshared_total) !=
1378
7.98k
        m->c_shared_object_data.entries.size()) {
1379
0
        stopOnError("shared object hint table has wrong number of entries");
1380
0
    }
1381
1382
    // Now compute the list of shared objects for each page after the first page.
1383
1384
12.4k
    for (size_t i = 1; i < toS(npages); ++i) {
1385
4.49k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1386
4.49k
        ObjUser ou(ObjUser::ou_page, toI(i));
1387
4.49k
        if (!m->obj_user_to_objects.contains(ou)) {
1388
0
            stopOnError("found unreferenced page while calculating linearization data");
1389
0
        }
1390
36.0k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1391
36.0k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.contains(og.getObj()))) {
1392
25.3k
                int idx = obj_to_index[og.getObj()];
1393
25.3k
                ++pe.nshared_objects;
1394
25.3k
                pe.shared_identifiers.push_back(idx);
1395
25.3k
            }
1396
36.0k
        }
1397
4.49k
    }
1398
7.98k
}
Unexecuted instantiation: void QPDF::calculateLinearizationData<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::calculateLinearizationData<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&)
Line
Count
Source
963
7.98k
{
964
    // This function calculates the ordering of objects, divides them into the appropriate parts,
965
    // and computes some values for the linearization parameter dictionary and hint tables.  The
966
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
967
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
968
969
7.98k
    if (m->object_to_obj_users.empty()) {
970
        // Note that we can't call optimize here because we don't know whether it should be called
971
        // with or without allow changes.
972
0
        throw std::logic_error(
973
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()");
974
0
    }
975
976
    // Separate objects into the categories sufficient for us to determine which part of the
977
    // linearized file should contain the object.  This categorization is useful for other purposes
978
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
979
980
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
981
    // trailer dictionary in part 11).
982
983
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
984
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
985
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
986
    // for now.
987
988
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
989
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
990
    // /Outlines, then information from /Outlines also appears here.
991
992
    // Part 7 contains remaining objects private to pages other than the first page.
993
994
    // Part 8 contains all remaining shared objects except those that are shared only within
995
    // thumbnails.
996
997
    // Part 9 contains all remaining objects.
998
999
    // We sort objects into the following categories:
1000
1001
    //   * open_document: part 4
1002
1003
    //   * first_page_private: part 6
1004
1005
    //   * first_page_shared: part 6
1006
1007
    //   * other_page_private: part 7
1008
1009
    //   * other_page_shared: part 8
1010
1011
    //   * thumbnail_private: part 9
1012
1013
    //   * thumbnail_shared: part 9
1014
1015
    //   * other: part 9
1016
1017
    //   * outlines: part 6 or 9
1018
1019
7.98k
    m->part4.clear();
1020
7.98k
    m->part6.clear();
1021
7.98k
    m->part7.clear();
1022
7.98k
    m->part8.clear();
1023
7.98k
    m->part9.clear();
1024
7.98k
    m->c_linp = LinParameters();
1025
7.98k
    m->c_page_offset_data = CHPageOffset();
1026
7.98k
    m->c_shared_object_data = CHSharedObject();
1027
7.98k
    m->c_outline_data = HGeneric();
1028
1029
7.98k
    QPDFObjectHandle root = getRoot();
1030
7.98k
    bool outlines_in_first_page = false;
1031
7.98k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
1032
7.98k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
1033
7.98k
    if (pagemode.isName()) {
1034
521
        if (pagemode.getName() == "/UseOutlines") {
1035
422
            if (root.hasKey("/Outlines")) {
1036
199
                outlines_in_first_page = true;
1037
223
            } else {
1038
223
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1039
223
            }
1040
422
        }
1041
521
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1042
521
    }
1043
1044
7.98k
    std::set<std::string> open_document_keys;
1045
7.98k
    open_document_keys.insert("/ViewerPreferences");
1046
7.98k
    open_document_keys.insert("/PageMode");
1047
7.98k
    open_document_keys.insert("/Threads");
1048
7.98k
    open_document_keys.insert("/OpenAction");
1049
7.98k
    open_document_keys.insert("/AcroForm");
1050
1051
7.98k
    std::set<QPDFObjGen> lc_open_document;
1052
7.98k
    std::set<QPDFObjGen> lc_first_page_private;
1053
7.98k
    std::set<QPDFObjGen> lc_first_page_shared;
1054
7.98k
    std::set<QPDFObjGen> lc_other_page_private;
1055
7.98k
    std::set<QPDFObjGen> lc_other_page_shared;
1056
7.98k
    std::set<QPDFObjGen> lc_thumbnail_private;
1057
7.98k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1058
7.98k
    std::set<QPDFObjGen> lc_other;
1059
7.98k
    std::set<QPDFObjGen> lc_outlines;
1060
7.98k
    std::set<QPDFObjGen> lc_root;
1061
1062
77.6k
    for (auto& oiter: m->object_to_obj_users) {
1063
77.6k
        QPDFObjGen const& og = oiter.first;
1064
77.6k
        std::set<ObjUser>& ous = oiter.second;
1065
1066
77.6k
        bool in_open_document = false;
1067
77.6k
        bool in_first_page = false;
1068
77.6k
        int other_pages = 0;
1069
77.6k
        int thumbs = 0;
1070
77.6k
        int others = 0;
1071
77.6k
        bool in_outlines = false;
1072
77.6k
        bool is_root = false;
1073
1074
147k
        for (auto const& ou: ous) {
1075
147k
            switch (ou.ou_type) {
1076
7.63k
            case ObjUser::ou_trailer_key:
1077
7.63k
                if (ou.key == "/Encrypt") {
1078
0
                    in_open_document = true;
1079
7.63k
                } else {
1080
7.63k
                    ++others;
1081
7.63k
                }
1082
7.63k
                break;
1083
1084
8.85k
            case ObjUser::ou_thumb:
1085
8.85k
                ++thumbs;
1086
8.85k
                break;
1087
1088
40.6k
            case ObjUser::ou_root_key:
1089
40.6k
                if (open_document_keys.contains(ou.key)) {
1090
3.31k
                    in_open_document = true;
1091
37.3k
                } else if (ou.key == "/Outlines") {
1092
866
                    in_outlines = true;
1093
36.4k
                } else {
1094
36.4k
                    ++others;
1095
36.4k
                }
1096
40.6k
                break;
1097
1098
82.3k
            case ObjUser::ou_page:
1099
82.3k
                if (ou.pageno == 0) {
1100
36.2k
                    in_first_page = true;
1101
46.1k
                } else {
1102
46.1k
                    ++other_pages;
1103
46.1k
                }
1104
82.3k
                break;
1105
1106
7.98k
            case ObjUser::ou_root:
1107
7.98k
                is_root = true;
1108
7.98k
                break;
1109
1110
0
            case ObjUser::ou_bad:
1111
0
                stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: invalid user type");
1112
0
                break;
1113
147k
            }
1114
147k
        }
1115
1116
77.6k
        if (is_root) {
1117
7.98k
            lc_root.insert(og);
1118
69.6k
        } else if (in_outlines) {
1119
825
            lc_outlines.insert(og);
1120
68.8k
        } else if (in_open_document) {
1121
3.28k
            lc_open_document.insert(og);
1122
65.5k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1123
19.4k
            lc_first_page_private.insert(og);
1124
46.1k
        } else if (in_first_page) {
1125
14.4k
            lc_first_page_shared.insert(og);
1126
31.6k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1127
9.81k
            lc_other_page_private.insert(og);
1128
21.8k
        } else if (other_pages > 1) {
1129
3.31k
            lc_other_page_shared.insert(og);
1130
18.5k
        } else if ((thumbs == 1) && (others == 0)) {
1131
3.34k
            lc_thumbnail_private.insert(og);
1132
15.2k
        } else if (thumbs > 1) {
1133
1.29k
            lc_thumbnail_shared.insert(og);
1134
13.9k
        } else {
1135
13.9k
            lc_other.insert(og);
1136
13.9k
        }
1137
77.6k
    }
1138
1139
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1140
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1141
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1142
    // and the above section can be concerned only with categorization.  Note that sets of
1143
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1144
    // the possible exception of hint tables which we won't see here anyway.  That means that
1145
    // running calculateLinearizationData() on a linearized file should give results identical to
1146
    // the original file ordering.
1147
1148
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1149
    // code optimization if necessary. Premature optimization is the root of all evil.
1150
7.98k
    std::vector<QPDFObjectHandle> pages;
1151
7.98k
    { // local scope
1152
        // Map all page objects to the containing object stream.  This should be a no-op in a
1153
        // properly linearized file.
1154
12.9k
        for (auto oh: getAllPages()) {
1155
12.9k
            pages.push_back(getUncompressedObject(oh, object_stream_data));
1156
12.9k
        }
1157
7.98k
    }
1158
7.98k
    int npages = toI(pages.size());
1159
1160
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1161
    // initialize any items that deal with object numbers or counts but not any items that deal with
1162
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1163
    // during the first pass.  The validation code can compute them relatively easily given the rest
1164
    // of the information.
1165
1166
    // npages is the size of the existing pages vector, which has been created by traversing the
1167
    // pages tree, and as such is a reasonable size.
1168
7.98k
    m->c_linp.npages = npages;
1169
7.98k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(toS(npages));
1170
1171
    // Part 4: open document objects.  We don't care about the order.
1172
1173
7.98k
    if (lc_root.size() != 1) {
1174
0
        stopOnError("found other than one root while calculating linearization data");
1175
0
    }
1176
7.98k
    m->part4.push_back(getObject(*(lc_root.begin())));
1177
7.98k
    for (auto const& og: lc_open_document) {
1178
3.28k
        m->part4.push_back(getObject(og));
1179
3.28k
    }
1180
1181
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1182
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1183
    // any option to set this and also disregards /OpenAction.  We will do the same.
1184
1185
    // First, place the actual first page object itself.
1186
7.98k
    if (pages.empty()) {
1187
36
        stopOnError("no pages found while calculating linearization data");
1188
36
    }
1189
7.98k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1190
7.98k
    if (!lc_first_page_private.contains(first_page_og)) {
1191
997
        stopOnError(
1192
997
            "INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1193
997
            "object not in lc_first_page_private");
1194
997
    }
1195
7.98k
    lc_first_page_private.erase(first_page_og);
1196
7.98k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1197
7.98k
    m->part6.push_back(pages.at(0));
1198
1199
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1200
    // it except to the extent that it groups private and shared objects contiguously for the sake
1201
    // of hint tables.
1202
1203
12.4k
    for (auto const& og: lc_first_page_private) {
1204
12.4k
        m->part6.push_back(getObject(og));
1205
12.4k
    }
1206
1207
12.4k
    for (auto const& og: lc_first_page_shared) {
1208
12.4k
        m->part6.push_back(getObject(og));
1209
12.4k
    }
1210
1211
    // Place the outline dictionary if it goes in the first page section.
1212
7.98k
    if (outlines_in_first_page) {
1213
198
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1214
198
    }
1215
1216
    // Fill in page offset hint table information for the first page. The PDF spec says that
1217
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1218
    // it fills in garbage values for all the shared object identifiers on the first page.
1219
1220
7.98k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1221
1222
    // Part 7: other pages' private objects
1223
1224
    // For each page in order:
1225
12.8k
    for (size_t i = 1; i < toS(npages); ++i) {
1226
        // Place this page's page object
1227
1228
4.85k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1229
4.85k
        if (!lc_other_page_private.contains(page_og)) {
1230
26
            stopOnError(
1231
26
                "INTERNAL ERROR: QPDF::calculateLinearizationData: page object for page " +
1232
26
                std::to_string(i) + " not in lc_other_page_private");
1233
26
        }
1234
4.85k
        lc_other_page_private.erase(page_og);
1235
4.85k
        m->part7.push_back(pages.at(i));
1236
1237
        // Place all non-shared objects referenced by this page, updating the page object count for
1238
        // the hint table.
1239
1240
4.85k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1241
1242
4.85k
        ObjUser ou(ObjUser::ou_page, toI(i));
1243
4.85k
        if (!m->obj_user_to_objects.contains(ou)) {
1244
0
            stopOnError("found unreferenced page while calculating linearization data");
1245
0
        }
1246
45.7k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1247
45.7k
            if (lc_other_page_private.contains(og)) {
1248
4.78k
                lc_other_page_private.erase(og);
1249
4.78k
                m->part7.push_back(getObject(og));
1250
4.78k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1251
4.78k
            }
1252
45.7k
        }
1253
4.85k
    }
1254
    // That should have covered all part7 objects.
1255
7.98k
    if (!lc_other_page_private.empty()) {
1256
0
        stopOnError(
1257
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData:"
1258
0
            " lc_other_page_private is not empty after generation of part7");
1259
0
    }
1260
1261
    // Part 8: other pages' shared objects
1262
1263
    // Order is unimportant.
1264
7.98k
    for (auto const& og: lc_other_page_shared) {
1265
3.30k
        m->part8.push_back(getObject(og));
1266
3.30k
    }
1267
1268
    // Part 9: other objects
1269
1270
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1271
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1272
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1273
    // we throw all remaining objects in arbitrary order.
1274
1275
    // Place the pages tree.
1276
7.98k
    std::set<QPDFObjGen> pages_ogs =
1277
7.98k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1278
7.98k
    if (pages_ogs.empty()) {
1279
187
        stopOnError("found empty pages tree while calculating linearization data");
1280
187
    }
1281
15.7k
    for (auto const& og: pages_ogs) {
1282
15.7k
        if (lc_other.contains(og)) {
1283
7.65k
            lc_other.erase(og);
1284
7.65k
            m->part9.push_back(getObject(og));
1285
7.65k
        }
1286
15.7k
    }
1287
1288
    // Place private thumbnail images in page order.  Slightly more information would be required if
1289
    // we were going to bother with thumbnail hint tables.
1290
19.3k
    for (size_t i = 0; i < toS(npages); ++i) {
1291
11.3k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1292
11.3k
        thumb = getUncompressedObject(thumb, object_stream_data);
1293
11.3k
        if (!thumb.isNull()) {
1294
            // Output the thumbnail itself
1295
1.72k
            QPDFObjGen thumb_og(thumb.getObjGen());
1296
1.72k
            if (lc_thumbnail_private.contains(thumb_og)) {
1297
1.44k
                lc_thumbnail_private.erase(thumb_og);
1298
1.44k
                m->part9.push_back(thumb);
1299
1.44k
            } else {
1300
                // No internal error this time...there's nothing to stop this object from having
1301
                // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1302
                // there's nothing to prevent it from having been in some set other than
1303
                // lc_thumbnail_private.
1304
276
            }
1305
1.72k
            std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, toI(i))];
1306
7.05k
            for (auto const& og: ogs) {
1307
7.05k
                if (lc_thumbnail_private.contains(og)) {
1308
1.40k
                    lc_thumbnail_private.erase(og);
1309
1.40k
                    m->part9.push_back(getObject(og));
1310
1.40k
                }
1311
7.05k
            }
1312
1.72k
        }
1313
11.3k
    }
1314
7.98k
    if (!lc_thumbnail_private.empty()) {
1315
15
        stopOnError(
1316
15
            "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1317
15
            "empty after placing thumbnails");
1318
15
    }
1319
1320
    // Place shared thumbnail objects
1321
7.98k
    for (auto const& og: lc_thumbnail_shared) {
1322
1.06k
        m->part9.push_back(getObject(og));
1323
1.06k
    }
1324
1325
    // Place outlines unless in first page
1326
7.98k
    if (!outlines_in_first_page) {
1327
6.52k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1328
6.52k
    }
1329
1330
    // Place all remaining objects
1331
7.98k
    for (auto const& og: lc_other) {
1332
6.05k
        m->part9.push_back(getObject(og));
1333
6.05k
    }
1334
1335
    // Make sure we got everything exactly once.
1336
1337
7.98k
    size_t num_placed =
1338
7.98k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1339
7.98k
    size_t num_wanted = m->object_to_obj_users.size();
1340
7.98k
    if (num_placed != num_wanted) {
1341
22
        stopOnError(
1342
22
            "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1343
22
            "number of objects placed (num_placed = " +
1344
22
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted));
1345
22
    }
1346
1347
    // Calculate shared object hint table information including references to shared objects from
1348
    // page offset hint data.
1349
1350
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1351
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1352
    // object number to index.  Then populate the shared object information for the pages.
1353
1354
    // Note that two objects never have the same object number, so we can map from object number
1355
    // only without regards to generation.
1356
7.98k
    std::map<int, int> obj_to_index;
1357
1358
7.98k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1359
7.98k
    m->c_shared_object_data.nshared_total =
1360
7.98k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1361
1362
7.98k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1363
29.6k
    for (auto& oh: m->part6) {
1364
29.6k
        int obj = oh.getObjectID();
1365
29.6k
        obj_to_index[obj] = toI(shared.size());
1366
29.6k
        shared.emplace_back(obj);
1367
29.6k
    }
1368
7.98k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1369
7.98k
    if (!m->part8.empty()) {
1370
128
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1371
2.97k
        for (auto& oh: m->part8) {
1372
2.97k
            int obj = oh.getObjectID();
1373
2.97k
            obj_to_index[obj] = toI(shared.size());
1374
2.97k
            shared.emplace_back(obj);
1375
2.97k
        }
1376
128
    }
1377
7.98k
    if (static_cast<size_t>(m->c_shared_object_data.nshared_total) !=
1378
7.98k
        m->c_shared_object_data.entries.size()) {
1379
0
        stopOnError("shared object hint table has wrong number of entries");
1380
0
    }
1381
1382
    // Now compute the list of shared objects for each page after the first page.
1383
1384
12.4k
    for (size_t i = 1; i < toS(npages); ++i) {
1385
4.49k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1386
4.49k
        ObjUser ou(ObjUser::ou_page, toI(i));
1387
4.49k
        if (!m->obj_user_to_objects.contains(ou)) {
1388
0
            stopOnError("found unreferenced page while calculating linearization data");
1389
0
        }
1390
36.0k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1391
36.0k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.contains(og.getObj()))) {
1392
25.3k
                int idx = obj_to_index[og.getObj()];
1393
25.3k
                ++pe.nshared_objects;
1394
25.3k
                pe.shared_identifiers.push_back(idx);
1395
25.3k
            }
1396
36.0k
        }
1397
4.49k
    }
1398
7.98k
}
1399
1400
template <typename T>
1401
void
1402
QPDF::pushOutlinesToPart(
1403
    std::vector<QPDFObjectHandle>& part,
1404
    std::set<QPDFObjGen>& lc_outlines,
1405
    T const& object_stream_data)
1406
6.72k
{
1407
6.72k
    QPDFObjectHandle root = getRoot();
1408
6.72k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1409
6.72k
    if (outlines.isNull()) {
1410
6.46k
        return;
1411
6.46k
    }
1412
260
    outlines = getUncompressedObject(outlines, object_stream_data);
1413
260
    QPDFObjGen outlines_og(outlines.getObjGen());
1414
260
    QTC::TC(
1415
260
        "qpdf",
1416
260
        "QPDF lin outlines in part",
1417
260
        ((&part == (&m->part6))       ? 0
1418
260
             : (&part == (&m->part9)) ? 1
1419
62
                                      : 9999)); // can't happen
1420
260
    m->c_outline_data.first_object = outlines_og.getObj();
1421
260
    m->c_outline_data.nobjects = 1;
1422
260
    lc_outlines.erase(outlines_og);
1423
260
    part.push_back(outlines);
1424
469
    for (auto const& og: lc_outlines) {
1425
469
        part.push_back(getObject(og));
1426
469
        ++m->c_outline_data.nobjects;
1427
469
    }
1428
260
}
Unexecuted instantiation: void QPDF::pushOutlinesToPart<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::pushOutlinesToPart<QPDFWriter::ObjTable>(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, QPDFWriter::ObjTable const&)
Line
Count
Source
1406
6.72k
{
1407
6.72k
    QPDFObjectHandle root = getRoot();
1408
6.72k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1409
6.72k
    if (outlines.isNull()) {
1410
6.46k
        return;
1411
6.46k
    }
1412
260
    outlines = getUncompressedObject(outlines, object_stream_data);
1413
260
    QPDFObjGen outlines_og(outlines.getObjGen());
1414
260
    QTC::TC(
1415
260
        "qpdf",
1416
260
        "QPDF lin outlines in part",
1417
260
        ((&part == (&m->part6))       ? 0
1418
260
             : (&part == (&m->part9)) ? 1
1419
62
                                      : 9999)); // can't happen
1420
260
    m->c_outline_data.first_object = outlines_og.getObj();
1421
260
    m->c_outline_data.nobjects = 1;
1422
260
    lc_outlines.erase(outlines_og);
1423
260
    part.push_back(outlines);
1424
469
    for (auto const& og: lc_outlines) {
1425
469
        part.push_back(getObject(og));
1426
469
        ++m->c_outline_data.nobjects;
1427
469
    }
1428
260
}
1429
1430
void
1431
QPDF::getLinearizedParts(
1432
    QPDFWriter::ObjTable const& obj,
1433
    std::vector<QPDFObjectHandle>& part4,
1434
    std::vector<QPDFObjectHandle>& part6,
1435
    std::vector<QPDFObjectHandle>& part7,
1436
    std::vector<QPDFObjectHandle>& part8,
1437
    std::vector<QPDFObjectHandle>& part9)
1438
7.98k
{
1439
7.98k
    calculateLinearizationData(obj);
1440
7.98k
    part4 = m->part4;
1441
7.98k
    part6 = m->part6;
1442
7.98k
    part7 = m->part7;
1443
7.98k
    part8 = m->part8;
1444
7.98k
    part9 = m->part9;
1445
7.98k
}
1446
1447
// Return the number of bits needed to represent val: 0 for 0, otherwise the position of the
// highest set bit plus one (1 -> 1, 2 -> 2, 255 -> 8, 256 -> 9).
//
// The value is examined as its unsigned bit pattern, so the loop always terminates.  A recursive
// formulation based on `val >> 1` never reaches zero for a negative argument when the shift is
// arithmetic (-1 >> 1 == -1).  Callers in this file pass non-negative counts and max - min
// differences; a negative argument simply yields the full width of int.
static inline int
nbits(int val)
{
    int width = 0;
    for (auto remaining = static_cast<unsigned int>(val); remaining != 0; remaining >>= 1) {
        ++width;
    }
    return width;
}
1452
1453
int
1454
QPDF::outputLengthNextN(
1455
    int in_object, int n, QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1456
47.3k
{
1457
    // Figure out the length of a series of n consecutive objects in the output file starting with
1458
    // whatever object in_object from the input file mapped to.
1459
1460
47.3k
    int first = obj[in_object].renumber;
1461
47.3k
    int last = first + n;
1462
47.3k
    if (first <= 0) {
1463
0
        stopOnError("found object that is not renumbered while writing linearization data");
1464
0
    }
1465
47.3k
    qpdf_offset_t length = 0;
1466
131k
    for (int i = first; i < last; ++i) {
1467
84.4k
        auto l = new_obj[i].length;
1468
84.4k
        if (l == 0) {
1469
0
            stopOnError("found item with unknown length while writing linearization data");
1470
0
        }
1471
84.4k
        length += l;
1472
84.4k
    }
1473
47.3k
    return toI(length);
1474
47.3k
}
1475
1476
void
1477
QPDF::calculateHPageOffset(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1478
5.95k
{
1479
    // Page Offset Hint Table
1480
1481
    // We are purposely leaving some values set to their initial zero values.
1482
1483
5.95k
    std::vector<QPDFObjectHandle> const& pages = getAllPages();
1484
5.95k
    size_t npages = pages.size();
1485
5.95k
    CHPageOffset& cph = m->c_page_offset_data;
1486
5.95k
    std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1487
1488
    // Calculate minimum and maximum values for number of objects per page and page length.
1489
1490
5.95k
    int min_nobjects = cphe.at(0).nobjects;
1491
5.95k
    int max_nobjects = min_nobjects;
1492
5.95k
    int min_length = outputLengthNextN(pages.at(0).getObjectID(), min_nobjects, new_obj, obj);
1493
5.95k
    int max_length = min_length;
1494
5.95k
    int max_shared = cphe.at(0).nshared_objects;
1495
1496
5.95k
    HPageOffset& ph = m->page_offset_hints;
1497
5.95k
    std::vector<HPageOffsetEntry>& phe = ph.entries;
1498
    // npages is the size of the existing pages array.
1499
5.95k
    phe = std::vector<HPageOffsetEntry>(npages);
1500
1501
16.1k
    for (unsigned int i = 0; i < npages; ++i) {
1502
        // Calculate values for each page, assigning full values to the delta items.  They will be
1503
        // adjusted later.
1504
1505
        // Repeat calculations for page 0 so we can assign to phe[i] without duplicating those
1506
        // assignments.
1507
1508
10.1k
        int nobjects = cphe.at(i).nobjects;
1509
10.1k
        int length = outputLengthNextN(pages.at(i).getObjectID(), nobjects, new_obj, obj);
1510
10.1k
        int nshared = cphe.at(i).nshared_objects;
1511
1512
10.1k
        min_nobjects = std::min(min_nobjects, nobjects);
1513
10.1k
        max_nobjects = std::max(max_nobjects, nobjects);
1514
10.1k
        min_length = std::min(min_length, length);
1515
10.1k
        max_length = std::max(max_length, length);
1516
10.1k
        max_shared = std::max(max_shared, nshared);
1517
1518
10.1k
        phe.at(i).delta_nobjects = nobjects;
1519
10.1k
        phe.at(i).delta_page_length = length;
1520
10.1k
        phe.at(i).nshared_objects = nshared;
1521
10.1k
    }
1522
1523
5.95k
    ph.min_nobjects = min_nobjects;
1524
5.95k
    ph.first_page_offset = new_obj[obj[pages.at(0)].renumber].xref.getOffset();
1525
5.95k
    ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
1526
5.95k
    ph.min_page_length = min_length;
1527
5.95k
    ph.nbits_delta_page_length = nbits(max_length - min_length);
1528
5.95k
    ph.nbits_nshared_objects = nbits(max_shared);
1529
5.95k
    ph.nbits_shared_identifier = nbits(m->c_shared_object_data.nshared_total);
1530
5.95k
    ph.shared_denominator = 4; // doesn't matter
1531
1532
    // It isn't clear how to compute content offset and content length.  Since we are not
1533
    // interleaving page objects with the content stream, we'll use the same values for content
1534
    // length as page length.  We will use 0 as content offset because this is what Adobe does
1535
    // (implementation note 127) and pdlin as well.
1536
5.95k
    ph.nbits_delta_content_length = ph.nbits_delta_page_length;
1537
5.95k
    ph.min_content_length = ph.min_page_length;
1538
1539
16.1k
    for (size_t i = 0; i < npages; ++i) {
1540
        // Adjust delta entries
1541
10.1k
        if ((phe.at(i).delta_nobjects < min_nobjects) ||
1542
10.1k
            (phe.at(i).delta_page_length < min_length)) {
1543
0
            stopOnError(
1544
0
                "found too small delta nobjects or delta page length while writing "
1545
0
                "linearization data");
1546
0
        }
1547
10.1k
        phe.at(i).delta_nobjects -= min_nobjects;
1548
10.1k
        phe.at(i).delta_page_length -= min_length;
1549
10.1k
        phe.at(i).delta_content_length = phe.at(i).delta_page_length;
1550
1551
29.5k
        for (size_t j = 0; j < toS(cphe.at(i).nshared_objects); ++j) {
1552
19.3k
            phe.at(i).shared_identifiers.push_back(cphe.at(i).shared_identifiers.at(j));
1553
19.3k
            phe.at(i).shared_numerators.push_back(0);
1554
19.3k
        }
1555
10.1k
    }
1556
5.95k
}
1557
1558
void
1559
QPDF::calculateHSharedObject(
1560
    QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1561
5.95k
{
1562
5.95k
    CHSharedObject& cso = m->c_shared_object_data;
1563
5.95k
    std::vector<CHSharedObjectEntry>& csoe = cso.entries;
1564
5.95k
    HSharedObject& so = m->shared_object_hints;
1565
5.95k
    std::vector<HSharedObjectEntry>& soe = so.entries;
1566
5.95k
    soe.clear();
1567
1568
5.95k
    int min_length = outputLengthNextN(csoe.at(0).object, 1, new_obj, obj);
1569
5.95k
    int max_length = min_length;
1570
1571
30.9k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1572
        // Assign absolute numbers to deltas; adjust later
1573
24.9k
        int length = outputLengthNextN(csoe.at(i).object, 1, new_obj, obj);
1574
24.9k
        min_length = std::min(min_length, length);
1575
24.9k
        max_length = std::max(max_length, length);
1576
24.9k
        soe.emplace_back();
1577
24.9k
        soe.at(i).delta_group_length = length;
1578
24.9k
    }
1579
5.95k
    if (soe.size() != toS(cso.nshared_total)) {
1580
0
        stopOnError("soe has wrong size after initialization");
1581
0
    }
1582
1583
5.95k
    so.nshared_total = cso.nshared_total;
1584
5.95k
    so.nshared_first_page = cso.nshared_first_page;
1585
5.95k
    if (so.nshared_total > so.nshared_first_page) {
1586
108
        so.first_shared_obj = obj[cso.first_shared_obj].renumber;
1587
108
        so.min_group_length = min_length;
1588
108
        so.first_shared_offset = new_obj[so.first_shared_obj].xref.getOffset();
1589
108
    }
1590
5.95k
    so.min_group_length = min_length;
1591
5.95k
    so.nbits_delta_group_length = nbits(max_length - min_length);
1592
1593
30.9k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1594
        // Adjust deltas
1595
24.9k
        if (soe.at(i).delta_group_length < min_length) {
1596
0
            stopOnError("found too small group length while writing linearization data");
1597
0
        }
1598
24.9k
        soe.at(i).delta_group_length -= min_length;
1599
24.9k
    }
1600
5.95k
}
1601
1602
void
1603
QPDF::calculateHOutline(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1604
5.95k
{
1605
5.95k
    HGeneric& cho = m->c_outline_data;
1606
1607
5.95k
    if (cho.nobjects == 0) {
1608
5.75k
        return;
1609
5.75k
    }
1610
1611
200
    HGeneric& ho = m->outline_hints;
1612
1613
200
    ho.first_object = obj[cho.first_object].renumber;
1614
200
    ho.first_object_offset = new_obj[ho.first_object].xref.getOffset();
1615
200
    ho.nobjects = cho.nobjects;
1616
200
    ho.group_length = outputLengthNextN(cho.first_object, ho.nobjects, new_obj, obj);
1617
200
}
1618
1619
template <class T, class int_type>
1620
static void
1621
write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, int bits, int_type T::* field)
1622
47.6k
{
1623
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1624
1625
173k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1626
125k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1627
125k
    }
1628
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1629
    // start on a byte boundary.
1630
47.6k
    w.flush();
1631
47.6k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Line
Count
Source
1622
11.9k
{
1623
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1624
1625
32.2k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1626
20.3k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1627
20.3k
    }
1628
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1629
    // start on a byte boundary.
1630
11.9k
    w.flush();
1631
11.9k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, long long>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Line
Count
Source
1622
17.8k
{
1623
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1624
1625
48.4k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1626
30.5k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1627
30.5k
    }
1628
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1629
    // start on a byte boundary.
1630
17.8k
    w.flush();
1631
17.8k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HSharedObjectEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
Line
Count
Source
1622
17.8k
{
1623
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1624
1625
92.8k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1626
74.9k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1627
74.9k
    }
1628
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1629
    // start on a byte boundary.
1630
17.8k
    w.flush();
1631
17.8k
}
1632
1633
template <class T>
1634
static void
1635
write_vector_vector(
1636
    BitWriter& w,
1637
    int nitems1,
1638
    std::vector<T>& vec1,
1639
    int T::* nitems2,
1640
    int bits,
1641
    std::vector<int> T::* vec2)
1642
11.9k
{
1643
    // nitems1 times, write nitems2 (from the ith element of vec1) items from the vec2 vector field
1644
    // of the ith item of vec1.
1645
32.2k
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
1646
59.0k
        for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2) {
1647
38.6k
            w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)), QIntC::to_size(bits));
1648
38.6k
        }
1649
20.3k
    }
1650
11.9k
    w.flush();
1651
11.9k
}
1652
1653
void
1654
QPDF::writeHPageOffset(BitWriter& w)
1655
5.95k
{
1656
5.95k
    HPageOffset& t = m->page_offset_hints;
1657
1658
5.95k
    w.writeBitsInt(t.min_nobjects, 32);               // 1
1659
5.95k
    w.writeBits(toULL(t.first_page_offset), 32);      // 2
1660
5.95k
    w.writeBitsInt(t.nbits_delta_nobjects, 16);       // 3
1661
5.95k
    w.writeBitsInt(t.min_page_length, 32);            // 4
1662
5.95k
    w.writeBitsInt(t.nbits_delta_page_length, 16);    // 5
1663
5.95k
    w.writeBits(toULL(t.min_content_offset), 32);     // 6
1664
5.95k
    w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7
1665
5.95k
    w.writeBitsInt(t.min_content_length, 32);         // 8
1666
5.95k
    w.writeBitsInt(t.nbits_delta_content_length, 16); // 9
1667
5.95k
    w.writeBitsInt(t.nbits_nshared_objects, 16);      // 10
1668
5.95k
    w.writeBitsInt(t.nbits_shared_identifier, 16);    // 11
1669
5.95k
    w.writeBitsInt(t.nbits_shared_numerator, 16);     // 12
1670
5.95k
    w.writeBitsInt(t.shared_denominator, 16);         // 13
1671
1672
5.95k
    int nitems = toI(getAllPages().size());
1673
5.95k
    std::vector<HPageOffsetEntry>& entries = t.entries;
1674
1675
5.95k
    write_vector_int(w, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
1676
5.95k
    write_vector_int(
1677
5.95k
        w, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
1678
5.95k
    write_vector_int(
1679
5.95k
        w, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
1680
5.95k
    write_vector_vector(
1681
5.95k
        w,
1682
5.95k
        nitems,
1683
5.95k
        entries,
1684
5.95k
        &HPageOffsetEntry::nshared_objects,
1685
5.95k
        t.nbits_shared_identifier,
1686
5.95k
        &HPageOffsetEntry::shared_identifiers);
1687
5.95k
    write_vector_vector(
1688
5.95k
        w,
1689
5.95k
        nitems,
1690
5.95k
        entries,
1691
5.95k
        &HPageOffsetEntry::nshared_objects,
1692
5.95k
        t.nbits_shared_numerator,
1693
5.95k
        &HPageOffsetEntry::shared_numerators);
1694
5.95k
    write_vector_int(
1695
5.95k
        w, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
1696
5.95k
    write_vector_int(
1697
5.95k
        w, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
1698
5.95k
}
1699
1700
void
1701
QPDF::writeHSharedObject(BitWriter& w)
1702
5.95k
{
1703
5.95k
    HSharedObject& t = m->shared_object_hints;
1704
1705
5.95k
    w.writeBitsInt(t.first_shared_obj, 32);         // 1
1706
5.95k
    w.writeBits(toULL(t.first_shared_offset), 32);  // 2
1707
5.95k
    w.writeBitsInt(t.nshared_first_page, 32);       // 3
1708
5.95k
    w.writeBitsInt(t.nshared_total, 32);            // 4
1709
5.95k
    w.writeBitsInt(t.nbits_nobjects, 16);           // 5
1710
5.95k
    w.writeBitsInt(t.min_group_length, 32);         // 6
1711
5.95k
    w.writeBitsInt(t.nbits_delta_group_length, 16); // 7
1712
1713
5.95k
    QTC::TC(
1714
5.95k
        "qpdf",
1715
5.95k
        "QPDF lin write nshared_total > nshared_first_page",
1716
5.95k
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
1717
1718
5.95k
    int nitems = t.nshared_total;
1719
5.95k
    std::vector<HSharedObjectEntry>& entries = t.entries;
1720
1721
5.95k
    write_vector_int(
1722
5.95k
        w, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
1723
5.95k
    write_vector_int(w, nitems, entries, 1, &HSharedObjectEntry::signature_present);
1724
30.9k
    for (size_t i = 0; i < toS(nitems); ++i) {
1725
        // If signature were present, we'd have to write a 128-bit hash.
1726
24.9k
        if (entries.at(i).signature_present != 0) {
1727
0
            stopOnError("found unexpected signature present while writing linearization data");
1728
0
        }
1729
24.9k
    }
1730
5.95k
    write_vector_int(w, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
1731
5.95k
}
1732
1733
void
1734
QPDF::writeHGeneric(BitWriter& w, HGeneric& t)
1735
200
{
1736
200
    w.writeBitsInt(t.first_object, 32);            // 1
1737
200
    w.writeBits(toULL(t.first_object_offset), 32); // 2
1738
200
    w.writeBitsInt(t.nobjects, 32);                // 3
1739
200
    w.writeBitsInt(t.group_length, 32);            // 4
1740
200
}
1741
1742
void
1743
QPDF::generateHintStream(
1744
    QPDFWriter::NewObjTable const& new_obj,
1745
    QPDFWriter::ObjTable const& obj,
1746
    std::string& hint_buffer,
1747
    int& S,
1748
    int& O,
1749
    bool compressed)
1750
5.95k
{
1751
    // Populate actual hint table values
1752
5.95k
    calculateHPageOffset(new_obj, obj);
1753
5.95k
    calculateHSharedObject(new_obj, obj);
1754
5.95k
    calculateHOutline(new_obj, obj);
1755
1756
    // Write the hint stream itself into a compressed memory buffer. Write through a counter so we
1757
    // can get offsets.
1758
5.95k
    std::string b;
1759
5.95k
    auto c = compressed
1760
5.95k
        ? std::make_unique<pl::Count>(
1761
5.95k
              0, b, pl::create<Pl_Flate>(pl::create<pl::String>(hint_buffer), Pl_Flate::a_deflate))
1762
5.95k
        : std::make_unique<pl::Count>(0, hint_buffer);
1763
1764
5.95k
    BitWriter w(c.get());
1765
1766
5.95k
    writeHPageOffset(w);
1767
5.95k
    S = toI(c->getCount());
1768
5.95k
    writeHSharedObject(w);
1769
5.95k
    O = 0;
1770
5.95k
    if (m->outline_hints.nobjects > 0) {
1771
200
        O = toI(c->getCount());
1772
200
        writeHGeneric(w, m->outline_hints);
1773
200
    }
1774
5.95k
    c->finish();
1775
5.95k
}