Coverage Report

Created: 2024-09-08 06:06

/src/qpdf/libqpdf/QPDF_linearization.cc
Line
Count
Source (jump to first uncovered line)
1
// See doc/linearization.
2
3
#include <qpdf/QPDF.hh>
4
5
#include <qpdf/BitStream.hh>
6
#include <qpdf/BitWriter.hh>
7
#include <qpdf/Pl_Buffer.hh>
8
#include <qpdf/Pl_Count.hh>
9
#include <qpdf/Pl_Flate.hh>
10
#include <qpdf/QPDFExc.hh>
11
#include <qpdf/QPDFLogger.hh>
12
#include <qpdf/QPDFWriter_private.hh>
13
#include <qpdf/QTC.hh>
14
#include <qpdf/QUtil.hh>
15
16
#include <algorithm>
17
#include <cmath>
18
#include <cstring>
19
20
template <class T, class int_type>
21
static void
22
load_vector_int(
23
    BitStream& bit_stream, int nitems, std::vector<T>& vec, int bits_wanted, int_type T::*field)
24
0
{
25
0
    bool append = vec.empty();
26
    // nitems times, read bits_wanted from the given bit stream, storing results in the ith vector
27
    // entry.
28
29
0
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
30
0
        if (append) {
31
0
            vec.push_back(T());
32
0
        }
33
0
        vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
34
0
    }
35
0
    if (QIntC::to_int(vec.size()) != nitems) {
36
0
        throw std::logic_error("vector has wrong size in load_vector_int");
37
0
    }
38
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
39
    // start on a byte boundary.
40
0
    bit_stream.skipToNextByte();
41
0
}
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, int>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, long long>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HSharedObjectEntry, int>(BitStream&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
42
43
template <class T>
44
static void
45
load_vector_vector(
46
    BitStream& bit_stream,
47
    int nitems1,
48
    std::vector<T>& vec1,
49
    int T::*nitems2,
50
    int bits_wanted,
51
    std::vector<int> T::*vec2)
52
0
{
53
    // nitems1 times, read nitems2 (from the ith element of vec1) items into the vec2 vector field
54
    // of the ith item of vec1.
55
0
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
56
0
        for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2) {
57
0
            (vec1.at(i1).*vec2).push_back(bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
58
0
        }
59
0
    }
60
0
    bit_stream.skipToNextByte();
61
0
}
62
63
void
64
QPDF::linearizationWarning(std::string_view msg)
65
0
{
66
0
    m->linearization_warnings = true;
67
0
    warn(qpdf_e_linearization, "", 0, std::string(msg));
68
0
}
69
70
bool
71
QPDF::checkLinearization()
72
0
{
73
0
    bool result = false;
74
0
    try {
75
0
        readLinearizationData();
76
0
        result = checkLinearizationInternal();
77
0
    } catch (std::runtime_error& e) {
78
0
        linearizationWarning(
79
0
            "error encountered while checking linearization data: " + std::string(e.what()));
80
0
    }
81
0
    return result;
82
0
}
83
84
bool
85
QPDF::isLinearized()
86
0
{
87
    // If the first object in the file is a dictionary with a suitable /Linearized key and has an /L
88
    // key that accurately indicates the file size, initialize m->lindict and return true.
89
90
    // A linearized PDF spec's first object will be contained within the first 1024 bytes of the
91
    // file and will be a dictionary with a valid /Linearized key.  This routine looks for that and
92
    // does no additional validation.
93
94
    // The PDF spec says the linearization dictionary must be completely contained within the first
95
    // 1024 bytes of the file. Add a byte for a null terminator.
96
0
    static int const tbuf_size = 1025;
97
98
0
    auto b = std::make_unique<char[]>(tbuf_size);
99
0
    char* buf = b.get();
100
0
    m->file->seek(0, SEEK_SET);
101
0
    memset(buf, '\0', tbuf_size);
102
0
    m->file->read(buf, tbuf_size - 1);
103
104
0
    int lindict_obj = -1;
105
0
    char* p = buf;
106
0
    while (lindict_obj == -1) {
107
        // Find a digit or end of buffer
108
0
        while (((p - buf) < tbuf_size) && (!QUtil::is_digit(*p))) {
109
0
            ++p;
110
0
        }
111
0
        if (p - buf == tbuf_size) {
112
0
            break;
113
0
        }
114
        // Seek to the digit. Then skip over digits for a potential
115
        // next iteration.
116
0
        m->file->seek(p - buf, SEEK_SET);
117
0
        while (((p - buf) < tbuf_size) && QUtil::is_digit(*p)) {
118
0
            ++p;
119
0
        }
120
121
0
        QPDFTokenizer::Token t1 = readToken(m->file);
122
0
        if (t1.isInteger() && readToken(m->file).isInteger() && readToken(m->file).isWord("obj") &&
123
0
            (readToken(m->file).getType() == QPDFTokenizer::tt_dict_open)) {
124
0
            lindict_obj = toI(QUtil::string_to_ll(t1.getValue().c_str()));
125
0
        }
126
0
    }
127
128
0
    if (lindict_obj <= 0) {
129
0
        return false;
130
0
    }
131
132
0
    auto candidate = getObjectByID(lindict_obj, 0);
133
0
    if (!candidate.isDictionary()) {
134
0
        return false;
135
0
    }
136
137
0
    QPDFObjectHandle linkey = candidate.getKey("/Linearized");
138
0
    if (!(linkey.isNumber() && (toI(floor(linkey.getNumericValue())) == 1))) {
139
0
        return false;
140
0
    }
141
142
0
    QPDFObjectHandle L = candidate.getKey("/L");
143
0
    if (L.isInteger()) {
144
0
        qpdf_offset_t Li = L.getIntValue();
145
0
        m->file->seek(0, SEEK_END);
146
0
        if (Li != m->file->tell()) {
147
0
            QTC::TC("qpdf", "QPDF /L mismatch");
148
0
            return false;
149
0
        } else {
150
0
            m->linp.file_size = Li;
151
0
        }
152
0
    }
153
154
0
    m->lindict = candidate;
155
156
0
    return true;
157
0
}
158
159
void
160
QPDF::readLinearizationData()
161
0
{
162
    // This function throws an exception (which is trapped by checkLinearization()) for any errors
163
    // that prevent loading.
164
165
0
    if (!isLinearized()) {
166
0
        throw std::logic_error("called readLinearizationData for file"
167
0
                               " that is not linearized");
168
0
    }
169
170
    // /L is read and stored in linp by isLinearized()
171
0
    QPDFObjectHandle H = m->lindict.getKey("/H");
172
0
    QPDFObjectHandle O = m->lindict.getKey("/O");
173
0
    QPDFObjectHandle E = m->lindict.getKey("/E");
174
0
    QPDFObjectHandle N = m->lindict.getKey("/N");
175
0
    QPDFObjectHandle T = m->lindict.getKey("/T");
176
0
    QPDFObjectHandle P = m->lindict.getKey("/P");
177
178
0
    if (!(H.isArray() && O.isInteger() && E.isInteger() && N.isInteger() && T.isInteger() &&
179
0
          (P.isInteger() || P.isNull()))) {
180
0
        throw damagedPDF(
181
0
            "linearization dictionary",
182
0
            "some keys in linearization dictionary are of the wrong type");
183
0
    }
184
185
    // Hint table array: offset length [ offset length ]
186
0
    size_t n_H_items = toS(H.getArrayNItems());
187
0
    if (!((n_H_items == 2) || (n_H_items == 4))) {
188
0
        throw damagedPDF("linearization dictionary", "H has the wrong number of items");
189
0
    }
190
191
0
    std::vector<int> H_items;
192
0
    for (size_t i = 0; i < n_H_items; ++i) {
193
0
        QPDFObjectHandle oh(H.getArrayItem(toI(i)));
194
0
        if (oh.isInteger()) {
195
0
            H_items.push_back(oh.getIntValueAsInt());
196
0
        } else {
197
0
            throw damagedPDF("linearization dictionary", "some H items are of the wrong type");
198
0
        }
199
0
    }
200
201
    // H: hint table offset/length for primary and overflow hint tables
202
0
    int H0_offset = H_items.at(0);
203
0
    int H0_length = H_items.at(1);
204
0
    int H1_offset = 0;
205
0
    int H1_length = 0;
206
0
    if (H_items.size() == 4) {
207
        // Acrobat doesn't read or write these (as PDF 1.4), so we don't have a way to generate a
208
        // test case.
209
        // QTC::TC("qpdf", "QPDF overflow hint table");
210
0
        H1_offset = H_items.at(2);
211
0
        H1_length = H_items.at(3);
212
0
    }
213
214
    // P: first page number
215
0
    int first_page = 0;
216
0
    if (P.isInteger()) {
217
0
        QTC::TC("qpdf", "QPDF P present in lindict");
218
0
        first_page = P.getIntValueAsInt();
219
0
    } else {
220
0
        QTC::TC("qpdf", "QPDF P absent in lindict");
221
0
    }
222
223
    // Store linearization parameter data
224
225
    // Various places in the code use linp.npages, which is initialized from N, to pre-allocate
226
    // memory, so make sure it's accurate and bail right now if it's not.
227
0
    if (N.getIntValue() != static_cast<long long>(getAllPages().size())) {
228
0
        throw damagedPDF("linearization hint table", "/N does not match number of pages");
229
0
    }
230
231
    // file_size initialized by isLinearized()
232
0
    m->linp.first_page_object = O.getIntValueAsInt();
233
0
    m->linp.first_page_end = E.getIntValue();
234
0
    m->linp.npages = N.getIntValueAsInt();
235
0
    m->linp.xref_zero_offset = T.getIntValue();
236
0
    m->linp.first_page = first_page;
237
0
    m->linp.H_offset = H0_offset;
238
0
    m->linp.H_length = H0_length;
239
240
    // Read hint streams
241
242
0
    Pl_Buffer pb("hint buffer");
243
0
    QPDFObjectHandle H0 = readHintStream(pb, H0_offset, toS(H0_length));
244
0
    if (H1_offset) {
245
0
        (void)readHintStream(pb, H1_offset, toS(H1_length));
246
0
    }
247
248
    // PDF 1.4 hint tables that we ignore:
249
250
    //  /T    thumbnail
251
    //  /A    thread information
252
    //  /E    named destination
253
    //  /V    interactive form
254
    //  /I    information dictionary
255
    //  /C    logical structure
256
    //  /L    page label
257
258
    // Individual hint table offsets
259
0
    QPDFObjectHandle HS = H0.getKey("/S"); // shared object
260
0
    QPDFObjectHandle HO = H0.getKey("/O"); // outline
261
262
0
    auto hbp = pb.getBufferSharedPointer();
263
0
    Buffer* hb = hbp.get();
264
0
    unsigned char const* h_buf = hb->getBuffer();
265
0
    size_t h_size = hb->getSize();
266
267
0
    readHPageOffset(BitStream(h_buf, h_size));
268
269
0
    int HSi = HS.getIntValueAsInt();
270
0
    if ((HSi < 0) || (toS(HSi) >= h_size)) {
271
0
        throw damagedPDF("linearization hint table", "/S (shared object) offset is out of bounds");
272
0
    }
273
0
    readHSharedObject(BitStream(h_buf + HSi, h_size - toS(HSi)));
274
275
0
    if (HO.isInteger()) {
276
0
        int HOi = HO.getIntValueAsInt();
277
0
        if ((HOi < 0) || (toS(HOi) >= h_size)) {
278
0
            throw damagedPDF("linearization hint table", "/O (outline) offset is out of bounds");
279
0
        }
280
0
        readHGeneric(BitStream(h_buf + HOi, h_size - toS(HOi)), m->outline_hints);
281
0
    }
282
0
}
283
284
QPDFObjectHandle
QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
{
    // Read the hint stream object at the given file offset, check that offset + length falls
    // within the recorded end of the object, pipe the stream data to pl, and return the stream's
    // dictionary.
    QPDFObjGen og;
    QPDFObjectHandle H =
        readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);
    ObjCache& stream_cache = m->obj_cache[og];
    qpdf_offset_t end_lower = stream_cache.end_before_space;
    qpdf_offset_t end_upper = stream_cache.end_after_space;
    if (!H.isStream()) {
        throw damagedPDF("linearization dictionary", "hint table is not a stream");
    }

    QPDFObjectHandle Hdict = H.getDict();

    // Some versions of Acrobat make /Length indirect and place it immediately after the stream,
    // increasing length to cover it, even though the specification says all objects in the
    // linearization parameter dictionary must be direct.  We have to get the file position of the
    // end of length in this case.
    QPDFObjectHandle length_obj = Hdict.getKey("/Length");
    if (length_obj.isIndirect()) {
        QTC::TC("qpdf", "QPDF hint table length indirect");
        // Force resolution
        (void)length_obj.getIntValue();
        ObjCache& length_cache = m->obj_cache[length_obj.getObjGen()];
        end_lower = length_cache.end_before_space;
        end_upper = length_cache.end_after_space;
    } else {
        QTC::TC("qpdf", "QPDF hint table length direct");
    }

    qpdf_offset_t const computed_end = offset + toO(length);
    bool const end_in_range = (end_lower <= computed_end) && (computed_end <= end_upper);
    if (!end_in_range) {
        linearizationWarning(
            "expected = " + std::to_string(computed_end) +
            "; actual = " + std::to_string(end_lower) + ".." + std::to_string(end_upper));
        throw damagedPDF("linearization dictionary", "hint table length mismatch");
    }
    H.pipeStreamData(&pl, 0, qpdf_dl_specialized);
    return Hdict;
}
324
325
void
326
QPDF::readHPageOffset(BitStream h)
327
0
{
328
    // All comments referring to the PDF spec refer to the spec for version 1.4.
329
330
0
    HPageOffset& t = m->page_offset_hints;
331
332
0
    t.min_nobjects = h.getBitsInt(32);               // 1
333
0
    t.first_page_offset = h.getBitsInt(32);          // 2
334
0
    t.nbits_delta_nobjects = h.getBitsInt(16);       // 3
335
0
    t.min_page_length = h.getBitsInt(32);            // 4
336
0
    t.nbits_delta_page_length = h.getBitsInt(16);    // 5
337
0
    t.min_content_offset = h.getBitsInt(32);         // 6
338
0
    t.nbits_delta_content_offset = h.getBitsInt(16); // 7
339
0
    t.min_content_length = h.getBitsInt(32);         // 8
340
0
    t.nbits_delta_content_length = h.getBitsInt(16); // 9
341
0
    t.nbits_nshared_objects = h.getBitsInt(16);      // 10
342
0
    t.nbits_shared_identifier = h.getBitsInt(16);    // 11
343
0
    t.nbits_shared_numerator = h.getBitsInt(16);     // 12
344
0
    t.shared_denominator = h.getBitsInt(16);         // 13
345
346
0
    std::vector<HPageOffsetEntry>& entries = t.entries;
347
0
    entries.clear();
348
0
    int nitems = m->linp.npages;
349
0
    load_vector_int(h, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
350
0
    load_vector_int(
351
0
        h, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
352
0
    load_vector_int(
353
0
        h, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
354
0
    load_vector_vector(
355
0
        h,
356
0
        nitems,
357
0
        entries,
358
0
        &HPageOffsetEntry::nshared_objects,
359
0
        t.nbits_shared_identifier,
360
0
        &HPageOffsetEntry::shared_identifiers);
361
0
    load_vector_vector(
362
0
        h,
363
0
        nitems,
364
0
        entries,
365
0
        &HPageOffsetEntry::nshared_objects,
366
0
        t.nbits_shared_numerator,
367
0
        &HPageOffsetEntry::shared_numerators);
368
0
    load_vector_int(
369
0
        h, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
370
0
    load_vector_int(
371
0
        h, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
372
0
}
373
374
void
375
QPDF::readHSharedObject(BitStream h)
376
0
{
377
0
    HSharedObject& t = m->shared_object_hints;
378
379
0
    t.first_shared_obj = h.getBitsInt(32);         // 1
380
0
    t.first_shared_offset = h.getBitsInt(32);      // 2
381
0
    t.nshared_first_page = h.getBitsInt(32);       // 3
382
0
    t.nshared_total = h.getBitsInt(32);            // 4
383
0
    t.nbits_nobjects = h.getBitsInt(16);           // 5
384
0
    t.min_group_length = h.getBitsInt(32);         // 6
385
0
    t.nbits_delta_group_length = h.getBitsInt(16); // 7
386
387
0
    QTC::TC(
388
0
        "qpdf",
389
0
        "QPDF lin nshared_total > nshared_first_page",
390
0
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
391
392
0
    std::vector<HSharedObjectEntry>& entries = t.entries;
393
0
    entries.clear();
394
0
    int nitems = t.nshared_total;
395
0
    load_vector_int(
396
0
        h, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
397
0
    load_vector_int(h, nitems, entries, 1, &HSharedObjectEntry::signature_present);
398
0
    for (size_t i = 0; i < toS(nitems); ++i) {
399
0
        if (entries.at(i).signature_present) {
400
            // Skip 128-bit MD5 hash.  These are not supported by acrobat, so they should probably
401
            // never be there.  We have no test case for this.
402
0
            for (int j = 0; j < 4; ++j) {
403
0
                (void)h.getBits(32);
404
0
            }
405
0
        }
406
0
    }
407
0
    load_vector_int(h, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
408
0
}
409
410
void
411
QPDF::readHGeneric(BitStream h, HGeneric& t)
412
0
{
413
0
    t.first_object = h.getBitsInt(32);        // 1
414
0
    t.first_object_offset = h.getBitsInt(32); // 2
415
0
    t.nobjects = h.getBitsInt(32);            // 3
416
0
    t.group_length = h.getBitsInt(32);        // 4
417
0
}
418
419
bool
420
QPDF::checkLinearizationInternal()
421
0
{
422
    // All comments referring to the PDF spec refer to the spec for version 1.4.
423
424
    // Check all values in linearization parameter dictionary
425
426
0
    LinParameters& p = m->linp;
427
428
    // L: file size in bytes -- checked by isLinearized
429
430
    // O: object number of first page
431
0
    std::vector<QPDFObjectHandle> const& pages = getAllPages();
432
0
    if (p.first_page_object != pages.at(0).getObjectID()) {
433
0
        QTC::TC("qpdf", "QPDF err /O mismatch");
434
0
        linearizationWarning("first page object (/O) mismatch");
435
0
    }
436
437
    // N: number of pages
438
0
    int npages = toI(pages.size());
439
0
    if (p.npages != npages) {
440
        // Not tested in the test suite
441
0
        linearizationWarning("page count (/N) mismatch");
442
0
    }
443
444
0
    for (size_t i = 0; i < toS(npages); ++i) {
445
0
        QPDFObjectHandle const& page = pages.at(i);
446
0
        QPDFObjGen og(page.getObjGen());
447
0
        if (m->xref_table[og].getType() == 2) {
448
0
            linearizationWarning(
449
0
                "page dictionary for page " + std::to_string(i) + " is compressed");
450
0
        }
451
0
    }
452
453
    // T: offset of whitespace character preceding xref entry for object 0
454
0
    m->file->seek(p.xref_zero_offset, SEEK_SET);
455
0
    while (true) {
456
0
        char ch;
457
0
        m->file->read(&ch, 1);
458
0
        if (!((ch == ' ') || (ch == '\r') || (ch == '\n'))) {
459
0
            m->file->seek(-1, SEEK_CUR);
460
0
            break;
461
0
        }
462
0
    }
463
0
    if (m->file->tell() != m->first_xref_item_offset) {
464
0
        QTC::TC("qpdf", "QPDF err /T mismatch");
465
0
        linearizationWarning(
466
0
            "space before first xref item (/T) mismatch "
467
0
            "(computed = " +
468
0
            std::to_string(m->first_xref_item_offset) +
469
0
            "; file = " + std::to_string(m->file->tell()));
470
0
    }
471
472
    // P: first page number -- Implementation note 124 says Acrobat ignores this value, so we will
473
    // too.
474
475
    // Check numbering of compressed objects in each xref section. For linearized files, all
476
    // compressed objects are supposed to be at the end of the containing xref section if any object
477
    // streams are in use.
478
479
0
    if (m->uncompressed_after_compressed) {
480
0
        linearizationWarning("linearized file contains an uncompressed object after a compressed "
481
0
                             "one in a cross-reference stream");
482
0
    }
483
484
    // Further checking requires optimization and order calculation. Don't allow optimization to
485
    // make changes.  If it has to, then the file is not properly linearized.  We use the xref table
486
    // to figure out which objects are compressed and which are uncompressed.
487
0
    { // local scope
488
0
        std::map<int, int> object_stream_data;
489
0
        for (auto const& iter: m->xref_table) {
490
0
            QPDFObjGen const& og = iter.first;
491
0
            QPDFXRefEntry const& entry = iter.second;
492
0
            if (entry.getType() == 2) {
493
0
                object_stream_data[og.getObj()] = entry.getObjStreamNumber();
494
0
            }
495
0
        }
496
0
        optimize(object_stream_data, false);
497
0
        calculateLinearizationData(object_stream_data);
498
0
    }
499
500
    // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra
501
    // object here by mistake.  pdlin fails to place thumbnail images in section 9, so when
502
    // thumbnails are present, it also gets the wrong value for /E.  It also doesn't count outlines
503
    // here when it should even though it places them in part 6.  This code fails to put thread
504
    // information dictionaries in part 9, so it actually gets the wrong value for E when threads
505
    // are present.  In that case, it would probably agree with pdlin.  As of this writing, the test
506
    // suite doesn't contain any files with threads.
507
508
0
    if (m->part6.empty()) {
509
0
        stopOnError("linearization part 6 unexpectedly empty");
510
0
    }
511
0
    qpdf_offset_t min_E = -1;
512
0
    qpdf_offset_t max_E = -1;
513
0
    for (auto const& oh: m->part6) {
514
0
        QPDFObjGen og(oh.getObjGen());
515
0
        if (m->obj_cache.count(og) == 0) {
516
            // All objects have to have been dereferenced to be classified.
517
0
            throw std::logic_error("linearization part6 object not in cache");
518
0
        }
519
0
        ObjCache const& oc = m->obj_cache[og];
520
0
        min_E = std::max(min_E, oc.end_before_space);
521
0
        max_E = std::max(max_E, oc.end_after_space);
522
0
    }
523
0
    if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) {
524
0
        QTC::TC("qpdf", "QPDF warn /E mismatch");
525
0
        linearizationWarning(
526
0
            "end of first page section (/E) mismatch: /E = " + std::to_string(p.first_page_end) +
527
0
            "; computed = " + std::to_string(min_E) + ".." + std::to_string(max_E));
528
0
    }
529
530
    // Check hint tables
531
532
0
    std::map<int, int> shared_idx_to_obj;
533
0
    checkHSharedObject(pages, shared_idx_to_obj);
534
0
    checkHPageOffset(pages, shared_idx_to_obj);
535
0
    checkHOutlines();
536
537
0
    return !m->linearization_warnings;
538
0
}
539
540
qpdf_offset_t
QPDF::maxEnd(ObjUser const& ou)
{
    // Return the largest end_after_space offset among the cached objects that the object user
    // table lists for ou (0 if it lists none).
    if (m->obj_user_to_objects.count(ou) == 0) {
        stopOnError("no entry in object user table for requested object user");
    }
    qpdf_offset_t furthest = 0;
    for (auto const& og: m->obj_user_to_objects[ou]) {
        if (m->obj_cache.count(og) == 0) {
            stopOnError("unknown object referenced in object user table");
        }
        qpdf_offset_t const candidate = m->obj_cache[og].end_after_space;
        if (furthest < candidate) {
            furthest = candidate;
        }
    }
    return furthest;
}
555
556
qpdf_offset_t
QPDF::getLinearizationOffset(QPDFObjGen const& og)
{
    // Return the file offset to use for og in linearization checks, based on its xref entry.
    QPDFXRefEntry entry = m->xref_table[og];
    int const entry_type = entry.getType();
    if (entry_type == 1) {
        return entry.getOffset();
    }
    if (entry_type == 2) {
        // For compressed objects, return the offset of the object stream that contains them.
        return getLinearizationOffset(QPDFObjGen(entry.getObjStreamNumber(), 0));
    }
    stopOnError("getLinearizationOffset called for xref entry not of type 1 or 2");
    return 0;
}
577
578
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
{
    // If obj's object ID has an entry in object_stream_data, return the object whose number is
    // the mapped value (generation 0); otherwise, or when obj is null, return obj itself.
    if (obj.isNull()) {
        return obj;
    }
    auto const found = object_stream_data.find(obj.getObjectID());
    if (found == object_stream_data.end()) {
        return obj;
    }
    return getObject(found->second, 0);
}
588
589
QPDFObjectHandle
QPDF::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj)
{
    // If the table records a positive object stream number for oh and oh is not null, return that
    // object (generation 0); in every other case return oh itself.
    if (!obj.contains(oh)) {
        return oh;
    }
    auto const stream_id = obj[oh].object_stream;
    if (!(stream_id > 0)) {
        return oh;
    }
    if (oh.isNull()) {
        return oh;
    }
    return getObject(stream_id, 0);
}
599
600
int
601
QPDF::lengthNextN(int first_object, int n)
602
0
{
603
0
    int length = 0;
604
0
    for (int i = 0; i < n; ++i) {
605
0
        QPDFObjGen og(first_object + i, 0);
606
0
        if (m->xref_table.count(og) == 0) {
607
0
            linearizationWarning(
608
0
                "no xref table entry for " + std::to_string(first_object + i) + " 0");
609
0
        } else {
610
0
            if (m->obj_cache.count(og) == 0) {
611
0
                stopOnError("found unknown object while calculating length for linearization data");
612
0
            }
613
0
            length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
614
0
        }
615
0
    }
616
0
    return length;
617
0
}
618
619
void
620
QPDF::checkHPageOffset(
621
    std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& shared_idx_to_obj)
622
0
{
623
    // Implementation note 126 says Acrobat always sets delta_content_offset and
624
    // delta_content_length in the page offset header dictionary to 0.  It also states that
625
    // min_content_offset in the per-page information is always 0, which is an incorrect value.
626
627
    // Implementation note 127 explains that Acrobat always sets item 8 (min_content_length) to
628
    // zero, item 9 (nbits_delta_content_length) to the value of item 5 (nbits_delta_page_length),
629
    // and item 7 of each per-page hint table (delta_content_length) to item 2 (delta_page_length)
630
    // of that entry.  Acrobat ignores these values when reading files.
631
632
    // Empirically, it also seems that Acrobat sometimes puts items under a page's /Resources
633
    // dictionary in with shared objects even when they are private.
634
635
0
    int npages = toI(pages.size());
636
0
    qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
637
0
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
638
0
    if (m->xref_table.count(first_page_og) == 0) {
639
0
        stopOnError("supposed first page object is not known");
640
0
    }
641
0
    qpdf_offset_t offset = getLinearizationOffset(first_page_og);
642
0
    if (table_offset != offset) {
643
0
        linearizationWarning("first page object offset mismatch");
644
0
    }
645
646
0
    for (int pageno = 0; pageno < npages; ++pageno) {
647
0
        QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
648
0
        int first_object = page_og.getObj();
649
0
        if (m->xref_table.count(page_og) == 0) {
650
0
            stopOnError("unknown object in page offset hint table");
651
0
        }
652
0
        offset = getLinearizationOffset(page_og);
653
654
0
        HPageOffsetEntry& he = m->page_offset_hints.entries.at(toS(pageno));
655
0
        CHPageOffsetEntry& ce = m->c_page_offset_data.entries.at(toS(pageno));
656
0
        int h_nobjects = he.delta_nobjects + m->page_offset_hints.min_nobjects;
657
0
        if (h_nobjects != ce.nobjects) {
658
            // This happens with pdlin when there are thumbnails.
659
0
            linearizationWarning(
660
0
                "object count mismatch for page " + std::to_string(pageno) + ": hint table = " +
661
0
                std::to_string(h_nobjects) + "; computed = " + std::to_string(ce.nobjects));
662
0
        }
663
664
        // Use value for number of objects in hint table rather than computed value if there is a
665
        // discrepancy.
666
0
        int length = lengthNextN(first_object, h_nobjects);
667
0
        int h_length = toI(he.delta_page_length + m->page_offset_hints.min_page_length);
668
0
        if (length != h_length) {
669
            // This condition almost certainly indicates a bad hint table or a bug in this code.
670
0
            linearizationWarning(
671
0
                "page length mismatch for page " + std::to_string(pageno) + ": hint table = " +
672
0
                std::to_string(h_length) + "; computed length = " + std::to_string(length) +
673
0
                " (offset = " + std::to_string(offset) + ")");
674
0
        }
675
676
0
        offset += h_length;
677
678
        // Translate shared object indexes to object numbers.
679
0
        std::set<int> hint_shared;
680
0
        std::set<int> computed_shared;
681
682
0
        if ((pageno == 0) && (he.nshared_objects > 0)) {
683
            // pdlin and Acrobat both do this even though the spec states clearly and unambiguously
684
            // that they should not.
685
0
            linearizationWarning("page 0 has shared identifier entries");
686
0
        }
687
688
0
        for (size_t i = 0; i < toS(he.nshared_objects); ++i) {
689
0
            int idx = he.shared_identifiers.at(i);
690
0
            if (shared_idx_to_obj.count(idx) == 0) {
691
0
                stopOnError("unable to get object for item in"
692
0
                            " shared objects hint table");
693
0
            }
694
0
            hint_shared.insert(shared_idx_to_obj[idx]);
695
0
        }
696
697
0
        for (size_t i = 0; i < toS(ce.nshared_objects); ++i) {
698
0
            int idx = ce.shared_identifiers.at(i);
699
0
            if (idx >= m->c_shared_object_data.nshared_total) {
700
0
                stopOnError("index out of bounds for shared object hint table");
701
0
            }
702
0
            int obj = m->c_shared_object_data.entries.at(toS(idx)).object;
703
0
            computed_shared.insert(obj);
704
0
        }
705
706
0
        for (int iter: hint_shared) {
707
0
            if (!computed_shared.count(iter)) {
708
                // pdlin puts thumbnails here even though it shouldn't
709
0
                linearizationWarning(
710
0
                    "page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
711
0
                    ": in hint table but not computed list");
712
0
            }
713
0
        }
714
715
0
        for (int iter: computed_shared) {
716
0
            if (!hint_shared.count(iter)) {
717
                // Acrobat does not put some things including at least built-in fonts and procsets
718
                // here, at least in some cases.
719
0
                linearizationWarning(
720
0
                    ("page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
721
0
                     ": in computed list but not hint table"));
722
0
            }
723
0
        }
724
0
    }
725
0
}
726
727
void
728
QPDF::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj)
729
0
{
730
    // Implementation note 125 says shared object groups always contain only one object.
731
    // Implementation note 128 says that Acrobat always nbits_nobjects to zero.  Implementation note
732
    // 130 says that Acrobat does not support more than one shared object per group.  These are all
733
    // consistent.
734
735
    // Implementation note 129 states that MD5 signatures are not implemented in Acrobat, so
736
    // signature_present must always be zero.
737
738
    // Implementation note 131 states that first_shared_obj and first_shared_offset have meaningless
739
    // values for single-page files.
740
741
    // Empirically, Acrobat and pdlin generate incorrect values for these whenever there are no
742
    // shared objects not referenced by the first page (i.e., nshared_total == nshared_first_page).
743
744
0
    HSharedObject& so = m->shared_object_hints;
745
0
    if (so.nshared_total < so.nshared_first_page) {
746
0
        linearizationWarning("shared object hint table: ntotal < nfirst_page");
747
0
    } else {
748
        // The first nshared_first_page objects are consecutive objects starting with the first page
749
        // object.  The rest are consecutive starting from the first_shared_obj object.
750
0
        int cur_object = pages.at(0).getObjectID();
751
0
        for (int i = 0; i < so.nshared_total; ++i) {
752
0
            if (i == so.nshared_first_page) {
753
0
                QTC::TC("qpdf", "QPDF lin check shared past first page");
754
0
                if (m->part8.empty()) {
755
0
                    linearizationWarning("part 8 is empty but nshared_total > "
756
0
                                         "nshared_first_page");
757
0
                } else {
758
0
                    int obj = m->part8.at(0).getObjectID();
759
0
                    if (obj != so.first_shared_obj) {
760
0
                        linearizationWarning(
761
0
                            "first shared object number mismatch: "
762
0
                            "hint table = " +
763
0
                            std::to_string(so.first_shared_obj) +
764
0
                            "; computed = " + std::to_string(obj));
765
0
                    }
766
0
                }
767
768
0
                cur_object = so.first_shared_obj;
769
770
0
                QPDFObjGen og(cur_object, 0);
771
0
                if (m->xref_table.count(og) == 0) {
772
0
                    stopOnError("unknown object in shared object hint table");
773
0
                }
774
0
                qpdf_offset_t offset = getLinearizationOffset(og);
775
0
                qpdf_offset_t h_offset = adjusted_offset(so.first_shared_offset);
776
0
                if (offset != h_offset) {
777
0
                    linearizationWarning(
778
0
                        "first shared object offset mismatch: hint table = " +
779
0
                        std::to_string(h_offset) + "; computed = " + std::to_string(offset));
780
0
                }
781
0
            }
782
783
0
            idx_to_obj[i] = cur_object;
784
0
            HSharedObjectEntry& se = so.entries.at(toS(i));
785
0
            int nobjects = se.nobjects_minus_one + 1;
786
0
            int length = lengthNextN(cur_object, nobjects);
787
0
            int h_length = so.min_group_length + se.delta_group_length;
788
0
            if (length != h_length) {
789
0
                linearizationWarning(
790
0
                    "shared object " + std::to_string(i) + " length mismatch: hint table = " +
791
0
                    std::to_string(h_length) + "; computed = " + std::to_string(length));
792
0
            }
793
0
            cur_object += nobjects;
794
0
        }
795
0
    }
796
0
}
797
798
void
799
QPDF::checkHOutlines()
800
0
{
801
    // Empirically, Acrobat generates the correct value for the object number but incorrectly stores
802
    // the next object number's offset as the offset, at least when outlines appear in part 6.  It
803
    // also generates an incorrect value for length (specifically, the length that would cover the
804
    // correct number of objects from the wrong starting place).  pdlin appears to generate correct
805
    // values in those cases.
806
807
0
    if (m->c_outline_data.nobjects == m->outline_hints.nobjects) {
808
0
        if (m->c_outline_data.nobjects == 0) {
809
0
            return;
810
0
        }
811
812
0
        if (m->c_outline_data.first_object == m->outline_hints.first_object) {
813
            // Check length and offset.  Acrobat gets these wrong.
814
0
            QPDFObjectHandle outlines = getRoot().getKey("/Outlines");
815
0
            if (!outlines.isIndirect()) {
816
                // This case is not exercised in test suite since not permitted by the spec, but if
817
                // this does occur, the code below would fail.
818
0
                linearizationWarning("/Outlines key of root dictionary is not indirect");
819
0
                return;
820
0
            }
821
0
            QPDFObjGen og(outlines.getObjGen());
822
0
            if (m->xref_table.count(og) == 0) {
823
0
                stopOnError("unknown object in outlines hint table");
824
0
            }
825
0
            qpdf_offset_t offset = getLinearizationOffset(og);
826
0
            ObjUser ou(ObjUser::ou_root_key, "/Outlines");
827
0
            int length = toI(maxEnd(ou) - offset);
828
0
            qpdf_offset_t table_offset = adjusted_offset(m->outline_hints.first_object_offset);
829
0
            if (offset != table_offset) {
830
0
                linearizationWarning(
831
0
                    "incorrect offset in outlines table: hint table = " +
832
0
                    std::to_string(table_offset) + "; computed = " + std::to_string(offset));
833
0
            }
834
0
            int table_length = m->outline_hints.group_length;
835
0
            if (length != table_length) {
836
0
                linearizationWarning(
837
0
                    "incorrect length in outlines table: hint table = " +
838
0
                    std::to_string(table_length) + "; computed = " + std::to_string(length));
839
0
            }
840
0
        } else {
841
0
            linearizationWarning("incorrect first object number in outline "
842
0
                                 "hints table.");
843
0
        }
844
0
    } else {
845
0
        linearizationWarning("incorrect object count in outline hint table");
846
0
    }
847
0
}
848
849
void
850
QPDF::showLinearizationData()
851
0
{
852
0
    try {
853
0
        readLinearizationData();
854
0
        checkLinearizationInternal();
855
0
        dumpLinearizationDataInternal();
856
0
    } catch (QPDFExc& e) {
857
0
        linearizationWarning(e.what());
858
0
    }
859
0
}
860
861
void
862
QPDF::dumpLinearizationDataInternal()
863
0
{
864
0
    *m->log->getInfo() << m->file->getName() << ": linearization data:\n\n";
865
866
0
    *m->log->getInfo() << "file_size: " << m->linp.file_size << "\n"
867
0
                       << "first_page_object: " << m->linp.first_page_object << "\n"
868
0
                       << "first_page_end: " << m->linp.first_page_end << "\n"
869
0
                       << "npages: " << m->linp.npages << "\n"
870
0
                       << "xref_zero_offset: " << m->linp.xref_zero_offset << "\n"
871
0
                       << "first_page: " << m->linp.first_page << "\n"
872
0
                       << "H_offset: " << m->linp.H_offset << "\n"
873
0
                       << "H_length: " << m->linp.H_length << "\n"
874
0
                       << "\n";
875
876
0
    *m->log->getInfo() << "Page Offsets Hint Table\n\n";
877
0
    dumpHPageOffset();
878
0
    *m->log->getInfo() << "\nShared Objects Hint Table\n\n";
879
0
    dumpHSharedObject();
880
881
0
    if (m->outline_hints.nobjects > 0) {
882
0
        *m->log->getInfo() << "\nOutlines Hint Table\n\n";
883
0
        dumpHGeneric(m->outline_hints);
884
0
    }
885
0
}
886
887
qpdf_offset_t
QPDF::adjusted_offset(qpdf_offset_t offset)
{
    // Hint table location values disregard the hint table itself, so every offset at or beyond
    // H_offset has to be increased by H_length.  Offsets before H_offset are returned unchanged.
    if (offset < m->linp.H_offset) {
        return offset;
    }
    return offset + m->linp.H_length;
}
897
898
void
899
QPDF::dumpHPageOffset()
900
0
{
901
0
    HPageOffset& t = m->page_offset_hints;
902
0
    *m->log->getInfo() << "min_nobjects: " << t.min_nobjects << "\n"
903
0
                       << "first_page_offset: " << adjusted_offset(t.first_page_offset) << "\n"
904
0
                       << "nbits_delta_nobjects: " << t.nbits_delta_nobjects << "\n"
905
0
                       << "min_page_length: " << t.min_page_length << "\n"
906
0
                       << "nbits_delta_page_length: " << t.nbits_delta_page_length << "\n"
907
0
                       << "min_content_offset: " << t.min_content_offset << "\n"
908
0
                       << "nbits_delta_content_offset: " << t.nbits_delta_content_offset << "\n"
909
0
                       << "min_content_length: " << t.min_content_length << "\n"
910
0
                       << "nbits_delta_content_length: " << t.nbits_delta_content_length << "\n"
911
0
                       << "nbits_nshared_objects: " << t.nbits_nshared_objects << "\n"
912
0
                       << "nbits_shared_identifier: " << t.nbits_shared_identifier << "\n"
913
0
                       << "nbits_shared_numerator: " << t.nbits_shared_numerator << "\n"
914
0
                       << "shared_denominator: " << t.shared_denominator << "\n";
915
916
0
    for (size_t i1 = 0; i1 < toS(m->linp.npages); ++i1) {
917
0
        HPageOffsetEntry& pe = t.entries.at(i1);
918
0
        *m->log->getInfo() << "Page " << i1 << ":\n"
919
0
                           << "  nobjects: " << pe.delta_nobjects + t.min_nobjects << "\n"
920
0
                           << "  length: " << pe.delta_page_length + t.min_page_length
921
0
                           << "\n"
922
                           // content offset is relative to page, not file
923
0
                           << "  content_offset: " << pe.delta_content_offset + t.min_content_offset
924
0
                           << "\n"
925
0
                           << "  content_length: " << pe.delta_content_length + t.min_content_length
926
0
                           << "\n"
927
0
                           << "  nshared_objects: " << pe.nshared_objects << "\n";
928
0
        for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2) {
929
0
            *m->log->getInfo() << "    identifier " << i2 << ": " << pe.shared_identifiers.at(i2)
930
0
                               << "\n";
931
0
            *m->log->getInfo() << "    numerator " << i2 << ": " << pe.shared_numerators.at(i2)
932
0
                               << "\n";
933
0
        }
934
0
    }
935
0
}
936
937
void
938
QPDF::dumpHSharedObject()
939
0
{
940
0
    HSharedObject& t = m->shared_object_hints;
941
0
    *m->log->getInfo() << "first_shared_obj: " << t.first_shared_obj << "\n"
942
0
                       << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) << "\n"
943
0
                       << "nshared_first_page: " << t.nshared_first_page << "\n"
944
0
                       << "nshared_total: " << t.nshared_total << "\n"
945
0
                       << "nbits_nobjects: " << t.nbits_nobjects << "\n"
946
0
                       << "min_group_length: " << t.min_group_length << "\n"
947
0
                       << "nbits_delta_group_length: " << t.nbits_delta_group_length << "\n";
948
949
0
    for (size_t i = 0; i < toS(t.nshared_total); ++i) {
950
0
        HSharedObjectEntry& se = t.entries.at(i);
951
0
        *m->log->getInfo() << "Shared Object " << i << ":\n"
952
0
                           << "  group length: " << se.delta_group_length + t.min_group_length
953
0
                           << "\n";
954
        // PDF spec says signature present nobjects_minus_one are always 0, so print them only if
955
        // they have a non-zero value.
956
0
        if (se.signature_present) {
957
0
            *m->log->getInfo() << "  signature present\n";
958
0
        }
959
0
        if (se.nobjects_minus_one != 0) {
960
0
            *m->log->getInfo() << "  nobjects: " << se.nobjects_minus_one + 1 << "\n";
961
0
        }
962
0
    }
963
0
}
964
965
void
966
QPDF::dumpHGeneric(HGeneric& t)
967
0
{
968
0
    *m->log->getInfo() << "first_object: " << t.first_object << "\n"
969
0
                       << "first_object_offset: " << adjusted_offset(t.first_object_offset) << "\n"
970
0
                       << "nobjects: " << t.nobjects << "\n"
971
0
                       << "group_length: " << t.group_length << "\n";
972
0
}
973
974
template <typename T>
975
void
976
QPDF::calculateLinearizationData(T const& object_stream_data)
977
3.62k
{
978
    // This function calculates the ordering of objects, divides them into the appropriate parts,
979
    // and computes some values for the linearization parameter dictionary and hint tables.  The
980
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
981
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
982
983
3.62k
    if (m->object_to_obj_users.empty()) {
984
        // Note that we can't call optimize here because we don't know whether it should be called
985
        // with or without allow changes.
986
0
        throw std::logic_error(
987
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()");
988
0
    }
989
990
    // Separate objects into the categories sufficient for us to determine which part of the
991
    // linearized file should contain the object.  This categorization is useful for other purposes
992
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
993
994
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
995
    // trailer dictionary in part 11).
996
997
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
998
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
999
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
1000
    // for now.
1001
1002
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
1003
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
1004
    // /Outlines, then information from /Outlines also appears here.
1005
1006
    // Part 7 contains remaining objects private to pages other than the first page.
1007
1008
    // Part 8 contains all remaining shared objects except those that are shared only within
1009
    // thumbnails.
1010
1011
    // Part 9 contains all remaining objects.
1012
1013
    // We sort objects into the following categories:
1014
1015
    //   * open_document: part 4
1016
1017
    //   * first_page_private: part 6
1018
1019
    //   * first_page_shared: part 6
1020
1021
    //   * other_page_private: part 7
1022
1023
    //   * other_page_shared: part 8
1024
1025
    //   * thumbnail_private: part 9
1026
1027
    //   * thumbnail_shared: part 9
1028
1029
    //   * other: part 9
1030
1031
    //   * outlines: part 6 or 9
1032
1033
3.62k
    m->part4.clear();
1034
3.62k
    m->part6.clear();
1035
3.62k
    m->part7.clear();
1036
3.62k
    m->part8.clear();
1037
3.62k
    m->part9.clear();
1038
3.62k
    m->c_linp = LinParameters();
1039
3.62k
    m->c_page_offset_data = CHPageOffset();
1040
3.62k
    m->c_shared_object_data = CHSharedObject();
1041
3.62k
    m->c_outline_data = HGeneric();
1042
1043
3.62k
    QPDFObjectHandle root = getRoot();
1044
3.62k
    bool outlines_in_first_page = false;
1045
3.62k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
1046
3.62k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
1047
3.62k
    if (pagemode.isName()) {
1048
682
        if (pagemode.getName() == "/UseOutlines") {
1049
518
            if (root.hasKey("/Outlines")) {
1050
375
                outlines_in_first_page = true;
1051
375
            } else {
1052
143
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1053
143
            }
1054
518
        }
1055
682
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1056
682
    }
1057
1058
3.62k
    std::set<std::string> open_document_keys;
1059
3.62k
    open_document_keys.insert("/ViewerPreferences");
1060
3.62k
    open_document_keys.insert("/PageMode");
1061
3.62k
    open_document_keys.insert("/Threads");
1062
3.62k
    open_document_keys.insert("/OpenAction");
1063
3.62k
    open_document_keys.insert("/AcroForm");
1064
1065
3.62k
    std::set<QPDFObjGen> lc_open_document;
1066
3.62k
    std::set<QPDFObjGen> lc_first_page_private;
1067
3.62k
    std::set<QPDFObjGen> lc_first_page_shared;
1068
3.62k
    std::set<QPDFObjGen> lc_other_page_private;
1069
3.62k
    std::set<QPDFObjGen> lc_other_page_shared;
1070
3.62k
    std::set<QPDFObjGen> lc_thumbnail_private;
1071
3.62k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1072
3.62k
    std::set<QPDFObjGen> lc_other;
1073
3.62k
    std::set<QPDFObjGen> lc_outlines;
1074
3.62k
    std::set<QPDFObjGen> lc_root;
1075
1076
91.5k
    for (auto& oiter: m->object_to_obj_users) {
1077
91.5k
        QPDFObjGen const& og = oiter.first;
1078
91.5k
        std::set<ObjUser>& ous = oiter.second;
1079
1080
91.5k
        bool in_open_document = false;
1081
91.5k
        bool in_first_page = false;
1082
91.5k
        int other_pages = 0;
1083
91.5k
        int thumbs = 0;
1084
91.5k
        int others = 0;
1085
91.5k
        bool in_outlines = false;
1086
91.5k
        bool is_root = false;
1087
1088
374k
        for (auto const& ou: ous) {
1089
374k
            switch (ou.ou_type) {
1090
180k
            case ObjUser::ou_trailer_key:
1091
180k
                if (ou.key == "/Encrypt") {
1092
170
                    in_open_document = true;
1093
180k
                } else {
1094
180k
                    ++others;
1095
180k
                }
1096
180k
                break;
1097
1098
9.30k
            case ObjUser::ou_thumb:
1099
9.30k
                ++thumbs;
1100
9.30k
                break;
1101
1102
46.4k
            case ObjUser::ou_root_key:
1103
46.4k
                if (open_document_keys.count(ou.key) > 0) {
1104
15.2k
                    in_open_document = true;
1105
31.1k
                } else if (ou.key == "/Outlines") {
1106
4.54k
                    in_outlines = true;
1107
26.6k
                } else {
1108
26.6k
                    ++others;
1109
26.6k
                }
1110
46.4k
                break;
1111
1112
134k
            case ObjUser::ou_page:
1113
134k
                if (ou.pageno == 0) {
1114
41.0k
                    in_first_page = true;
1115
93.2k
                } else {
1116
93.2k
                    ++other_pages;
1117
93.2k
                }
1118
134k
                break;
1119
1120
3.62k
            case ObjUser::ou_root:
1121
3.62k
                is_root = true;
1122
3.62k
                break;
1123
1124
0
            case ObjUser::ou_bad:
1125
0
                stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: "
1126
0
                            "invalid user type");
1127
0
                break;
1128
374k
            }
1129
374k
        }
1130
1131
91.5k
        if (is_root) {
1132
3.62k
            lc_root.insert(og);
1133
87.9k
        } else if (in_outlines) {
1134
4.51k
            lc_outlines.insert(og);
1135
83.4k
        } else if (in_open_document) {
1136
15.3k
            lc_open_document.insert(og);
1137
68.0k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1138
15.2k
            lc_first_page_private.insert(og);
1139
52.8k
        } else if (in_first_page) {
1140
9.90k
            lc_first_page_shared.insert(og);
1141
42.9k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1142
23.6k
            lc_other_page_private.insert(og);
1143
23.6k
        } else if (other_pages > 1) {
1144
5.83k
            lc_other_page_shared.insert(og);
1145
13.5k
        } else if ((thumbs == 1) && (others == 0)) {
1146
2.06k
            lc_thumbnail_private.insert(og);
1147
11.4k
        } else if (thumbs > 1) {
1148
830
            lc_thumbnail_shared.insert(og);
1149
10.6k
        } else {
1150
10.6k
            lc_other.insert(og);
1151
10.6k
        }
1152
91.5k
    }
1153
1154
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1155
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1156
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1157
    // and the above section can be considered only with categorization.  Note that sets of
1158
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1159
    // the possible exception of hints tables which we won't see here anyway.  That means that
1160
    // running calculateLinearizationData() on a linearized file should give results identical to
1161
    // the original file ordering.
1162
1163
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1164
    // code optimization if necessary. Premature optimization is the root of all evil.
1165
3.62k
    std::vector<QPDFObjectHandle> pages;
1166
3.62k
    { // local scope
1167
        // Map all page objects to the containing object stream.  This should be a no-op in a
1168
        // properly linearized file.
1169
15.2k
        for (auto oh: getAllPages()) {
1170
15.2k
            pages.push_back(getUncompressedObject(oh, object_stream_data));
1171
15.2k
        }
1172
3.62k
    }
1173
3.62k
    int npages = toI(pages.size());
1174
1175
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1176
    // initialize any items that deal with object numbers or counts but not any items that deal with
1177
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1178
    // during the first pass.  The validation code can compute them relatively easily given the rest
1179
    // of the information.
1180
1181
    // npages is the size of the existing pages vector, which has been created by traversing the
1182
    // pages tree, and as such is a reasonable size.
1183
3.62k
    m->c_linp.npages = npages;
1184
3.62k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(toS(npages));
1185
1186
    // Part 4: open document objects.  We don't care about the order.
1187
1188
3.62k
    if (lc_root.size() != 1) {
1189
0
        stopOnError("found other than one root while"
1190
0
                    " calculating linearization data");
1191
0
    }
1192
3.62k
    m->part4.push_back(getObject(*(lc_root.begin())));
1193
15.3k
    for (auto const& og: lc_open_document) {
1194
15.3k
        m->part4.push_back(getObject(og));
1195
15.3k
    }
1196
1197
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1198
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1199
    // any option to set this and also disregards /OpenAction.  We will do the same.
1200
1201
    // First, place the actual first page object itself.
1202
3.62k
    if (pages.empty()) {
1203
20
        stopOnError("no pages found while calculating linearization data");
1204
20
    }
1205
3.62k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1206
3.62k
    if (!lc_first_page_private.count(first_page_og)) {
1207
221
        stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1208
221
                    "object not in lc_first_page_private");
1209
221
    }
1210
3.62k
    lc_first_page_private.erase(first_page_og);
1211
3.62k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1212
3.62k
    m->part6.push_back(pages.at(0));
1213
1214
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1215
    // it except to the extent that it groups private and shared objects contiguously for the sake
1216
    // of hint tables.
1217
1218
11.8k
    for (auto const& og: lc_first_page_private) {
1219
11.8k
        m->part6.push_back(getObject(og));
1220
11.8k
    }
1221
1222
9.15k
    for (auto const& og: lc_first_page_shared) {
1223
9.15k
        m->part6.push_back(getObject(og));
1224
9.15k
    }
1225
1226
    // Place the outline dictionary if it goes in the first page section.
1227
3.62k
    if (outlines_in_first_page) {
1228
370
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1229
370
    }
1230
1231
    // Fill in page offset hint table information for the first page. The PDF spec says that
1232
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1233
    // it fills in garbage values for all the shared object identifiers on the first page.
1234
1235
3.62k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1236
1237
    // Part 7: other pages' private objects
1238
1239
    // For each page in order:
1240
14.3k
    for (size_t i = 1; i < toS(npages); ++i) {
1241
        // Place this page's page object
1242
1243
10.7k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1244
10.7k
        if (!lc_other_page_private.count(page_og)) {
1245
111
            stopOnError(
1246
111
                "INTERNAL ERROR: "
1247
111
                "QPDF::calculateLinearizationData: page object for page " +
1248
111
                std::to_string(i) + " not in lc_other_page_private");
1249
111
        }
1250
10.7k
        lc_other_page_private.erase(page_og);
1251
10.7k
        m->part7.push_back(pages.at(i));
1252
1253
        // Place all non-shared objects referenced by this page, updating the page object count for
1254
        // the hint table.
1255
1256
10.7k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1257
1258
10.7k
        ObjUser ou(ObjUser::ou_page, toI(i));
1259
10.7k
        if (m->obj_user_to_objects.count(ou) == 0) {
1260
0
            stopOnError("found unreferenced page while"
1261
0
                        " calculating linearization data");
1262
0
        }
1263
88.1k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1264
88.1k
            if (lc_other_page_private.count(og)) {
1265
11.8k
                lc_other_page_private.erase(og);
1266
11.8k
                m->part7.push_back(getObject(og));
1267
11.8k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1268
11.8k
            }
1269
88.1k
        }
1270
10.7k
    }
1271
    // That should have covered all part7 objects.
1272
3.62k
    if (!lc_other_page_private.empty()) {
1273
0
        stopOnError("INTERNAL ERROR:"
1274
0
                    " QPDF::calculateLinearizationData: lc_other_page_private is "
1275
0
                    "not empty after generation of part7");
1276
0
    }
1277
1278
    // Part 8: other pages' shared objects
1279
1280
    // Order is unimportant.
1281
5.35k
    for (auto const& og: lc_other_page_shared) {
1282
5.35k
        m->part8.push_back(getObject(og));
1283
5.35k
    }
1284
1285
    // Part 9: other objects
1286
1287
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1288
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1289
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1290
    // we throw all remaining objects in arbitrary order.
1291
1292
    // Place the pages tree.
1293
3.62k
    std::set<QPDFObjGen> pages_ogs =
1294
3.62k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1295
3.62k
    if (pages_ogs.empty()) {
1296
12
        stopOnError("found empty pages tree while"
1297
12
                    " calculating linearization data");
1298
12
    }
1299
5.45k
    for (auto const& og: pages_ogs) {
1300
5.45k
        if (lc_other.count(og)) {
1301
2.51k
            lc_other.erase(og);
1302
2.51k
            m->part9.push_back(getObject(og));
1303
2.51k
        }
1304
5.45k
    }
1305
1306
    // Place private thumbnail images in page order.  Slightly more information would be required if
1307
    // we were going to bother with thumbnail hint tables.
1308
17.3k
    for (size_t i = 0; i < toS(npages); ++i) {
1309
13.7k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1310
13.7k
        thumb = getUncompressedObject(thumb, object_stream_data);
1311
13.7k
        if (!thumb.isNull()) {
1312
            // Output the thumbnail itself
1313
1.66k
            QPDFObjGen thumb_og(thumb.getObjGen());
1314
1.66k
            if (lc_thumbnail_private.count(thumb_og)) {
1315
1.47k
                lc_thumbnail_private.erase(thumb_og);
1316
1.47k
                m->part9.push_back(thumb);
1317
1.47k
            } else {
1318
                // No internal error this time...there's nothing to stop this object from having
1319
                // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1320
                // there's nothing to prevent it from having been in some set other than
1321
                // lc_thumbnail_private.
1322
190
            }
1323
1.66k
            std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, toI(i))];
1324
8.62k
            for (auto const& og: ogs) {
1325
8.62k
                if (lc_thumbnail_private.count(og)) {
1326
437
                    lc_thumbnail_private.erase(og);
1327
437
                    m->part9.push_back(getObject(og));
1328
437
                }
1329
8.62k
            }
1330
1.66k
        }
1331
13.7k
    }
1332
3.62k
    if (!lc_thumbnail_private.empty()) {
1333
7
        stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1334
7
                    "empty after placing thumbnails");
1335
7
    }
1336
1337
    // Place shared thumbnail objects
1338
3.62k
    for (auto const& og: lc_thumbnail_shared) {
1339
794
        m->part9.push_back(getObject(og));
1340
794
    }
1341
1342
    // Place outlines unless in first page
1343
3.62k
    if (!outlines_in_first_page) {
1344
2.89k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1345
2.89k
    }
1346
1347
    // Place all remaining objects
1348
7.22k
    for (auto const& og: lc_other) {
1349
7.22k
        m->part9.push_back(getObject(og));
1350
7.22k
    }
1351
1352
    // Make sure we got everything exactly once.
1353
1354
3.62k
    size_t num_placed =
1355
3.62k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1356
3.62k
    size_t num_wanted = m->object_to_obj_users.size();
1357
3.62k
    if (num_placed != num_wanted) {
1358
47
        stopOnError(
1359
47
            "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1360
47
            "number of objects placed (num_placed = " +
1361
47
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted));
1362
47
    }
1363
1364
    // Calculate shared object hint table information including references to shared objects from
1365
    // page offset hint data.
1366
1367
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1368
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1369
    // object number to index.  Then populate the shared object information for the pages.
1370
1371
    // Note that two objects never have the same object number, so we can map from object number
1372
    // only without regards to generation.
1373
3.62k
    std::map<int, int> obj_to_index;
1374
1375
3.62k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1376
3.62k
    m->c_shared_object_data.nshared_total =
1377
3.62k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1378
1379
3.62k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1380
26.6k
    for (auto& oh: m->part6) {
1381
26.6k
        int obj = oh.getObjectID();
1382
26.6k
        obj_to_index[obj] = toI(shared.size());
1383
26.6k
        shared.emplace_back(obj);
1384
26.6k
    }
1385
3.62k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1386
3.62k
    if (!m->part8.empty()) {
1387
313
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1388
5.34k
        for (auto& oh: m->part8) {
1389
5.34k
            int obj = oh.getObjectID();
1390
5.34k
            obj_to_index[obj] = toI(shared.size());
1391
5.34k
            shared.emplace_back(obj);
1392
5.34k
        }
1393
313
    }
1394
3.62k
    if (static_cast<size_t>(m->c_shared_object_data.nshared_total) !=
1395
3.62k
        m->c_shared_object_data.entries.size()) {
1396
0
        stopOnError("shared object hint table has wrong number of entries");
1397
0
    }
1398
1399
    // Now compute the list of shared objects for each page after the first page.
1400
1401
14.0k
    for (size_t i = 1; i < toS(npages); ++i) {
1402
10.4k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1403
10.4k
        ObjUser ou(ObjUser::ou_page, toI(i));
1404
10.4k
        if (m->obj_user_to_objects.count(ou) == 0) {
1405
0
            stopOnError("found unreferenced page while"
1406
0
                        " calculating linearization data");
1407
0
        }
1408
87.6k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1409
87.6k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.count(og.getObj()) > 0)) {
1410
50.7k
                int idx = obj_to_index[og.getObj()];
1411
50.7k
                ++pe.nshared_objects;
1412
50.7k
                pe.shared_identifiers.push_back(idx);
1413
50.7k
            }
1414
87.6k
        }
1415
10.4k
    }
1416
3.62k
}
Unexecuted instantiation: void QPDF::calculateLinearizationData<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::calculateLinearizationData<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&)
Line
Count
Source
977
3.62k
{
978
    // This function calculates the ordering of objects, divides them into the appropriate parts,
979
    // and computes some values for the linearization parameter dictionary and hint tables.  The
980
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
981
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
982
983
3.62k
    if (m->object_to_obj_users.empty()) {
984
        // Note that we can't call optimize here because we don't know whether it should be called
985
        // with or without allow changes.
986
0
        throw std::logic_error(
987
0
            "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()");
988
0
    }
989
990
    // Separate objects into the categories sufficient for us to determine which part of the
991
    // linearized file should contain the object.  This categorization is useful for other purposes
992
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
993
994
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
995
    // trailer dictionary in part 11).
996
997
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
998
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
999
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
1000
    // for now.
1001
1002
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
1003
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
1004
    // /Outlines, then information from /Outlines also appears here.
1005
1006
    // Part 7 contains remaining objects private to pages other than the first page.
1007
1008
    // Part 8 contains all remaining shared objects except those that are shared only within
1009
    // thumbnails.
1010
1011
    // Part 9 contains all remaining objects.
1012
1013
    // We sort objects into the following categories:
1014
1015
    //   * open_document: part 4
1016
1017
    //   * first_page_private: part 6
1018
1019
    //   * first_page_shared: part 6
1020
1021
    //   * other_page_private: part 7
1022
1023
    //   * other_page_shared: part 8
1024
1025
    //   * thumbnail_private: part 9
1026
1027
    //   * thumbnail_shared: part 9
1028
1029
    //   * other: part 9
1030
1031
    //   * outlines: part 6 or 9
1032
1033
3.62k
    m->part4.clear();
1034
3.62k
    m->part6.clear();
1035
3.62k
    m->part7.clear();
1036
3.62k
    m->part8.clear();
1037
3.62k
    m->part9.clear();
1038
3.62k
    m->c_linp = LinParameters();
1039
3.62k
    m->c_page_offset_data = CHPageOffset();
1040
3.62k
    m->c_shared_object_data = CHSharedObject();
1041
3.62k
    m->c_outline_data = HGeneric();
1042
1043
3.62k
    QPDFObjectHandle root = getRoot();
1044
3.62k
    bool outlines_in_first_page = false;
1045
3.62k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
1046
3.62k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
1047
3.62k
    if (pagemode.isName()) {
1048
682
        if (pagemode.getName() == "/UseOutlines") {
1049
518
            if (root.hasKey("/Outlines")) {
1050
375
                outlines_in_first_page = true;
1051
375
            } else {
1052
143
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1053
143
            }
1054
518
        }
1055
682
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1056
682
    }
1057
1058
3.62k
    std::set<std::string> open_document_keys;
1059
3.62k
    open_document_keys.insert("/ViewerPreferences");
1060
3.62k
    open_document_keys.insert("/PageMode");
1061
3.62k
    open_document_keys.insert("/Threads");
1062
3.62k
    open_document_keys.insert("/OpenAction");
1063
3.62k
    open_document_keys.insert("/AcroForm");
1064
1065
3.62k
    std::set<QPDFObjGen> lc_open_document;
1066
3.62k
    std::set<QPDFObjGen> lc_first_page_private;
1067
3.62k
    std::set<QPDFObjGen> lc_first_page_shared;
1068
3.62k
    std::set<QPDFObjGen> lc_other_page_private;
1069
3.62k
    std::set<QPDFObjGen> lc_other_page_shared;
1070
3.62k
    std::set<QPDFObjGen> lc_thumbnail_private;
1071
3.62k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1072
3.62k
    std::set<QPDFObjGen> lc_other;
1073
3.62k
    std::set<QPDFObjGen> lc_outlines;
1074
3.62k
    std::set<QPDFObjGen> lc_root;
1075
1076
91.5k
    for (auto& oiter: m->object_to_obj_users) {
1077
91.5k
        QPDFObjGen const& og = oiter.first;
1078
91.5k
        std::set<ObjUser>& ous = oiter.second;
1079
1080
91.5k
        bool in_open_document = false;
1081
91.5k
        bool in_first_page = false;
1082
91.5k
        int other_pages = 0;
1083
91.5k
        int thumbs = 0;
1084
91.5k
        int others = 0;
1085
91.5k
        bool in_outlines = false;
1086
91.5k
        bool is_root = false;
1087
1088
374k
        for (auto const& ou: ous) {
1089
374k
            switch (ou.ou_type) {
1090
180k
            case ObjUser::ou_trailer_key:
1091
180k
                if (ou.key == "/Encrypt") {
1092
170
                    in_open_document = true;
1093
180k
                } else {
1094
180k
                    ++others;
1095
180k
                }
1096
180k
                break;
1097
1098
9.30k
            case ObjUser::ou_thumb:
1099
9.30k
                ++thumbs;
1100
9.30k
                break;
1101
1102
46.4k
            case ObjUser::ou_root_key:
1103
46.4k
                if (open_document_keys.count(ou.key) > 0) {
1104
15.2k
                    in_open_document = true;
1105
31.1k
                } else if (ou.key == "/Outlines") {
1106
4.54k
                    in_outlines = true;
1107
26.6k
                } else {
1108
26.6k
                    ++others;
1109
26.6k
                }
1110
46.4k
                break;
1111
1112
134k
            case ObjUser::ou_page:
1113
134k
                if (ou.pageno == 0) {
1114
41.0k
                    in_first_page = true;
1115
93.2k
                } else {
1116
93.2k
                    ++other_pages;
1117
93.2k
                }
1118
134k
                break;
1119
1120
3.62k
            case ObjUser::ou_root:
1121
3.62k
                is_root = true;
1122
3.62k
                break;
1123
1124
0
            case ObjUser::ou_bad:
1125
0
                stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: "
1126
0
                            "invalid user type");
1127
0
                break;
1128
374k
            }
1129
374k
        }
1130
1131
91.5k
        if (is_root) {
1132
3.62k
            lc_root.insert(og);
1133
87.9k
        } else if (in_outlines) {
1134
4.51k
            lc_outlines.insert(og);
1135
83.4k
        } else if (in_open_document) {
1136
15.3k
            lc_open_document.insert(og);
1137
68.0k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1138
15.2k
            lc_first_page_private.insert(og);
1139
52.8k
        } else if (in_first_page) {
1140
9.90k
            lc_first_page_shared.insert(og);
1141
42.9k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1142
23.6k
            lc_other_page_private.insert(og);
1143
23.6k
        } else if (other_pages > 1) {
1144
5.83k
            lc_other_page_shared.insert(og);
1145
13.5k
        } else if ((thumbs == 1) && (others == 0)) {
1146
2.06k
            lc_thumbnail_private.insert(og);
1147
11.4k
        } else if (thumbs > 1) {
1148
830
            lc_thumbnail_shared.insert(og);
1149
10.6k
        } else {
1150
10.6k
            lc_other.insert(og);
1151
10.6k
        }
1152
91.5k
    }
1153
1154
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1155
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1156
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1157
    // and the above section can be considered only with categorization.  Note that sets of
1158
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1159
    // the possible exception of hints tables which we won't see here anyway.  That means that
1160
    // running calculateLinearizationData() on a linearized file should give results identical to
1161
    // the original file ordering.
1162
1163
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1164
    // code optimization if necessary. Premature optimization is the root of all evil.
1165
3.62k
    std::vector<QPDFObjectHandle> pages;
1166
3.62k
    { // local scope
1167
        // Map all page objects to the containing object stream.  This should be a no-op in a
1168
        // properly linearized file.
1169
15.2k
        for (auto oh: getAllPages()) {
1170
15.2k
            pages.push_back(getUncompressedObject(oh, object_stream_data));
1171
15.2k
        }
1172
3.62k
    }
1173
3.62k
    int npages = toI(pages.size());
1174
1175
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1176
    // initialize any items that deal with object numbers or counts but not any items that deal with
1177
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1178
    // during the first pass.  The validation code can compute them relatively easily given the rest
1179
    // of the information.
1180
1181
    // npages is the size of the existing pages vector, which has been created by traversing the
1182
    // pages tree, and as such is a reasonable size.
1183
3.62k
    m->c_linp.npages = npages;
1184
3.62k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(toS(npages));
1185
1186
    // Part 4: open document objects.  We don't care about the order.
1187
1188
3.62k
    if (lc_root.size() != 1) {
1189
0
        stopOnError("found other than one root while"
1190
0
                    " calculating linearization data");
1191
0
    }
1192
3.62k
    m->part4.push_back(getObject(*(lc_root.begin())));
1193
15.3k
    for (auto const& og: lc_open_document) {
1194
15.3k
        m->part4.push_back(getObject(og));
1195
15.3k
    }
1196
1197
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1198
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1199
    // any option to set this and also disregards /OpenAction.  We will do the same.
1200
1201
    // First, place the actual first page object itself.
1202
3.62k
    if (pages.empty()) {
1203
20
        stopOnError("no pages found while calculating linearization data");
1204
20
    }
1205
3.62k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1206
3.62k
    if (!lc_first_page_private.count(first_page_og)) {
1207
221
        stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1208
221
                    "object not in lc_first_page_private");
1209
221
    }
1210
3.62k
    lc_first_page_private.erase(first_page_og);
1211
3.62k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1212
3.62k
    m->part6.push_back(pages.at(0));
1213
1214
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1215
    // it except to the extent that it groups private and shared objects contiguously for the sake
1216
    // of hint tables.
1217
1218
11.8k
    for (auto const& og: lc_first_page_private) {
1219
11.8k
        m->part6.push_back(getObject(og));
1220
11.8k
    }
1221
1222
9.15k
    for (auto const& og: lc_first_page_shared) {
1223
9.15k
        m->part6.push_back(getObject(og));
1224
9.15k
    }
1225
1226
    // Place the outline dictionary if it goes in the first page section.
1227
3.62k
    if (outlines_in_first_page) {
1228
370
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1229
370
    }
1230
1231
    // Fill in page offset hint table information for the first page. The PDF spec says that
1232
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1233
    // it fills in garbage values for all the shared object identifiers on the first page.
1234
1235
3.62k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1236
1237
    // Part 7: other pages' private objects
1238
1239
    // For each page in order:
1240
14.3k
    for (size_t i = 1; i < toS(npages); ++i) {
1241
        // Place this page's page object
1242
1243
10.7k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1244
10.7k
        if (!lc_other_page_private.count(page_og)) {
1245
111
            stopOnError(
1246
111
                "INTERNAL ERROR: "
1247
111
                "QPDF::calculateLinearizationData: page object for page " +
1248
111
                std::to_string(i) + " not in lc_other_page_private");
1249
111
        }
1250
10.7k
        lc_other_page_private.erase(page_og);
1251
10.7k
        m->part7.push_back(pages.at(i));
1252
1253
        // Place all non-shared objects referenced by this page, updating the page object count for
1254
        // the hint table.
1255
1256
10.7k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1257
1258
10.7k
        ObjUser ou(ObjUser::ou_page, toI(i));
1259
10.7k
        if (m->obj_user_to_objects.count(ou) == 0) {
1260
0
            stopOnError("found unreferenced page while"
1261
0
                        " calculating linearization data");
1262
0
        }
1263
88.1k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1264
88.1k
            if (lc_other_page_private.count(og)) {
1265
11.8k
                lc_other_page_private.erase(og);
1266
11.8k
                m->part7.push_back(getObject(og));
1267
11.8k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1268
11.8k
            }
1269
88.1k
        }
1270
10.7k
    }
1271
    // That should have covered all part7 objects.
1272
3.62k
    if (!lc_other_page_private.empty()) {
1273
0
        stopOnError("INTERNAL ERROR:"
1274
0
                    " QPDF::calculateLinearizationData: lc_other_page_private is "
1275
0
                    "not empty after generation of part7");
1276
0
    }
1277
1278
    // Part 8: other pages' shared objects
1279
1280
    // Order is unimportant.
1281
5.35k
    for (auto const& og: lc_other_page_shared) {
1282
5.35k
        m->part8.push_back(getObject(og));
1283
5.35k
    }
1284
1285
    // Part 9: other objects
1286
1287
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1288
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1289
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1290
    // we throw all remaining objects in arbitrary order.
1291
1292
    // Place the pages tree.
1293
3.62k
    std::set<QPDFObjGen> pages_ogs =
1294
3.62k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1295
3.62k
    if (pages_ogs.empty()) {
1296
12
        stopOnError("found empty pages tree while"
1297
12
                    " calculating linearization data");
1298
12
    }
1299
5.45k
    for (auto const& og: pages_ogs) {
1300
5.45k
        if (lc_other.count(og)) {
1301
2.51k
            lc_other.erase(og);
1302
2.51k
            m->part9.push_back(getObject(og));
1303
2.51k
        }
1304
5.45k
    }
1305
1306
    // Place private thumbnail images in page order.  Slightly more information would be required if
1307
    // we were going to bother with thumbnail hint tables.
1308
17.3k
    for (size_t i = 0; i < toS(npages); ++i) {
1309
13.7k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1310
13.7k
        thumb = getUncompressedObject(thumb, object_stream_data);
1311
13.7k
        if (!thumb.isNull()) {
1312
            // Output the thumbnail itself
1313
1.66k
            QPDFObjGen thumb_og(thumb.getObjGen());
1314
1.66k
            if (lc_thumbnail_private.count(thumb_og)) {
1315
1.47k
                lc_thumbnail_private.erase(thumb_og);
1316
1.47k
                m->part9.push_back(thumb);
1317
1.47k
            } else {
1318
                // No internal error this time...there's nothing to stop this object from having
1319
                // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1320
                // there's nothing to prevent it from having been in some set other than
1321
                // lc_thumbnail_private.
1322
190
            }
1323
1.66k
            std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, toI(i))];
1324
8.62k
            for (auto const& og: ogs) {
1325
8.62k
                if (lc_thumbnail_private.count(og)) {
1326
437
                    lc_thumbnail_private.erase(og);
1327
437
                    m->part9.push_back(getObject(og));
1328
437
                }
1329
8.62k
            }
1330
1.66k
        }
1331
13.7k
    }
1332
3.62k
    if (!lc_thumbnail_private.empty()) {
1333
7
        stopOnError("INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1334
7
                    "empty after placing thumbnails");
1335
7
    }
1336
1337
    // Place shared thumbnail objects
1338
3.62k
    for (auto const& og: lc_thumbnail_shared) {
1339
794
        m->part9.push_back(getObject(og));
1340
794
    }
1341
1342
    // Place outlines unless in first page
1343
3.62k
    if (!outlines_in_first_page) {
1344
2.89k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1345
2.89k
    }
1346
1347
    // Place all remaining objects
1348
7.22k
    for (auto const& og: lc_other) {
1349
7.22k
        m->part9.push_back(getObject(og));
1350
7.22k
    }
1351
1352
    // Make sure we got everything exactly once.
1353
1354
3.62k
    size_t num_placed =
1355
3.62k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1356
3.62k
    size_t num_wanted = m->object_to_obj_users.size();
1357
3.62k
    if (num_placed != num_wanted) {
1358
47
        stopOnError(
1359
47
            "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1360
47
            "number of objects placed (num_placed = " +
1361
47
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted));
1362
47
    }
1363
1364
    // Calculate shared object hint table information including references to shared objects from
1365
    // page offset hint data.
1366
1367
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1368
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1369
    // object number to index.  Then populate the shared object information for the pages.
1370
1371
    // Note that two objects never have the same object number, so we can map from object number
1372
    // only without regards to generation.
1373
3.62k
    std::map<int, int> obj_to_index;
1374
1375
3.62k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1376
3.62k
    m->c_shared_object_data.nshared_total =
1377
3.62k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1378
1379
3.62k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1380
26.6k
    for (auto& oh: m->part6) {
1381
26.6k
        int obj = oh.getObjectID();
1382
26.6k
        obj_to_index[obj] = toI(shared.size());
1383
26.6k
        shared.emplace_back(obj);
1384
26.6k
    }
1385
3.62k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1386
3.62k
    if (!m->part8.empty()) {
1387
313
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1388
5.34k
        for (auto& oh: m->part8) {
1389
5.34k
            int obj = oh.getObjectID();
1390
5.34k
            obj_to_index[obj] = toI(shared.size());
1391
5.34k
            shared.emplace_back(obj);
1392
5.34k
        }
1393
313
    }
1394
3.62k
    if (static_cast<size_t>(m->c_shared_object_data.nshared_total) !=
1395
3.62k
        m->c_shared_object_data.entries.size()) {
1396
0
        stopOnError("shared object hint table has wrong number of entries");
1397
0
    }
1398
1399
    // Now compute the list of shared objects for each page after the first page.
1400
1401
14.0k
    for (size_t i = 1; i < toS(npages); ++i) {
1402
10.4k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1403
10.4k
        ObjUser ou(ObjUser::ou_page, toI(i));
1404
10.4k
        if (m->obj_user_to_objects.count(ou) == 0) {
1405
0
            stopOnError("found unreferenced page while"
1406
0
                        " calculating linearization data");
1407
0
        }
1408
87.6k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1409
87.6k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.count(og.getObj()) > 0)) {
1410
50.7k
                int idx = obj_to_index[og.getObj()];
1411
50.7k
                ++pe.nshared_objects;
1412
50.7k
                pe.shared_identifiers.push_back(idx);
1413
50.7k
            }
1414
87.6k
        }
1415
10.4k
    }
1416
3.62k
}
1417
1418
template <typename T>
1419
void
1420
QPDF::pushOutlinesToPart(
1421
    std::vector<QPDFObjectHandle>& part,
1422
    std::set<QPDFObjGen>& lc_outlines,
1423
    T const& object_stream_data)
1424
3.26k
{
1425
3.26k
    QPDFObjectHandle root = getRoot();
1426
3.26k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1427
3.26k
    if (outlines.isNull()) {
1428
2.72k
        return;
1429
2.72k
    }
1430
536
    outlines = getUncompressedObject(outlines, object_stream_data);
1431
536
    QPDFObjGen outlines_og(outlines.getObjGen());
1432
536
    QTC::TC(
1433
536
        "qpdf",
1434
536
        "QPDF lin outlines in part",
1435
536
        ((&part == (&m->part6))       ? 0
1436
536
             : (&part == (&m->part9)) ? 1
1437
166
                                      : 9999)); // can't happen
1438
536
    m->c_outline_data.first_object = outlines_og.getObj();
1439
536
    m->c_outline_data.nobjects = 1;
1440
536
    lc_outlines.erase(outlines_og);
1441
536
    part.push_back(outlines);
1442
3.86k
    for (auto const& og: lc_outlines) {
1443
3.86k
        part.push_back(getObject(og));
1444
3.86k
        ++m->c_outline_data.nobjects;
1445
3.86k
    }
1446
536
}
Unexecuted instantiation: void QPDF::pushOutlinesToPart<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::pushOutlinesToPart<QPDFWriter::ObjTable>(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, QPDFWriter::ObjTable const&)
Line
Count
Source
1424
3.26k
{
1425
3.26k
    QPDFObjectHandle root = getRoot();
1426
3.26k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1427
3.26k
    if (outlines.isNull()) {
1428
2.72k
        return;
1429
2.72k
    }
1430
536
    outlines = getUncompressedObject(outlines, object_stream_data);
1431
536
    QPDFObjGen outlines_og(outlines.getObjGen());
1432
536
    QTC::TC(
1433
536
        "qpdf",
1434
536
        "QPDF lin outlines in part",
1435
536
        ((&part == (&m->part6))       ? 0
1436
536
             : (&part == (&m->part9)) ? 1
1437
166
                                      : 9999)); // can't happen
1438
536
    m->c_outline_data.first_object = outlines_og.getObj();
1439
536
    m->c_outline_data.nobjects = 1;
1440
536
    lc_outlines.erase(outlines_og);
1441
536
    part.push_back(outlines);
1442
3.86k
    for (auto const& og: lc_outlines) {
1443
3.86k
        part.push_back(getObject(og));
1444
3.86k
        ++m->c_outline_data.nobjects;
1445
3.86k
    }
1446
536
}
1447
1448
void
1449
QPDF::getLinearizedParts(
1450
    QPDFWriter::ObjTable const& obj,
1451
    std::vector<QPDFObjectHandle>& part4,
1452
    std::vector<QPDFObjectHandle>& part6,
1453
    std::vector<QPDFObjectHandle>& part7,
1454
    std::vector<QPDFObjectHandle>& part8,
1455
    std::vector<QPDFObjectHandle>& part9)
1456
3.62k
{
1457
3.62k
    calculateLinearizationData(obj);
1458
3.62k
    part4 = m->part4;
1459
3.62k
    part6 = m->part6;
1460
3.62k
    part7 = m->part7;
1461
3.62k
    part8 = m->part8;
1462
3.62k
    part9 = m->part9;
1463
3.62k
}
1464
1465
// Return the number of bits needed to represent val, i.e. the position of its highest set bit
// (0 for val == 0, 1 for val == 1, 8 for val == 255, ...).
//
// The count is taken over the unsigned bit pattern of val, so a negative argument yields the full
// width of int instead of shifting forever: an arithmetic right shift of a negative value never
// reaches zero.
static inline int
nbits(int val)
{
    auto remaining = static_cast<unsigned int>(val);
    int width = 0;
    while (remaining != 0) {
        ++width;
        remaining >>= 1;
    }
    return width;
}
1470
1471
int
1472
QPDF::outputLengthNextN(
1473
    int in_object, int n, QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1474
49.8k
{
1475
    // Figure out the length of a series of n consecutive objects in the output file starting with
1476
    // whatever object in_object from the input file mapped to.
1477
1478
49.8k
    int first = obj[in_object].renumber;
1479
49.8k
    int last = first + n;
1480
49.8k
    if (first <= 0) {
1481
0
        stopOnError("found object that is not renumbered while writing linearization data");
1482
0
    }
1483
49.8k
    qpdf_offset_t length = 0;
1484
159k
    for (int i = first; i < last; ++i) {
1485
110k
        auto l = new_obj[i].length;
1486
110k
        if (l == 0) {
1487
0
            stopOnError("found item with unknown length while writing linearization data");
1488
0
        }
1489
110k
        length += l;
1490
110k
    }
1491
49.8k
    return toI(length);
1492
49.8k
}
1493
1494
void
1495
QPDF::calculateHPageOffset(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1496
2.92k
{
1497
    // Page Offset Hint Table
1498
1499
    // We are purposely leaving some values set to their initial zero values.
1500
1501
2.92k
    std::vector<QPDFObjectHandle> const& pages = getAllPages();
1502
2.92k
    size_t npages = pages.size();
1503
2.92k
    CHPageOffset& cph = m->c_page_offset_data;
1504
2.92k
    std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1505
1506
    // Calculate minimum and maximum values for number of objects per page and page length.
1507
1508
2.92k
    int min_nobjects = cphe.at(0).nobjects;
1509
2.92k
    int max_nobjects = min_nobjects;
1510
2.92k
    int min_length = outputLengthNextN(pages.at(0).getObjectID(), min_nobjects, new_obj, obj);
1511
2.92k
    int max_length = min_length;
1512
2.92k
    int max_shared = cphe.at(0).nshared_objects;
1513
1514
2.92k
    HPageOffset& ph = m->page_offset_hints;
1515
2.92k
    std::vector<HPageOffsetEntry>& phe = ph.entries;
1516
    // npages is the size of the existing pages array.
1517
2.92k
    phe = std::vector<HPageOffsetEntry>(npages);
1518
1519
15.8k
    for (unsigned int i = 0; i < npages; ++i) {
1520
        // Calculate values for each page, assigning full values to the delta items.  They will be
1521
        // adjusted later.
1522
1523
        // Repeat calculations for page 0 so we can assign to phe[i] without duplicating those
1524
        // assignments.
1525
1526
12.9k
        int nobjects = cphe.at(i).nobjects;
1527
12.9k
        int length = outputLengthNextN(pages.at(i).getObjectID(), nobjects, new_obj, obj);
1528
12.9k
        int nshared = cphe.at(i).nshared_objects;
1529
1530
12.9k
        min_nobjects = std::min(min_nobjects, nobjects);
1531
12.9k
        max_nobjects = std::max(max_nobjects, nobjects);
1532
12.9k
        min_length = std::min(min_length, length);
1533
12.9k
        max_length = std::max(max_length, length);
1534
12.9k
        max_shared = std::max(max_shared, nshared);
1535
1536
12.9k
        phe.at(i).delta_nobjects = nobjects;
1537
12.9k
        phe.at(i).delta_page_length = length;
1538
12.9k
        phe.at(i).nshared_objects = nshared;
1539
12.9k
    }
1540
1541
2.92k
    ph.min_nobjects = min_nobjects;
1542
2.92k
    ph.first_page_offset = new_obj[obj[pages.at(0)].renumber].xref.getOffset();
1543
2.92k
    ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
1544
2.92k
    ph.min_page_length = min_length;
1545
2.92k
    ph.nbits_delta_page_length = nbits(max_length - min_length);
1546
2.92k
    ph.nbits_nshared_objects = nbits(max_shared);
1547
2.92k
    ph.nbits_shared_identifier = nbits(m->c_shared_object_data.nshared_total);
1548
2.92k
    ph.shared_denominator = 4; // doesn't matter
1549
1550
    // It isn't clear how to compute content offset and content length.  Since we are not
1551
    // interleaving page objects with the content stream, we'll use the same values for content
1552
    // length as page length.  We will use 0 as content offset because this is what Adobe does
1553
    // (implementation note 127) and pdlin as well.
1554
2.92k
    ph.nbits_delta_content_length = ph.nbits_delta_page_length;
1555
2.92k
    ph.min_content_length = ph.min_page_length;
1556
1557
15.8k
    for (size_t i = 0; i < npages; ++i) {
1558
        // Adjust delta entries
1559
12.9k
        if ((phe.at(i).delta_nobjects < min_nobjects) ||
1560
12.9k
            (phe.at(i).delta_page_length < min_length)) {
1561
0
            stopOnError("found too small delta nobjects or delta page length while writing "
1562
0
                        "linearization data");
1563
0
        }
1564
12.9k
        phe.at(i).delta_nobjects -= min_nobjects;
1565
12.9k
        phe.at(i).delta_page_length -= min_length;
1566
12.9k
        phe.at(i).delta_content_length = phe.at(i).delta_page_length;
1567
1568
62.9k
        for (size_t j = 0; j < toS(cphe.at(i).nshared_objects); ++j) {
1569
50.0k
            phe.at(i).shared_identifiers.push_back(cphe.at(i).shared_identifiers.at(j));
1570
50.0k
            phe.at(i).shared_numerators.push_back(0);
1571
50.0k
        }
1572
12.9k
    }
1573
2.92k
}
1574
1575
void
1576
QPDF::calculateHSharedObject(
1577
    QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1578
2.92k
{
1579
2.92k
    CHSharedObject& cso = m->c_shared_object_data;
1580
2.92k
    std::vector<CHSharedObjectEntry>& csoe = cso.entries;
1581
2.92k
    HSharedObject& so = m->shared_object_hints;
1582
2.92k
    std::vector<HSharedObjectEntry>& soe = so.entries;
1583
2.92k
    soe.clear();
1584
1585
2.92k
    int min_length = outputLengthNextN(csoe.at(0).object, 1, new_obj, obj);
1586
2.92k
    int max_length = min_length;
1587
1588
33.5k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1589
        // Assign absolute numbers to deltas; adjust later
1590
30.6k
        int length = outputLengthNextN(csoe.at(i).object, 1, new_obj, obj);
1591
30.6k
        min_length = std::min(min_length, length);
1592
30.6k
        max_length = std::max(max_length, length);
1593
30.6k
        soe.emplace_back();
1594
30.6k
        soe.at(i).delta_group_length = length;
1595
30.6k
    }
1596
2.92k
    if (soe.size() != toS(cso.nshared_total)) {
1597
0
        stopOnError("soe has wrong size after initialization");
1598
0
    }
1599
1600
2.92k
    so.nshared_total = cso.nshared_total;
1601
2.92k
    so.nshared_first_page = cso.nshared_first_page;
1602
2.92k
    if (so.nshared_total > so.nshared_first_page) {
1603
292
        so.first_shared_obj = obj[cso.first_shared_obj].renumber;
1604
292
        so.min_group_length = min_length;
1605
292
        so.first_shared_offset = new_obj[so.first_shared_obj].xref.getOffset();
1606
292
    }
1607
2.92k
    so.min_group_length = min_length;
1608
2.92k
    so.nbits_delta_group_length = nbits(max_length - min_length);
1609
1610
33.5k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1611
        // Adjust deltas
1612
30.6k
        if (soe.at(i).delta_group_length < min_length) {
1613
0
            stopOnError("found too small group length while writing linearization data");
1614
0
        }
1615
30.6k
        soe.at(i).delta_group_length -= min_length;
1616
30.6k
    }
1617
2.92k
}
1618
1619
void
1620
QPDF::calculateHOutline(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1621
2.92k
{
1622
2.92k
    HGeneric& cho = m->c_outline_data;
1623
1624
2.92k
    if (cho.nobjects == 0) {
1625
2.45k
        return;
1626
2.45k
    }
1627
1628
471
    HGeneric& ho = m->outline_hints;
1629
1630
471
    ho.first_object = obj[cho.first_object].renumber;
1631
471
    ho.first_object_offset = new_obj[ho.first_object].xref.getOffset();
1632
471
    ho.nobjects = cho.nobjects;
1633
471
    ho.group_length = outputLengthNextN(cho.first_object, ho.nobjects, new_obj, obj);
1634
471
}
1635
1636
template <class T, class int_type>
1637
static void
1638
write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, int bits, int_type T::*field)
1639
23.3k
{
1640
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1641
1642
179k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1643
156k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1644
156k
    }
1645
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1646
    // start on a byte boundary.
1647
23.3k
    w.flush();
1648
23.3k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Line
Count
Source
1639
5.84k
{
1640
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1641
1642
31.6k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1643
25.8k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1644
25.8k
    }
1645
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1646
    // start on a byte boundary.
1647
5.84k
    w.flush();
1648
5.84k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, long long>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Line
Count
Source
1639
8.76k
{
1640
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1641
1642
47.4k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1643
38.7k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1644
38.7k
    }
1645
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1646
    // start on a byte boundary.
1647
8.76k
    w.flush();
1648
8.76k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HSharedObjectEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
Line
Count
Source
1639
8.76k
{
1640
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1641
1642
100k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1643
92.0k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1644
92.0k
    }
1645
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1646
    // start on a byte boundary.
1647
8.76k
    w.flush();
1648
8.76k
}
1649
1650
template <class T>
1651
static void
1652
write_vector_vector(
1653
    BitWriter& w,
1654
    int nitems1,
1655
    std::vector<T>& vec1,
1656
    int T::*nitems2,
1657
    int bits,
1658
    std::vector<int> T::*vec2)
1659
5.84k
{
1660
    // nitems1 times, write nitems2 (from the ith element of vec1) items from the vec2 vector field
1661
    // of the ith item of vec1.
1662
31.6k
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
1663
125k
        for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2) {
1664
100k
            w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)), QIntC::to_size(bits));
1665
100k
        }
1666
25.8k
    }
1667
5.84k
    w.flush();
1668
5.84k
}
1669
1670
void
1671
QPDF::writeHPageOffset(BitWriter& w)
1672
2.92k
{
1673
2.92k
    HPageOffset& t = m->page_offset_hints;
1674
1675
2.92k
    w.writeBitsInt(t.min_nobjects, 32);               // 1
1676
2.92k
    w.writeBits(toULL(t.first_page_offset), 32);      // 2
1677
2.92k
    w.writeBitsInt(t.nbits_delta_nobjects, 16);       // 3
1678
2.92k
    w.writeBitsInt(t.min_page_length, 32);            // 4
1679
2.92k
    w.writeBitsInt(t.nbits_delta_page_length, 16);    // 5
1680
2.92k
    w.writeBits(toULL(t.min_content_offset), 32);     // 6
1681
2.92k
    w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7
1682
2.92k
    w.writeBitsInt(t.min_content_length, 32);         // 8
1683
2.92k
    w.writeBitsInt(t.nbits_delta_content_length, 16); // 9
1684
2.92k
    w.writeBitsInt(t.nbits_nshared_objects, 16);      // 10
1685
2.92k
    w.writeBitsInt(t.nbits_shared_identifier, 16);    // 11
1686
2.92k
    w.writeBitsInt(t.nbits_shared_numerator, 16);     // 12
1687
2.92k
    w.writeBitsInt(t.shared_denominator, 16);         // 13
1688
1689
2.92k
    int nitems = toI(getAllPages().size());
1690
2.92k
    std::vector<HPageOffsetEntry>& entries = t.entries;
1691
1692
2.92k
    write_vector_int(w, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
1693
2.92k
    write_vector_int(
1694
2.92k
        w, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
1695
2.92k
    write_vector_int(
1696
2.92k
        w, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
1697
2.92k
    write_vector_vector(
1698
2.92k
        w,
1699
2.92k
        nitems,
1700
2.92k
        entries,
1701
2.92k
        &HPageOffsetEntry::nshared_objects,
1702
2.92k
        t.nbits_shared_identifier,
1703
2.92k
        &HPageOffsetEntry::shared_identifiers);
1704
2.92k
    write_vector_vector(
1705
2.92k
        w,
1706
2.92k
        nitems,
1707
2.92k
        entries,
1708
2.92k
        &HPageOffsetEntry::nshared_objects,
1709
2.92k
        t.nbits_shared_numerator,
1710
2.92k
        &HPageOffsetEntry::shared_numerators);
1711
2.92k
    write_vector_int(
1712
2.92k
        w, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
1713
2.92k
    write_vector_int(
1714
2.92k
        w, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
1715
2.92k
}
1716
1717
void
1718
QPDF::writeHSharedObject(BitWriter& w)
1719
2.92k
{
1720
2.92k
    HSharedObject& t = m->shared_object_hints;
1721
1722
2.92k
    w.writeBitsInt(t.first_shared_obj, 32);         // 1
1723
2.92k
    w.writeBits(toULL(t.first_shared_offset), 32);  // 2
1724
2.92k
    w.writeBitsInt(t.nshared_first_page, 32);       // 3
1725
2.92k
    w.writeBitsInt(t.nshared_total, 32);            // 4
1726
2.92k
    w.writeBitsInt(t.nbits_nobjects, 16);           // 5
1727
2.92k
    w.writeBitsInt(t.min_group_length, 32);         // 6
1728
2.92k
    w.writeBitsInt(t.nbits_delta_group_length, 16); // 7
1729
1730
2.92k
    QTC::TC(
1731
2.92k
        "qpdf",
1732
2.92k
        "QPDF lin write nshared_total > nshared_first_page",
1733
2.92k
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
1734
1735
2.92k
    int nitems = t.nshared_total;
1736
2.92k
    std::vector<HSharedObjectEntry>& entries = t.entries;
1737
1738
2.92k
    write_vector_int(
1739
2.92k
        w, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
1740
2.92k
    write_vector_int(w, nitems, entries, 1, &HSharedObjectEntry::signature_present);
1741
33.5k
    for (size_t i = 0; i < toS(nitems); ++i) {
1742
        // If signature were present, we'd have to write a 128-bit hash.
1743
30.6k
        if (entries.at(i).signature_present != 0) {
1744
0
            stopOnError("found unexpected signature present"
1745
0
                        " while writing linearization data");
1746
0
        }
1747
30.6k
    }
1748
2.92k
    write_vector_int(w, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
1749
2.92k
}
1750
1751
void
1752
QPDF::writeHGeneric(BitWriter& w, HGeneric& t)
1753
471
{
1754
471
    w.writeBitsInt(t.first_object, 32);            // 1
1755
471
    w.writeBits(toULL(t.first_object_offset), 32); // 2
1756
471
    w.writeBitsInt(t.nobjects, 32);                // 3
1757
471
    w.writeBitsInt(t.group_length, 32);            // 4
1758
471
}
1759
1760
void
1761
QPDF::generateHintStream(
1762
    QPDFWriter::NewObjTable const& new_obj,
1763
    QPDFWriter::ObjTable const& obj,
1764
    std::shared_ptr<Buffer>& hint_buffer,
1765
    int& S,
1766
    int& O,
1767
    bool compressed)
1768
2.92k
{
1769
    // Populate actual hint table values
1770
2.92k
    calculateHPageOffset(new_obj, obj);
1771
2.92k
    calculateHSharedObject(new_obj, obj);
1772
2.92k
    calculateHOutline(new_obj, obj);
1773
1774
    // Write the hint stream itself into a compressed memory buffer. Write through a counter so we
1775
    // can get offsets.
1776
2.92k
    Pl_Buffer hint_stream("hint stream");
1777
2.92k
    Pipeline* next = &hint_stream;
1778
2.92k
    std::shared_ptr<Pipeline> flate;
1779
2.92k
    if (compressed) {
1780
2.92k
        flate =
1781
2.92k
            std::make_shared<Pl_Flate>("compress hint stream", &hint_stream, Pl_Flate::a_deflate);
1782
2.92k
        next = flate.get();
1783
2.92k
    }
1784
2.92k
    Pl_Count c("count", next);
1785
2.92k
    BitWriter w(&c);
1786
1787
2.92k
    writeHPageOffset(w);
1788
2.92k
    S = toI(c.getCount());
1789
2.92k
    writeHSharedObject(w);
1790
2.92k
    O = 0;
1791
2.92k
    if (m->outline_hints.nobjects > 0) {
1792
471
        O = toI(c.getCount());
1793
471
        writeHGeneric(w, m->outline_hints);
1794
471
    }
1795
2.92k
    c.finish();
1796
1797
2.92k
    hint_buffer = hint_stream.getBufferSharedPointer();
1798
2.92k
}