Coverage Report

Created: 2025-10-12 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDF_linearization.cc
Line
Count
Source
1
// See doc/linearization.
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <qpdf/BitStream.hh>
6
#include <qpdf/BitWriter.hh>
7
#include <qpdf/InputSource_private.hh>
8
#include <qpdf/Pipeline_private.hh>
9
#include <qpdf/Pl_Buffer.hh>
10
#include <qpdf/Pl_Flate.hh>
11
#include <qpdf/Pl_String.hh>
12
#include <qpdf/QPDFExc.hh>
13
#include <qpdf/QPDFObjectHandle_private.hh>
14
#include <qpdf/QPDFWriter_private.hh>
15
#include <qpdf/QTC.hh>
16
#include <qpdf/QUtil.hh>
17
#include <qpdf/Util.hh>
18
19
#include <algorithm>
20
#include <cmath>
21
#include <cstring>
22
#include <utility>
23
24
using namespace qpdf;
25
using namespace std::literals;
26
27
using Lin = QPDF::Doc::Linearization;
28
29
template <class T, class int_type>
30
static void
31
load_vector_int(
32
    BitStream& bit_stream, int nitems, std::vector<T>& vec, int bits_wanted, int_type T::* field)
33
0
{
34
0
    bool append = vec.empty();
35
    // nitems times, read bits_wanted from the given bit stream, storing results in the ith vector
36
    // entry.
37
38
0
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
39
0
        if (append) {
40
0
            vec.push_back(T());
41
0
        }
42
0
        vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
43
0
    }
44
0
    util::assertion(
45
0
        std::cmp_equal(vec.size(), nitems), "vector has wrong size in load_vector_int" //
46
0
    );
47
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
48
    // start on a byte boundary.
49
0
    bit_stream.skipToNextByte();
50
0
}
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, int>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HPageOffsetEntry, long long>(BitStream&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::HSharedObjectEntry, int>(BitStream&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
51
52
template <class T>
53
static void
54
load_vector_vector(
55
    BitStream& bit_stream,
56
    int nitems1,
57
    std::vector<T>& vec1,
58
    int T::* nitems2,
59
    int bits_wanted,
60
    std::vector<int> T::* vec2)
61
0
{
62
    // nitems1 times, read nitems2 (from the ith element of vec1) items into the vec2 vector field
63
    // of the ith item of vec1.
64
0
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
65
0
        for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2) {
66
0
            (vec1.at(i1).*vec2).push_back(bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
67
0
        }
68
0
    }
69
0
    bit_stream.skipToNextByte();
70
0
}
71
72
void
73
Lin::linearizationWarning(std::string_view msg)
74
0
{
75
0
    m->linearization_warnings = true;
76
0
    qpdf.warn(qpdf_e_linearization, "", 0, std::string(msg));
77
0
}
78
79
bool
80
QPDF::checkLinearization()
81
0
{
82
0
    try {
83
0
        m->lin.readLinearizationData();
84
0
        m->lin.checkLinearizationInternal();
85
0
        return !m->linearization_warnings;
86
0
    } catch (std::runtime_error& e) {
87
0
        m->lin.linearizationWarning(
88
0
            "error encountered while checking linearization data: " + std::string(e.what()));
89
0
        return false;
90
0
    }
91
0
}
92
93
bool
94
QPDF::isLinearized()
95
0
{
96
    // If the first object in the file is a dictionary with a suitable /Linearized key and has an /L
97
    // key that accurately indicates the file size, initialize m->lindict and return true.
98
99
    // A linearized PDF spec's first object will be contained within the first 1024 bytes of the
100
    // file and will be a dictionary with a valid /Linearized key.  This routine looks for that and
101
    // does no additional validation.
102
103
    // The PDF spec says the linearization dictionary must be completely contained within the first
104
    // 1024 bytes of the file. Add a byte for a null terminator.
105
0
    auto buffer = m->file->read(1024, 0);
106
0
    size_t pos = 0;
107
0
    while (true) {
108
        // Find a digit or end of buffer
109
0
        pos = buffer.find_first_of("0123456789"sv, pos);
110
0
        if (pos == std::string::npos) {
111
0
            return false;
112
0
        }
113
        // Seek to the digit. Then skip over digits for a potential
114
        // next iteration.
115
0
        m->file->seek(toO(pos), SEEK_SET);
116
117
0
        auto t1 = m->objects.readToken(*m->file, 20);
118
0
        if (!(t1.isInteger() && m->objects.readToken(*m->file, 6).isInteger() &&
119
0
              m->objects.readToken(*m->file, 4).isWord("obj"))) {
120
0
            pos = buffer.find_first_not_of("0123456789"sv, pos);
121
0
            if (pos == std::string::npos) {
122
0
                return false;
123
0
            }
124
0
            continue;
125
0
        }
126
127
0
        Dictionary candidate = getObject(toI(QUtil::string_to_ll(t1.getValue().data())), 0);
128
0
        auto linkey = candidate["/Linearized"];
129
0
        if (!(linkey.isNumber() && toI(floor(linkey.getNumericValue())) == 1)) {
130
0
            return false;
131
0
        }
132
133
0
        m->file->seek(0, SEEK_END);
134
0
        Integer L = candidate["/L"];
135
0
        if (L != m->file->tell()) {
136
0
            return false;
137
0
        }
138
0
        m->linp.file_size = L;
139
0
        m->lindict = candidate;
140
0
        return true;
141
0
    }
142
0
}
143
144
void
145
Lin::readLinearizationData()
146
0
{
147
0
    util::assertion(
148
0
        qpdf.isLinearized(), "called readLinearizationData for file that is not linearized" //
149
0
    );
150
151
    // This function throws an exception (which is trapped by checkLinearization()) for any errors
152
    // that prevent loading.
153
154
    // /L is read and stored in linp by isLinearized()
155
0
    Array H = m->lindict["/H"]; // hint table offset/length for primary and overflow hint tables
156
0
    auto H_size = H.size();
157
0
    Integer H_0 = H[0]; // hint table offset
158
0
    Integer H_1 = H[1]; // hint table length
159
0
    Integer H_2 = H[2]; // hint table offset for overflow hint table
160
0
    Integer H_3 = H[3]; // hint table length for overflow hint table
161
0
    Integer O = m->lindict["/O"];
162
0
    Integer E = m->lindict["/E"];
163
0
    Integer N = m->lindict["/N"];
164
0
    Integer T = m->lindict["/T"];
165
0
    auto P_oh = m->lindict["/P"];
166
0
    Integer P = P_oh; // first page number
167
0
    QTC::TC("qpdf", "QPDF P absent in lindict", P ? 0 : 1);
168
169
0
    qpdf.no_ci_stop_if(
170
0
        !(H && O && E && N && T && (P || P_oh.null())),
171
0
        "some keys in linearization dictionary are of the wrong type",
172
0
        "linearization dictionary" //
173
0
    );
174
175
0
    qpdf.no_ci_stop_if(
176
0
        !(H_size == 2 || H_size == 4),
177
0
        "H has the wrong number of items",
178
0
        "linearization dictionary" //
179
0
    );
180
181
0
    qpdf.no_ci_stop_if(
182
0
        !(H_0 && H_1 && (H_size == 2 || (H_2 && H_3))),
183
0
        "some H items are of the wrong type",
184
0
        "linearization dictionary" //
185
0
    );
186
187
    // Store linearization parameter data
188
189
    // Various places in the code use linp.npages, which is initialized from N, to pre-allocate
190
    // memory, so make sure it's accurate and bail right now if it's not.
191
0
    qpdf.no_ci_stop_if(
192
0
        N != qpdf.getAllPages().size(),
193
0
        "/N does not match number of pages",
194
0
        "linearization dictionary" //
195
0
    );
196
197
    // file_size initialized by isLinearized()
198
0
    m->linp.first_page_object = O;
199
0
    m->linp.first_page_end = E;
200
0
    m->linp.npages = N;
201
0
    m->linp.xref_zero_offset = T;
202
0
    m->linp.first_page = P ? P : 0;
203
0
    m->linp.H_offset = H_0;
204
0
    m->linp.H_length = H_1;
205
206
    // Read hint streams
207
208
0
    Pl_Buffer pb("hint buffer");
209
0
    auto H0 = readHintStream(pb, H_0, H_1);
210
0
    if (H_2) {
211
0
        (void)readHintStream(pb, H_2, H_3);
212
0
    }
213
214
    // PDF 1.4 hint tables that we ignore:
215
216
    //  /T    thumbnail
217
    //  /A    thread information
218
    //  /E    named destination
219
    //  /V    interactive form
220
    //  /I    information dictionary
221
    //  /C    logical structure
222
    //  /L    page label
223
224
    // Individual hint table offsets
225
0
    Integer HS = H0["/S"]; // shared object
226
0
    Integer HO = H0["/O"]; // outline
227
228
0
    auto hbp = pb.getBufferSharedPointer();
229
0
    Buffer* hb = hbp.get();
230
0
    unsigned char const* h_buf = hb->getBuffer();
231
0
    size_t h_size = hb->getSize();
232
233
0
    readHPageOffset(BitStream(h_buf, h_size));
234
235
0
    size_t HSi = HS;
236
0
    if (HSi < 0 || HSi >= h_size) {
237
0
        throw qpdf.damagedPDF(
238
0
            "linearization hint table", "/S (shared object) offset is out of bounds");
239
0
    }
240
0
    readHSharedObject(BitStream(h_buf + HSi, h_size - HSi));
241
242
0
    if (HO) {
243
0
        qpdf.no_ci_stop_if(
244
0
            HO < 0 || HO >= h_size,
245
0
            "/O (outline) offset is out of bounds",
246
0
            "linearization dictionary" //
247
0
        );
248
0
        size_t HOi = HO;
249
0
        readHGeneric(BitStream(h_buf + HO, h_size - HOi), m->outline_hints);
250
0
    }
251
0
}
252
253
// Read the hint stream object located at offset, verify that offset + length falls within the
// recorded end-of-object range, pipe the stream's data into pl and return the stream dictionary.
//
// Stops with an error if the object is not a stream or if the computed end is outside the range.
Dictionary
Lin::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
{
    auto H = m->objects.readObjectAtOffset(offset, "linearization hint stream", false);
    ObjCache& stream_entry = m->obj_cache[H];
    qpdf_offset_t lowest_end = stream_entry.end_before_space;
    qpdf_offset_t highest_end = stream_entry.end_after_space;
    qpdf.no_ci_stop_if(
        !H.isStream(), "hint table is not a stream", "linearization dictionary" //
    );

    Dictionary Hdict = H.getDict();

    // Some versions of Acrobat make /Length indirect and place it immediately after the stream,
    // increasing length to cover it, even though the specification says all objects in the
    // linearization parameter dictionary must be direct.  In that case the end positions recorded
    // for the length object are the ones to compare against.
    if (!Hdict["/Length"].indirect()) {
        QTC::TC("qpdf", "QPDF hint table length direct");
    } else {
        ObjCache& length_entry = m->obj_cache[Hdict["/Length"]];
        lowest_end = length_entry.end_before_space;
        highest_end = length_entry.end_after_space;
    }

    qpdf_offset_t const computed_end = offset + toO(length);
    bool const end_in_range = lowest_end <= computed_end && computed_end <= highest_end;
    qpdf.no_ci_stop_if(
        !end_in_range,
        "hint table length mismatch (expected = " + std::to_string(computed_end) + "; actual = " +
            std::to_string(lowest_end) + ".." + std::to_string(highest_end) + ")",
        "linearization dictionary" //
    );
    H.pipeStreamData(&pl, 0, qpdf_dl_specialized);
    return Hdict;
}
287
288
void
289
Lin::readHPageOffset(BitStream h)
290
0
{
291
    // All comments referring to the PDF spec refer to the spec for version 1.4.
292
293
0
    HPageOffset& t = m->page_offset_hints;
294
295
0
    t.min_nobjects = h.getBitsInt(32);               // 1
296
0
    t.first_page_offset = h.getBitsInt(32);          // 2
297
0
    t.nbits_delta_nobjects = h.getBitsInt(16);       // 3
298
0
    t.min_page_length = h.getBitsInt(32);            // 4
299
0
    t.nbits_delta_page_length = h.getBitsInt(16);    // 5
300
0
    t.min_content_offset = h.getBitsInt(32);         // 6
301
0
    t.nbits_delta_content_offset = h.getBitsInt(16); // 7
302
0
    t.min_content_length = h.getBitsInt(32);         // 8
303
0
    t.nbits_delta_content_length = h.getBitsInt(16); // 9
304
0
    t.nbits_nshared_objects = h.getBitsInt(16);      // 10
305
0
    t.nbits_shared_identifier = h.getBitsInt(16);    // 11
306
0
    t.nbits_shared_numerator = h.getBitsInt(16);     // 12
307
0
    t.shared_denominator = h.getBitsInt(16);         // 13
308
309
0
    std::vector<HPageOffsetEntry>& entries = t.entries;
310
0
    entries.clear();
311
0
    int nitems = toI(m->linp.npages);
312
0
    load_vector_int(h, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
313
0
    load_vector_int(
314
0
        h, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
315
0
    load_vector_int(
316
0
        h, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
317
0
    load_vector_vector(
318
0
        h,
319
0
        nitems,
320
0
        entries,
321
0
        &HPageOffsetEntry::nshared_objects,
322
0
        t.nbits_shared_identifier,
323
0
        &HPageOffsetEntry::shared_identifiers);
324
0
    load_vector_vector(
325
0
        h,
326
0
        nitems,
327
0
        entries,
328
0
        &HPageOffsetEntry::nshared_objects,
329
0
        t.nbits_shared_numerator,
330
0
        &HPageOffsetEntry::shared_numerators);
331
0
    load_vector_int(
332
0
        h, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
333
0
    load_vector_int(
334
0
        h, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
335
0
}
336
337
void
338
Lin::readHSharedObject(BitStream h)
339
0
{
340
0
    HSharedObject& t = m->shared_object_hints;
341
342
0
    t.first_shared_obj = h.getBitsInt(32);         // 1
343
0
    t.first_shared_offset = h.getBitsInt(32);      // 2
344
0
    t.nshared_first_page = h.getBitsInt(32);       // 3
345
0
    t.nshared_total = h.getBitsInt(32);            // 4
346
0
    t.nbits_nobjects = h.getBitsInt(16);           // 5
347
0
    t.min_group_length = h.getBitsInt(32);         // 6
348
0
    t.nbits_delta_group_length = h.getBitsInt(16); // 7
349
350
0
    QTC::TC(
351
0
        "qpdf",
352
0
        "QPDF lin nshared_total > nshared_first_page",
353
0
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
354
355
0
    std::vector<HSharedObjectEntry>& entries = t.entries;
356
0
    entries.clear();
357
0
    int nitems = t.nshared_total;
358
0
    load_vector_int(
359
0
        h, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
360
0
    load_vector_int(h, nitems, entries, 1, &HSharedObjectEntry::signature_present);
361
0
    for (size_t i = 0; i < toS(nitems); ++i) {
362
0
        if (entries.at(i).signature_present) {
363
            // Skip 128-bit MD5 hash.  These are not supported by acrobat, so they should probably
364
            // never be there.  We have no test case for this.
365
0
            for (int j = 0; j < 4; ++j) {
366
0
                (void)h.getBits(32);
367
0
            }
368
0
        }
369
0
    }
370
0
    load_vector_int(h, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
371
0
}
372
373
void
374
Lin::readHGeneric(BitStream h, HGeneric& t)
375
0
{
376
0
    t.first_object = h.getBitsInt(32);        // 1
377
0
    t.first_object_offset = h.getBitsInt(32); // 2
378
0
    t.nobjects = h.getBitsInt(32);            // 3
379
0
    t.group_length = h.getBitsInt(32);        // 4
380
0
}
381
382
void
383
Lin::checkLinearizationInternal()
384
0
{
385
    // All comments referring to the PDF spec refer to the spec for version 1.4.
386
387
    // Check all values in linearization parameter dictionary
388
389
0
    LinParameters& p = m->linp;
390
391
    // L: file size in bytes -- checked by isLinearized
392
393
    // O: object number of first page
394
0
    std::vector<QPDFObjectHandle> const& pages = qpdf.getAllPages();
395
0
    if (p.first_page_object != pages.at(0).getObjectID()) {
396
0
        linearizationWarning("first page object (/O) mismatch");
397
0
    }
398
399
    // N: number of pages
400
0
    size_t npages = pages.size();
401
0
    if (std::cmp_not_equal(p.npages, npages)) {
402
        // Not tested in the test suite
403
0
        linearizationWarning("page count (/N) mismatch");
404
0
    }
405
406
0
    int i = 0;
407
0
    for (auto const& page: pages) {
408
0
        if (m->xref_table[page].getType() == 2) {
409
0
            linearizationWarning(
410
0
                "page dictionary for page " + std::to_string(i) + " is compressed");
411
0
        }
412
0
        ++i;
413
0
    }
414
415
    // T: offset of whitespace character preceding xref entry for object 0
416
0
    m->file->seek(p.xref_zero_offset, SEEK_SET);
417
0
    while (true) {
418
0
        char ch;
419
0
        m->file->read(&ch, 1);
420
0
        if (!(ch == ' ' || ch == '\r' || ch == '\n')) {
421
0
            m->file->seek(-1, SEEK_CUR);
422
0
            break;
423
0
        }
424
0
    }
425
0
    if (m->file->tell() != m->first_xref_item_offset) {
426
0
        linearizationWarning(
427
0
            "space before first xref item (/T) mismatch (computed = " +
428
0
            std::to_string(m->first_xref_item_offset) +
429
0
            "; file = " + std::to_string(m->file->tell()));
430
0
    }
431
432
    // P: first page number -- Implementation note 124 says Acrobat ignores this value, so we will
433
    // too.
434
435
    // Check numbering of compressed objects in each xref section. For linearized files, all
436
    // compressed objects are supposed to be at the end of the containing xref section if any object
437
    // streams are in use.
438
439
0
    if (m->uncompressed_after_compressed) {
440
0
        linearizationWarning(
441
0
            "linearized file contains an uncompressed object after a compressed "
442
0
            "one in a cross-reference stream");
443
0
    }
444
445
    // Further checking requires optimization and order calculation. Don't allow optimization to
446
    // make changes.  If it has to, then the file is not properly linearized.  We use the xref table
447
    // to figure out which objects are compressed and which are uncompressed.
448
0
    { // local scope
449
0
        std::map<int, int> object_stream_data;
450
0
        for (auto const& [og, entry]: m->xref_table) {
451
0
            if (entry.getType() == 2) {
452
0
                object_stream_data[og.getObj()] = entry.getObjStreamNumber();
453
0
            }
454
0
        }
455
0
        optimize_internal(object_stream_data, false, nullptr);
456
0
        calculateLinearizationData(object_stream_data);
457
0
    }
458
459
    // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra
460
    // object here by mistake.  pdlin fails to place thumbnail images in section 9, so when
461
    // thumbnails are present, it also gets the wrong value for /E.  It also doesn't count outlines
462
    // here when it should even though it places them in part 6.  This code fails to put thread
463
    // information dictionaries in part 9, so it actually gets the wrong value for E when threads
464
    // are present.  In that case, it would probably agree with pdlin.  As of this writing, the test
465
    // suite doesn't contain any files with threads.
466
467
0
    qpdf.no_ci_stop_if(
468
0
        m->part6.empty(), "linearization part 6 unexpectedly empty" //
469
0
    );
470
0
    qpdf_offset_t min_E = -1;
471
0
    qpdf_offset_t max_E = -1;
472
0
    for (auto const& oh: m->part6) {
473
0
        QPDFObjGen og(oh.getObjGen());
474
        // All objects have to have been dereferenced to be classified.
475
0
        util::assertion(m->obj_cache.contains(og), "linearization part6 object not in cache");
476
0
        ObjCache const& oc = m->obj_cache[og];
477
0
        min_E = std::max(min_E, oc.end_before_space);
478
0
        max_E = std::max(max_E, oc.end_after_space);
479
0
    }
480
0
    if (p.first_page_end < min_E || p.first_page_end > max_E) {
481
0
        linearizationWarning(
482
0
            "end of first page section (/E) mismatch: /E = " + std::to_string(p.first_page_end) +
483
0
            "; computed = " + std::to_string(min_E) + ".." + std::to_string(max_E));
484
0
    }
485
486
    // Check hint tables
487
488
0
    std::map<int, int> shared_idx_to_obj;
489
0
    checkHSharedObject(pages, shared_idx_to_obj);
490
0
    checkHPageOffset(pages, shared_idx_to_obj);
491
0
    checkHOutlines();
492
0
}
493
494
// Return the largest end_after_space recorded in the object cache among all objects listed for
// the object user ou (0 if the list is empty). Stops with an error if ou has no entry in the
// object user table or if one of its objects is not in the object cache.
qpdf_offset_t
Lin::maxEnd(ObjUser const& ou)
{
    qpdf.no_ci_stop_if(
        !m->obj_user_to_objects.contains(ou),
        "no entry in object user table for requested object user" //
    );

    qpdf_offset_t furthest = 0;
    for (auto const& og: m->obj_user_to_objects[ou]) {
        qpdf.no_ci_stop_if(
            !m->obj_cache.contains(og), "unknown object referenced in object user table" //
        );
        auto const candidate = m->obj_cache[og].end_after_space;
        if (furthest < candidate) {
            furthest = candidate;
        }
    }
    return furthest;
}
511
512
// Return the file offset to use for og when checking linearization: the xref offset for a type 1
// entry, or, for a type 2 (compressed) entry, the offset of the containing object stream. Stops
// with an error for any other entry type.
qpdf_offset_t
Lin::getLinearizationOffset(QPDFObjGen og)
{
    QPDFXRefEntry const& entry = m->xref_table[og];
    auto kind = entry.getType();
    if (kind != 1) {
        qpdf.no_ci_stop_if(
            kind != 2, "getLinearizationOffset called for xref entry not of type 1 or 2" //
        );
        // For compressed objects, return the offset of the object stream that contains them.
        return getLinearizationOffset({entry.getObjStreamNumber(), 0});
    }
    return entry.getOffset();
}
526
527
// If obj is non-null and its object ID is a key of object_stream_data, return the object whose
// number is the mapped value (generation 0); otherwise return obj itself.
QPDFObjectHandle
Lin::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
{
    if (!obj.null()) {
        auto const found = object_stream_data.find(obj.getObjectID());
        if (found != object_stream_data.end()) {
            return qpdf.getObject(found->second, 0);
        }
    }
    return obj;
}
535
536
// If the object table has an entry for oh with a positive object_stream id and oh is not null,
// return the object with that id (generation 0); in every other case return oh itself.
QPDFObjectHandle
Lin::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj)
{
    if (!obj.contains(oh)) {
        return oh;
    }
    auto id = obj[oh].object_stream;
    if (!(id > 0)) {
        // No object stream is recorded for this object.
        return oh;
    }
    if (oh.null()) {
        return oh;
    }
    return qpdf.getObject(id, 0);
}
546
547
int
548
Lin::lengthNextN(int first_object, int n)
549
0
{
550
0
    int length = 0;
551
0
    for (int i = 0; i < n; ++i) {
552
0
        QPDFObjGen og(first_object + i, 0);
553
0
        if (m->xref_table.contains(og)) {
554
0
            qpdf.no_ci_stop_if(
555
0
                !m->obj_cache.contains(og),
556
0
                "found unknown object while calculating length for linearization data" //
557
0
            );
558
559
0
            length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
560
0
        } else {
561
0
            linearizationWarning(
562
0
                "no xref table entry for " + std::to_string(first_object + i) + " 0");
563
0
        }
564
0
    }
565
0
    return length;
566
0
}
567
568
void
569
Lin::checkHPageOffset(
570
    std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& shared_idx_to_obj)
571
0
{
572
    // Implementation note 126 says Acrobat always sets delta_content_offset and
573
    // delta_content_length in the page offset header dictionary to 0.  It also states that
574
    // min_content_offset in the per-page information is always 0, which is an incorrect value.
575
576
    // Implementation note 127 explains that Acrobat always sets item 8 (min_content_length) to
577
    // zero, item 9 (nbits_delta_content_length) to the value of item 5 (nbits_delta_page_length),
578
    // and item 7 of each per-page hint table (delta_content_length) to item 2 (delta_page_length)
579
    // of that entry.  Acrobat ignores these values when reading files.
580
581
    // Empirically, it also seems that Acrobat sometimes puts items under a page's /Resources
582
    // dictionary in with shared objects even when they are private.
583
584
0
    size_t npages = pages.size();
585
0
    qpdf_offset_t table_offset = adjusted_offset(m->page_offset_hints.first_page_offset);
586
0
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
587
0
    if (!m->xref_table.contains(first_page_og)) {
588
0
        qpdf.stopOnError("supposed first page object is not known");
589
0
    }
590
0
    qpdf_offset_t offset = getLinearizationOffset(first_page_og);
591
0
    if (table_offset != offset) {
592
0
        linearizationWarning("first page object offset mismatch");
593
0
    }
594
595
0
    for (size_t pageno = 0; pageno < npages; ++pageno) {
596
0
        QPDFObjGen page_og(pages.at(pageno).getObjGen());
597
0
        int first_object = page_og.getObj();
598
0
        if (!m->xref_table.contains(page_og)) {
599
0
            qpdf.stopOnError("unknown object in page offset hint table");
600
0
        }
601
0
        offset = getLinearizationOffset(page_og);
602
603
0
        HPageOffsetEntry& he = m->page_offset_hints.entries.at(toS(pageno));
604
0
        CHPageOffsetEntry& ce = m->c_page_offset_data.entries.at(toS(pageno));
605
0
        int h_nobjects = he.delta_nobjects + m->page_offset_hints.min_nobjects;
606
0
        if (h_nobjects != ce.nobjects) {
607
            // This happens with pdlin when there are thumbnails.
608
0
            linearizationWarning(
609
0
                "object count mismatch for page " + std::to_string(pageno) + ": hint table = " +
610
0
                std::to_string(h_nobjects) + "; computed = " + std::to_string(ce.nobjects));
611
0
        }
612
613
        // Use value for number of objects in hint table rather than computed value if there is a
614
        // discrepancy.
615
0
        int length = lengthNextN(first_object, h_nobjects);
616
0
        int h_length = toI(he.delta_page_length + m->page_offset_hints.min_page_length);
617
0
        if (length != h_length) {
618
            // This condition almost certainly indicates a bad hint table or a bug in this code.
619
0
            linearizationWarning(
620
0
                "page length mismatch for page " + std::to_string(pageno) + ": hint table = " +
621
0
                std::to_string(h_length) + "; computed length = " + std::to_string(length) +
622
0
                " (offset = " + std::to_string(offset) + ")");
623
0
        }
624
625
0
        offset += h_length;
626
627
        // Translate shared object indexes to object numbers.
628
0
        std::set<int> hint_shared;
629
0
        std::set<int> computed_shared;
630
631
0
        if (pageno == 0 && he.nshared_objects > 0) {
632
            // pdlin and Acrobat both do this even though the spec states clearly and unambiguously
633
            // that they should not.
634
0
            linearizationWarning("page 0 has shared identifier entries");
635
0
        }
636
637
0
        for (size_t i = 0; i < toS(he.nshared_objects); ++i) {
638
0
            int idx = he.shared_identifiers.at(i);
639
0
            qpdf.no_ci_stop_if(
640
0
                !shared_idx_to_obj.contains(idx),
641
0
                "unable to get object for item in shared objects hint table");
642
643
0
            hint_shared.insert(shared_idx_to_obj[idx]);
644
0
        }
645
646
0
        for (size_t i = 0; i < toS(ce.nshared_objects); ++i) {
647
0
            int idx = ce.shared_identifiers.at(i);
648
0
            qpdf.no_ci_stop_if(
649
0
                idx >= m->c_shared_object_data.nshared_total,
650
0
                "index out of bounds for shared object hint table" //
651
0
            );
652
653
0
            int obj = m->c_shared_object_data.entries.at(toS(idx)).object;
654
0
            computed_shared.insert(obj);
655
0
        }
656
657
0
        for (int iter: hint_shared) {
658
0
            if (!computed_shared.contains(iter)) {
659
                // pdlin puts thumbnails here even though it shouldn't
660
0
                linearizationWarning(
661
0
                    "page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
662
0
                    ": in hint table but not computed list");
663
0
            }
664
0
        }
665
666
0
        for (int iter: computed_shared) {
667
0
            if (!hint_shared.contains(iter)) {
668
                // Acrobat does not put some things including at least built-in fonts and procsets
669
                // here, at least in some cases.
670
0
                linearizationWarning(
671
0
                    ("page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
672
0
                     ": in computed list but not hint table"));
673
0
            }
674
0
        }
675
0
    }
676
0
}
677
678
void
679
Lin::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj)
680
0
{
681
    // Implementation note 125 says shared object groups always contain only one object.
682
    // Implementation note 128 says that Acrobat always nbits_nobjects to zero.  Implementation note
683
    // 130 says that Acrobat does not support more than one shared object per group.  These are all
684
    // consistent.
685
686
    // Implementation note 129 states that MD5 signatures are not implemented in Acrobat, so
687
    // signature_present must always be zero.
688
689
    // Implementation note 131 states that first_shared_obj and first_shared_offset have meaningless
690
    // values for single-page files.
691
692
    // Empirically, Acrobat and pdlin generate incorrect values for these whenever there are no
693
    // shared objects not referenced by the first page (i.e., nshared_total == nshared_first_page).
694
695
0
    HSharedObject& so = m->shared_object_hints;
696
0
    if (so.nshared_total < so.nshared_first_page) {
697
0
        linearizationWarning("shared object hint table: ntotal < nfirst_page");
698
0
    } else {
699
        // The first nshared_first_page objects are consecutive objects starting with the first page
700
        // object.  The rest are consecutive starting from the first_shared_obj object.
701
0
        int cur_object = pages.at(0).getObjectID();
702
0
        for (int i = 0; i < so.nshared_total; ++i) {
703
0
            if (i == so.nshared_first_page) {
704
0
                QTC::TC("qpdf", "QPDF lin check shared past first page");
705
0
                if (m->part8.empty()) {
706
0
                    linearizationWarning("part 8 is empty but nshared_total > nshared_first_page");
707
0
                } else {
708
0
                    int obj = m->part8.at(0).getObjectID();
709
0
                    if (obj != so.first_shared_obj) {
710
0
                        linearizationWarning(
711
0
                            "first shared object number mismatch: hint table = " +
712
0
                            std::to_string(so.first_shared_obj) +
713
0
                            "; computed = " + std::to_string(obj));
714
0
                    }
715
0
                }
716
717
0
                cur_object = so.first_shared_obj;
718
719
0
                QPDFObjGen og(cur_object, 0);
720
0
                if (!m->xref_table.contains(og)) {
721
0
                    qpdf.stopOnError("unknown object in shared object hint table");
722
0
                }
723
0
                qpdf_offset_t offset = getLinearizationOffset(og);
724
0
                qpdf_offset_t h_offset = adjusted_offset(so.first_shared_offset);
725
0
                if (offset != h_offset) {
726
0
                    linearizationWarning(
727
0
                        "first shared object offset mismatch: hint table = " +
728
0
                        std::to_string(h_offset) + "; computed = " + std::to_string(offset));
729
0
                }
730
0
            }
731
732
0
            idx_to_obj[i] = cur_object;
733
0
            HSharedObjectEntry& se = so.entries.at(toS(i));
734
0
            int nobjects = se.nobjects_minus_one + 1;
735
0
            int length = lengthNextN(cur_object, nobjects);
736
0
            int h_length = so.min_group_length + se.delta_group_length;
737
0
            if (length != h_length) {
738
0
                linearizationWarning(
739
0
                    "shared object " + std::to_string(i) + " length mismatch: hint table = " +
740
0
                    std::to_string(h_length) + "; computed = " + std::to_string(length));
741
0
            }
742
0
            cur_object += nobjects;
743
0
        }
744
0
    }
745
0
}
746
747
void
748
Lin::checkHOutlines()
749
0
{
750
    // Empirically, Acrobat generates the correct value for the object number but incorrectly stores
751
    // the next object number's offset as the offset, at least when outlines appear in part 6.  It
752
    // also generates an incorrect value for length (specifically, the length that would cover the
753
    // correct number of objects from the wrong starting place).  pdlin appears to generate correct
754
    // values in those cases.
755
756
0
    if (m->c_outline_data.nobjects == m->outline_hints.nobjects) {
757
0
        if (m->c_outline_data.nobjects == 0) {
758
0
            return;
759
0
        }
760
761
0
        if (m->c_outline_data.first_object == m->outline_hints.first_object) {
762
            // Check length and offset.  Acrobat gets these wrong.
763
0
            QPDFObjectHandle outlines = qpdf.getRoot().getKey("/Outlines");
764
0
            if (!outlines.isIndirect()) {
765
                // This case is not exercised in test suite since not permitted by the spec, but if
766
                // this does occur, the code below would fail.
767
0
                linearizationWarning("/Outlines key of root dictionary is not indirect");
768
0
                return;
769
0
            }
770
0
            QPDFObjGen og(outlines.getObjGen());
771
0
            qpdf.no_ci_stop_if(
772
0
                !m->xref_table.contains(og), "unknown object in outlines hint table" //
773
0
            );
774
0
            qpdf_offset_t offset = getLinearizationOffset(og);
775
0
            ObjUser ou(ObjUser::ou_root_key, "/Outlines");
776
0
            int length = toI(maxEnd(ou) - offset);
777
0
            qpdf_offset_t table_offset = adjusted_offset(m->outline_hints.first_object_offset);
778
0
            if (offset != table_offset) {
779
0
                linearizationWarning(
780
0
                    "incorrect offset in outlines table: hint table = " +
781
0
                    std::to_string(table_offset) + "; computed = " + std::to_string(offset));
782
0
            }
783
0
            int table_length = m->outline_hints.group_length;
784
0
            if (length != table_length) {
785
0
                linearizationWarning(
786
0
                    "incorrect length in outlines table: hint table = " +
787
0
                    std::to_string(table_length) + "; computed = " + std::to_string(length));
788
0
            }
789
0
        } else {
790
0
            linearizationWarning("incorrect first object number in outline hints table.");
791
0
        }
792
0
    } else {
793
0
        linearizationWarning("incorrect object count in outline hint table");
794
0
    }
795
0
}
796
797
void
798
QPDF::showLinearizationData()
799
0
{
800
0
    try {
801
0
        m->lin.readLinearizationData();
802
0
        m->lin.checkLinearizationInternal();
803
0
        m->lin.dumpLinearizationDataInternal();
804
0
    } catch (QPDFExc& e) {
805
0
        m->lin.linearizationWarning(e.what());
806
0
    }
807
0
}
808
809
void
810
Lin::dumpLinearizationDataInternal()
811
0
{
812
0
    *m->log->getInfo() << m->file->getName() << ": linearization data:\n\n";
813
814
0
    *m->log->getInfo() << "file_size: " << m->linp.file_size << "\n"
815
0
                       << "first_page_object: " << m->linp.first_page_object << "\n"
816
0
                       << "first_page_end: " << m->linp.first_page_end << "\n"
817
0
                       << "npages: " << m->linp.npages << "\n"
818
0
                       << "xref_zero_offset: " << m->linp.xref_zero_offset << "\n"
819
0
                       << "first_page: " << m->linp.first_page << "\n"
820
0
                       << "H_offset: " << m->linp.H_offset << "\n"
821
0
                       << "H_length: " << m->linp.H_length << "\n"
822
0
                       << "\n";
823
824
0
    *m->log->getInfo() << "Page Offsets Hint Table\n\n";
825
0
    dumpHPageOffset();
826
0
    *m->log->getInfo() << "\nShared Objects Hint Table\n\n";
827
0
    dumpHSharedObject();
828
829
0
    if (m->outline_hints.nobjects > 0) {
830
0
        *m->log->getInfo() << "\nOutlines Hint Table\n\n";
831
0
        dumpHGeneric(m->outline_hints);
832
0
    }
833
0
}
834
835
qpdf_offset_t
Lin::adjusted_offset(qpdf_offset_t offset)
{
    // Hint table location values disregard the hint table itself, so any offset at or beyond
    // H_offset has to be increased by H_length.  Offsets before H_offset are returned unchanged.
    if (offset < m->linp.H_offset) {
        return offset;
    }
    return offset + m->linp.H_length;
}
845
846
void
847
Lin::dumpHPageOffset()
848
0
{
849
0
    HPageOffset& t = m->page_offset_hints;
850
0
    *m->log->getInfo() << "min_nobjects: " << t.min_nobjects << "\n"
851
0
                       << "first_page_offset: " << adjusted_offset(t.first_page_offset) << "\n"
852
0
                       << "nbits_delta_nobjects: " << t.nbits_delta_nobjects << "\n"
853
0
                       << "min_page_length: " << t.min_page_length << "\n"
854
0
                       << "nbits_delta_page_length: " << t.nbits_delta_page_length << "\n"
855
0
                       << "min_content_offset: " << t.min_content_offset << "\n"
856
0
                       << "nbits_delta_content_offset: " << t.nbits_delta_content_offset << "\n"
857
0
                       << "min_content_length: " << t.min_content_length << "\n"
858
0
                       << "nbits_delta_content_length: " << t.nbits_delta_content_length << "\n"
859
0
                       << "nbits_nshared_objects: " << t.nbits_nshared_objects << "\n"
860
0
                       << "nbits_shared_identifier: " << t.nbits_shared_identifier << "\n"
861
0
                       << "nbits_shared_numerator: " << t.nbits_shared_numerator << "\n"
862
0
                       << "shared_denominator: " << t.shared_denominator << "\n";
863
864
0
    for (size_t i1 = 0; i1 < m->linp.npages; ++i1) {
865
0
        HPageOffsetEntry& pe = t.entries.at(i1);
866
0
        *m->log->getInfo() << "Page " << i1 << ":\n"
867
0
                           << "  nobjects: " << pe.delta_nobjects + t.min_nobjects << "\n"
868
0
                           << "  length: " << pe.delta_page_length + t.min_page_length
869
0
                           << "\n"
870
                           // content offset is relative to page, not file
871
0
                           << "  content_offset: " << pe.delta_content_offset + t.min_content_offset
872
0
                           << "\n"
873
0
                           << "  content_length: " << pe.delta_content_length + t.min_content_length
874
0
                           << "\n"
875
0
                           << "  nshared_objects: " << pe.nshared_objects << "\n";
876
0
        for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2) {
877
0
            *m->log->getInfo() << "    identifier " << i2 << ": " << pe.shared_identifiers.at(i2)
878
0
                               << "\n";
879
0
            *m->log->getInfo() << "    numerator " << i2 << ": " << pe.shared_numerators.at(i2)
880
0
                               << "\n";
881
0
        }
882
0
    }
883
0
}
884
885
void
886
Lin::dumpHSharedObject()
887
0
{
888
0
    HSharedObject& t = m->shared_object_hints;
889
0
    *m->log->getInfo() << "first_shared_obj: " << t.first_shared_obj << "\n"
890
0
                       << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) << "\n"
891
0
                       << "nshared_first_page: " << t.nshared_first_page << "\n"
892
0
                       << "nshared_total: " << t.nshared_total << "\n"
893
0
                       << "nbits_nobjects: " << t.nbits_nobjects << "\n"
894
0
                       << "min_group_length: " << t.min_group_length << "\n"
895
0
                       << "nbits_delta_group_length: " << t.nbits_delta_group_length << "\n";
896
897
0
    for (size_t i = 0; i < toS(t.nshared_total); ++i) {
898
0
        HSharedObjectEntry& se = t.entries.at(i);
899
0
        *m->log->getInfo() << "Shared Object " << i << ":\n"
900
0
                           << "  group length: " << se.delta_group_length + t.min_group_length
901
0
                           << "\n";
902
        // PDF spec says signature present nobjects_minus_one are always 0, so print them only if
903
        // they have a non-zero value.
904
0
        if (se.signature_present) {
905
0
            *m->log->getInfo() << "  signature present\n";
906
0
        }
907
0
        if (se.nobjects_minus_one != 0) {
908
0
            *m->log->getInfo() << "  nobjects: " << se.nobjects_minus_one + 1 << "\n";
909
0
        }
910
0
    }
911
0
}
912
913
void
914
Lin::dumpHGeneric(HGeneric& t)
915
0
{
916
0
    *m->log->getInfo() << "first_object: " << t.first_object << "\n"
917
0
                       << "first_object_offset: " << adjusted_offset(t.first_object_offset) << "\n"
918
0
                       << "nobjects: " << t.nobjects << "\n"
919
0
                       << "group_length: " << t.group_length << "\n";
920
0
}
921
922
template <typename T>
923
void
924
Lin::calculateLinearizationData(T const& object_stream_data)
925
37.7k
{
926
    // This function calculates the ordering of objects, divides them into the appropriate parts,
927
    // and computes some values for the linearization parameter dictionary and hint tables.  The
928
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
929
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
930
931
37.7k
    util::assertion(
932
37.7k
        !m->object_to_obj_users.empty(),
933
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()" //
934
37.7k
    );
935
    // Note that we can't call optimize here because we don't know whether it should be called
936
    // with or without allow changes.
937
938
    // Separate objects into the categories sufficient for us to determine which part of the
939
    // linearized file should contain the object.  This categorization is useful for other purposes
940
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
941
942
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
943
    // trailer dictionary in part 11).
944
945
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
946
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
947
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
948
    // for now.
949
950
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
951
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
952
    // /Outlines, then information from /Outlines also appears here.
953
954
    // Part 7 contains remaining objects private to pages other than the first page.
955
956
    // Part 8 contains all remaining shared objects except those that are shared only within
957
    // thumbnails.
958
959
    // Part 9 contains all remaining objects.
960
961
    // We sort objects into the following categories:
962
963
    //   * open_document: part 4
964
965
    //   * first_page_private: part 6
966
967
    //   * first_page_shared: part 6
968
969
    //   * other_page_private: part 7
970
971
    //   * other_page_shared: part 8
972
973
    //   * thumbnail_private: part 9
974
975
    //   * thumbnail_shared: part 9
976
977
    //   * other: part 9
978
979
    //   * outlines: part 6 or 9
980
981
37.7k
    m->part4.clear();
982
37.7k
    m->part6.clear();
983
37.7k
    m->part7.clear();
984
37.7k
    m->part8.clear();
985
37.7k
    m->part9.clear();
986
37.7k
    m->c_linp = LinParameters();
987
37.7k
    m->c_page_offset_data = CHPageOffset();
988
37.7k
    m->c_shared_object_data = CHSharedObject();
989
37.7k
    m->c_outline_data = HGeneric();
990
991
37.7k
    QPDFObjectHandle root = qpdf.getRoot();
992
37.7k
    bool outlines_in_first_page = false;
993
37.7k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
994
37.7k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
995
37.7k
    if (pagemode.isName()) {
996
1.17k
        if (pagemode.getName() == "/UseOutlines") {
997
648
            if (root.hasKey("/Outlines")) {
998
203
                outlines_in_first_page = true;
999
445
            } else {
1000
445
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1001
445
            }
1002
648
        }
1003
1.17k
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1004
1.17k
    }
1005
1006
37.7k
    std::set<std::string> open_document_keys;
1007
37.7k
    open_document_keys.insert("/ViewerPreferences");
1008
37.7k
    open_document_keys.insert("/PageMode");
1009
37.7k
    open_document_keys.insert("/Threads");
1010
37.7k
    open_document_keys.insert("/OpenAction");
1011
37.7k
    open_document_keys.insert("/AcroForm");
1012
1013
37.7k
    std::set<QPDFObjGen> lc_open_document;
1014
37.7k
    std::set<QPDFObjGen> lc_first_page_private;
1015
37.7k
    std::set<QPDFObjGen> lc_first_page_shared;
1016
37.7k
    std::set<QPDFObjGen> lc_other_page_private;
1017
37.7k
    std::set<QPDFObjGen> lc_other_page_shared;
1018
37.7k
    std::set<QPDFObjGen> lc_thumbnail_private;
1019
37.7k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1020
37.7k
    std::set<QPDFObjGen> lc_other;
1021
37.7k
    std::set<QPDFObjGen> lc_outlines;
1022
37.7k
    std::set<QPDFObjGen> lc_root;
1023
1024
455k
    for (auto& oiter: m->object_to_obj_users) {
1025
455k
        QPDFObjGen const& og = oiter.first;
1026
455k
        std::set<ObjUser>& ous = oiter.second;
1027
1028
455k
        bool in_open_document = false;
1029
455k
        bool in_first_page = false;
1030
455k
        int other_pages = 0;
1031
455k
        int thumbs = 0;
1032
455k
        int others = 0;
1033
455k
        bool in_outlines = false;
1034
455k
        bool is_root = false;
1035
1036
912k
        for (auto const& ou: ous) {
1037
912k
            switch (ou.ou_type) {
1038
66.0k
            case ObjUser::ou_trailer_key:
1039
66.0k
                if (ou.key == "/Encrypt") {
1040
2.26k
                    in_open_document = true;
1041
63.7k
                } else {
1042
63.7k
                    ++others;
1043
63.7k
                }
1044
66.0k
                break;
1045
1046
21.5k
            case ObjUser::ou_thumb:
1047
21.5k
                ++thumbs;
1048
21.5k
                break;
1049
1050
287k
            case ObjUser::ou_root_key:
1051
287k
                if (open_document_keys.contains(ou.key)) {
1052
32.8k
                    in_open_document = true;
1053
254k
                } else if (ou.key == "/Outlines") {
1054
9.15k
                    in_outlines = true;
1055
245k
                } else {
1056
245k
                    ++others;
1057
245k
                }
1058
287k
                break;
1059
1060
500k
            case ObjUser::ou_page:
1061
500k
                if (ou.pageno == 0) {
1062
210k
                    in_first_page = true;
1063
289k
                } else {
1064
289k
                    ++other_pages;
1065
289k
                }
1066
500k
                break;
1067
1068
37.7k
            case ObjUser::ou_root:
1069
37.7k
                is_root = true;
1070
37.7k
                break;
1071
912k
            }
1072
912k
        }
1073
1074
455k
        if (is_root) {
1075
37.7k
            lc_root.insert(og);
1076
417k
        } else if (in_outlines) {
1077
9.06k
            lc_outlines.insert(og);
1078
408k
        } else if (in_open_document) {
1079
34.9k
            lc_open_document.insert(og);
1080
373k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1081
154k
            lc_first_page_private.insert(og);
1082
218k
        } else if (in_first_page) {
1083
42.8k
            lc_first_page_shared.insert(og);
1084
175k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1085
53.7k
            lc_other_page_private.insert(og);
1086
121k
        } else if (other_pages > 1) {
1087
15.1k
            lc_other_page_shared.insert(og);
1088
106k
        } else if ((thumbs == 1) && (others == 0)) {
1089
7.22k
            lc_thumbnail_private.insert(og);
1090
99.5k
        } else if (thumbs > 1) {
1091
3.83k
            lc_thumbnail_shared.insert(og);
1092
95.7k
        } else {
1093
95.7k
            lc_other.insert(og);
1094
95.7k
        }
1095
455k
    }
1096
1097
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1098
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1099
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1100
    // and the above section can be considered only with categorization.  Note that sets of
1101
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1102
    // the possible exception of hints tables which we won't see here anyway.  That means that
1103
    // running calculateLinearizationData() on a linearized file should give results identical to
1104
    // the original file ordering.
1105
1106
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1107
    // code optimization if necessary. Premature optimization is the root of all evil.
1108
37.7k
    std::vector<QPDFObjectHandle> pages;
1109
37.7k
    { // local scope
1110
        // Map all page objects to the containing object stream.  This should be a no-op in a
1111
        // properly linearized file.
1112
60.3k
        for (auto oh: qpdf.getAllPages()) {
1113
60.3k
            pages.emplace_back(getUncompressedObject(oh, object_stream_data));
1114
60.3k
        }
1115
37.7k
    }
1116
37.7k
    size_t npages = pages.size();
1117
1118
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1119
    // initialize any items that deal with object numbers or counts but not any items that deal with
1120
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1121
    // during the first pass.  The validation code can compute them relatively easily given the rest
1122
    // of the information.
1123
1124
    // npages is the size of the existing pages vector, which has been created by traversing the
1125
    // pages tree, and as such is a reasonable size.
1126
37.7k
    m->c_linp.npages = npages;
1127
37.7k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(npages);
1128
1129
    // Part 4: open document objects.  We don't care about the order.
1130
1131
37.7k
    qpdf.no_ci_stop_if(
1132
37.7k
        lc_root.size() != 1, "found other than one root while calculating linearization data" //
1133
37.7k
    );
1134
1135
37.7k
    m->part4.emplace_back(qpdf.getObject(*(lc_root.begin())));
1136
37.7k
    for (auto const& og: lc_open_document) {
1137
34.9k
        m->part4.emplace_back(qpdf.getObject(og));
1138
34.9k
    }
1139
1140
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1141
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1142
    // any option to set this and also disregards /OpenAction.  We will do the same.
1143
1144
    // First, place the actual first page object itself.
1145
37.7k
    qpdf.no_ci_stop_if(
1146
37.7k
        pages.empty(), "no pages found while calculating linearization data" //
1147
37.7k
    );
1148
37.7k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1149
37.7k
    qpdf.no_ci_stop_if(
1150
37.7k
        !lc_first_page_private.erase(first_page_og), "unable to linearize first page" //
1151
37.7k
    );
1152
37.7k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1153
37.7k
    m->part6.emplace_back(pages.at(0));
1154
1155
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1156
    // it except to the extent that it groups private and shared objects contiguously for the sake
1157
    // of hint tables.
1158
1159
120k
    for (auto const& og: lc_first_page_private) {
1160
120k
        m->part6.emplace_back(qpdf.getObject(og));
1161
120k
    }
1162
1163
37.7k
    for (auto const& og: lc_first_page_shared) {
1164
30.5k
        m->part6.emplace_back(qpdf.getObject(og));
1165
30.5k
    }
1166
1167
    // Place the outline dictionary if it goes in the first page section.
1168
37.7k
    if (outlines_in_first_page) {
1169
199
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1170
199
    }
1171
1172
    // Fill in page offset hint table information for the first page. The PDF spec says that
1173
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1174
    // it fills in garbage values for all the shared object identifiers on the first page.
1175
1176
37.7k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1177
1178
    // Part 7: other pages' private objects
1179
1180
    // For each page in order:
1181
60.0k
    for (size_t i = 1; i < npages; ++i) {
1182
        // Place this page's page object
1183
1184
22.2k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1185
22.2k
        qpdf.no_ci_stop_if(
1186
22.2k
            !lc_other_page_private.erase(page_og),
1187
22.2k
            "unable to linearize page " + std::to_string(i) //
1188
22.2k
        );
1189
1190
22.2k
        m->part7.emplace_back(pages.at(i));
1191
1192
        // Place all non-shared objects referenced by this page, updating the page object count for
1193
        // the hint table.
1194
1195
22.2k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1196
1197
22.2k
        ObjUser ou(ObjUser::ou_page, i);
1198
22.2k
        qpdf.no_ci_stop_if(
1199
22.2k
            !m->obj_user_to_objects.contains(ou),
1200
22.2k
            "found unreferenced page while calculating linearization data" //
1201
22.2k
        );
1202
1203
286k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1204
286k
            if (lc_other_page_private.erase(og)) {
1205
31.1k
                m->part7.emplace_back(qpdf.getObject(og));
1206
31.1k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1207
31.1k
            }
1208
286k
        }
1209
22.2k
    }
1210
    // That should have covered all part7 objects.
1211
37.7k
    util::assertion(
1212
37.7k
        lc_other_page_private.empty(),
1213
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_other_page_private is not empty "
1214
37.7k
        "after generation of part7" //
1215
37.7k
    );
1216
1217
    // Part 8: other pages' shared objects
1218
1219
    // Order is unimportant.
1220
37.7k
    for (auto const& og: lc_other_page_shared) {
1221
15.0k
        m->part8.emplace_back(qpdf.getObject(og));
1222
15.0k
    }
1223
1224
    // Part 9: other objects
1225
1226
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1227
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1228
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1229
    // we throw all remaining objects in arbitrary order.
1230
1231
    // Place the pages tree.
1232
37.7k
    std::set<QPDFObjGen> pages_ogs =
1233
37.7k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1234
37.7k
    qpdf.no_ci_stop_if(
1235
37.7k
        pages_ogs.empty(), "found empty pages tree while calculating linearization data" //
1236
37.7k
    );
1237
60.5k
    for (auto const& og: pages_ogs) {
1238
60.5k
        if (lc_other.erase(og)) {
1239
47.0k
            m->part9.emplace_back(qpdf.getObject(og));
1240
47.0k
        }
1241
60.5k
    }
1242
1243
    // Place private thumbnail images in page order.  Slightly more information would be required if
1244
    // we were going to bother with thumbnail hint tables.
1245
93.4k
    for (size_t i = 0; i < npages; ++i) {
1246
55.7k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1247
55.7k
        thumb = getUncompressedObject(thumb, object_stream_data);
1248
55.7k
        QPDFObjGen thumb_og(thumb.getObjGen());
1249
        // Output the thumbnail itself
1250
55.7k
        if (lc_thumbnail_private.erase(thumb_og) && !thumb.null()) {
1251
1.72k
            m->part9.emplace_back(thumb);
1252
53.9k
        } else {
1253
            // No internal error this time...there's nothing to stop this object from having
1254
            // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1255
            // there's nothing to prevent it from having been in some set other than
1256
            // lc_thumbnail_private.
1257
53.9k
        }
1258
55.7k
        std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, i)];
1259
55.7k
        for (auto const& og: ogs) {
1260
20.1k
            if (lc_thumbnail_private.erase(og)) {
1261
5.05k
                m->part9.emplace_back(qpdf.getObject(og));
1262
5.05k
            }
1263
20.1k
        }
1264
55.7k
    }
1265
37.7k
    util::assertion(
1266
37.7k
        lc_thumbnail_private.empty(),
1267
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1268
37.7k
        "empty after placing thumbnails" //
1269
37.7k
    );
1270
1271
    // Place shared thumbnail objects
1272
37.7k
    for (auto const& og: lc_thumbnail_shared) {
1273
3.54k
        m->part9.emplace_back(qpdf.getObject(og));
1274
3.54k
    }
1275
1276
    // Place outlines unless in first page
1277
37.7k
    if (!outlines_in_first_page) {
1278
33.9k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1279
33.9k
    }
1280
1281
    // Place all remaining objects
1282
46.4k
    for (auto const& og: lc_other) {
1283
46.4k
        m->part9.emplace_back(qpdf.getObject(og));
1284
46.4k
    }
1285
1286
    // Make sure we got everything exactly once.
1287
1288
37.7k
    size_t num_placed =
1289
37.7k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1290
37.7k
    size_t num_wanted = m->object_to_obj_users.size();
1291
37.7k
    qpdf.no_ci_stop_if(
1292
        // This can happen with damaged files, e.g. if the root is part of the the pages tree.
1293
37.7k
        num_placed != num_wanted,
1294
37.7k
        "QPDF::calculateLinearizationData: wrong number of objects placed (num_placed = " +
1295
37.7k
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted) +
1296
37.7k
            "\nIf the file did not generate any other warnings please report this as a bug." //
1297
37.7k
    );
1298
1299
    // Calculate shared object hint table information including references to shared objects from
1300
    // page offset hint data.
1301
1302
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1303
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1304
    // object number to index.  Then populate the shared object information for the pages.
1305
1306
    // Note that two objects never have the same object number, so we can map from object number
1307
    // only without regards to generation.
1308
37.7k
    std::map<int, int> obj_to_index;
1309
1310
37.7k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1311
37.7k
    m->c_shared_object_data.nshared_total =
1312
37.7k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1313
1314
37.7k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1315
187k
    for (auto& oh: m->part6) {
1316
187k
        int obj = oh.getObjectID();
1317
187k
        obj_to_index[obj] = toI(shared.size());
1318
187k
        shared.emplace_back(obj);
1319
187k
    }
1320
37.7k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1321
37.7k
    if (!m->part8.empty()) {
1322
463
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1323
14.6k
        for (auto& oh: m->part8) {
1324
14.6k
            int obj = oh.getObjectID();
1325
14.6k
            obj_to_index[obj] = toI(shared.size());
1326
14.6k
            shared.emplace_back(obj);
1327
14.6k
        }
1328
463
    }
1329
37.7k
    qpdf.no_ci_stop_if(
1330
37.7k
        std::cmp_not_equal(
1331
37.7k
            m->c_shared_object_data.nshared_total, m->c_shared_object_data.entries.size()),
1332
37.7k
        "shared object hint table has wrong number of entries" //
1333
37.7k
    );
1334
1335
    // Now compute the list of shared objects for each page after the first page.
1336
1337
59.2k
    for (size_t i = 1; i < npages; ++i) {
1338
21.5k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1339
21.5k
        ObjUser ou(ObjUser::ou_page, i);
1340
21.5k
        qpdf.no_ci_stop_if(
1341
21.5k
            !m->obj_user_to_objects.contains(ou),
1342
21.5k
            "found unreferenced page while calculating linearization data" //
1343
21.5k
        );
1344
1345
276k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1346
276k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.contains(og.getObj()))) {
1347
217k
                int idx = obj_to_index[og.getObj()];
1348
217k
                ++pe.nshared_objects;
1349
217k
                pe.shared_identifiers.push_back(idx);
1350
217k
            }
1351
276k
        }
1352
21.5k
    }
1353
37.7k
}
Unexecuted instantiation: void QPDF::Doc::Linearization::calculateLinearizationData<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::Doc::Linearization::calculateLinearizationData<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&)
Line
Count
Source
925
37.7k
{
926
    // This function calculates the ordering of objects, divides them into the appropriate parts,
927
    // and computes some values for the linearization parameter dictionary and hint tables.  The
928
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
929
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
930
931
37.7k
    util::assertion(
932
37.7k
        !m->object_to_obj_users.empty(),
933
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()" //
934
37.7k
    );
935
    // Note that we can't call optimize here because we don't know whether it should be called
936
    // with or without allow changes.
937
938
    // Separate objects into the categories sufficient for us to determine which part of the
939
    // linearized file should contain the object.  This categorization is useful for other purposes
940
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
941
942
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
943
    // trailer dictionary in part 11).
944
945
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
946
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
947
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
948
    // for now.
949
950
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
951
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
952
    // /Outlines, then information from /Outlines also appears here.
953
954
    // Part 7 contains remaining objects private to pages other than the first page.
955
956
    // Part 8 contains all remaining shared objects except those that are shared only within
957
    // thumbnails.
958
959
    // Part 9 contains all remaining objects.
960
961
    // We sort objects into the following categories:
962
963
    //   * open_document: part 4
964
965
    //   * first_page_private: part 6
966
967
    //   * first_page_shared: part 6
968
969
    //   * other_page_private: part 7
970
971
    //   * other_page_shared: part 8
972
973
    //   * thumbnail_private: part 9
974
975
    //   * thumbnail_shared: part 9
976
977
    //   * other: part 9
978
979
    //   * outlines: part 6 or 9
980
981
37.7k
    m->part4.clear();
982
37.7k
    m->part6.clear();
983
37.7k
    m->part7.clear();
984
37.7k
    m->part8.clear();
985
37.7k
    m->part9.clear();
986
37.7k
    m->c_linp = LinParameters();
987
37.7k
    m->c_page_offset_data = CHPageOffset();
988
37.7k
    m->c_shared_object_data = CHSharedObject();
989
37.7k
    m->c_outline_data = HGeneric();
990
991
37.7k
    QPDFObjectHandle root = qpdf.getRoot();
992
37.7k
    bool outlines_in_first_page = false;
993
37.7k
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
994
37.7k
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
995
37.7k
    if (pagemode.isName()) {
996
1.17k
        if (pagemode.getName() == "/UseOutlines") {
997
648
            if (root.hasKey("/Outlines")) {
998
203
                outlines_in_first_page = true;
999
445
            } else {
1000
445
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1001
445
            }
1002
648
        }
1003
1.17k
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1004
1.17k
    }
1005
1006
37.7k
    std::set<std::string> open_document_keys;
1007
37.7k
    open_document_keys.insert("/ViewerPreferences");
1008
37.7k
    open_document_keys.insert("/PageMode");
1009
37.7k
    open_document_keys.insert("/Threads");
1010
37.7k
    open_document_keys.insert("/OpenAction");
1011
37.7k
    open_document_keys.insert("/AcroForm");
1012
1013
37.7k
    std::set<QPDFObjGen> lc_open_document;
1014
37.7k
    std::set<QPDFObjGen> lc_first_page_private;
1015
37.7k
    std::set<QPDFObjGen> lc_first_page_shared;
1016
37.7k
    std::set<QPDFObjGen> lc_other_page_private;
1017
37.7k
    std::set<QPDFObjGen> lc_other_page_shared;
1018
37.7k
    std::set<QPDFObjGen> lc_thumbnail_private;
1019
37.7k
    std::set<QPDFObjGen> lc_thumbnail_shared;
1020
37.7k
    std::set<QPDFObjGen> lc_other;
1021
37.7k
    std::set<QPDFObjGen> lc_outlines;
1022
37.7k
    std::set<QPDFObjGen> lc_root;
1023
1024
455k
    for (auto& oiter: m->object_to_obj_users) {
1025
455k
        QPDFObjGen const& og = oiter.first;
1026
455k
        std::set<ObjUser>& ous = oiter.second;
1027
1028
455k
        bool in_open_document = false;
1029
455k
        bool in_first_page = false;
1030
455k
        int other_pages = 0;
1031
455k
        int thumbs = 0;
1032
455k
        int others = 0;
1033
455k
        bool in_outlines = false;
1034
455k
        bool is_root = false;
1035
1036
912k
        for (auto const& ou: ous) {
1037
912k
            switch (ou.ou_type) {
1038
66.0k
            case ObjUser::ou_trailer_key:
1039
66.0k
                if (ou.key == "/Encrypt") {
1040
2.26k
                    in_open_document = true;
1041
63.7k
                } else {
1042
63.7k
                    ++others;
1043
63.7k
                }
1044
66.0k
                break;
1045
1046
21.5k
            case ObjUser::ou_thumb:
1047
21.5k
                ++thumbs;
1048
21.5k
                break;
1049
1050
287k
            case ObjUser::ou_root_key:
1051
287k
                if (open_document_keys.contains(ou.key)) {
1052
32.8k
                    in_open_document = true;
1053
254k
                } else if (ou.key == "/Outlines") {
1054
9.15k
                    in_outlines = true;
1055
245k
                } else {
1056
245k
                    ++others;
1057
245k
                }
1058
287k
                break;
1059
1060
500k
            case ObjUser::ou_page:
1061
500k
                if (ou.pageno == 0) {
1062
210k
                    in_first_page = true;
1063
289k
                } else {
1064
289k
                    ++other_pages;
1065
289k
                }
1066
500k
                break;
1067
1068
37.7k
            case ObjUser::ou_root:
1069
37.7k
                is_root = true;
1070
37.7k
                break;
1071
912k
            }
1072
912k
        }
1073
1074
455k
        if (is_root) {
1075
37.7k
            lc_root.insert(og);
1076
417k
        } else if (in_outlines) {
1077
9.06k
            lc_outlines.insert(og);
1078
408k
        } else if (in_open_document) {
1079
34.9k
            lc_open_document.insert(og);
1080
373k
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1081
154k
            lc_first_page_private.insert(og);
1082
218k
        } else if (in_first_page) {
1083
42.8k
            lc_first_page_shared.insert(og);
1084
175k
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1085
53.7k
            lc_other_page_private.insert(og);
1086
121k
        } else if (other_pages > 1) {
1087
15.1k
            lc_other_page_shared.insert(og);
1088
106k
        } else if ((thumbs == 1) && (others == 0)) {
1089
7.22k
            lc_thumbnail_private.insert(og);
1090
99.5k
        } else if (thumbs > 1) {
1091
3.83k
            lc_thumbnail_shared.insert(og);
1092
95.7k
        } else {
1093
95.7k
            lc_other.insert(og);
1094
95.7k
        }
1095
455k
    }
1096
1097
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1098
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1099
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1100
    // and the above section can be considered only with categorization.  Note that sets of
1101
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1102
    // the possible exception of hints tables which we won't see here anyway.  That means that
1103
    // running calculateLinearizationData() on a linearized file should give results identical to
1104
    // the original file ordering.
1105
1106
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1107
    // code optimization if necessary. Premature optimization is the root of all evil.
1108
37.7k
    std::vector<QPDFObjectHandle> pages;
1109
37.7k
    { // local scope
1110
        // Map all page objects to the containing object stream.  This should be a no-op in a
1111
        // properly linearized file.
1112
60.3k
        for (auto oh: qpdf.getAllPages()) {
1113
60.3k
            pages.emplace_back(getUncompressedObject(oh, object_stream_data));
1114
60.3k
        }
1115
37.7k
    }
1116
37.7k
    size_t npages = pages.size();
1117
1118
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1119
    // initialize any items that deal with object numbers or counts but not any items that deal with
1120
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1121
    // during the first pass.  The validation code can compute them relatively easily given the rest
1122
    // of the information.
1123
1124
    // npages is the size of the existing pages vector, which has been created by traversing the
1125
    // pages tree, and as such is a reasonable size.
1126
37.7k
    m->c_linp.npages = npages;
1127
37.7k
    m->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(npages);
1128
1129
    // Part 4: open document objects.  We don't care about the order.
1130
1131
37.7k
    qpdf.no_ci_stop_if(
1132
37.7k
        lc_root.size() != 1, "found other than one root while calculating linearization data" //
1133
37.7k
    );
1134
1135
37.7k
    m->part4.emplace_back(qpdf.getObject(*(lc_root.begin())));
1136
37.7k
    for (auto const& og: lc_open_document) {
1137
34.9k
        m->part4.emplace_back(qpdf.getObject(og));
1138
34.9k
    }
1139
1140
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1141
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1142
    // any option to set this and also disregards /OpenAction.  We will do the same.
1143
1144
    // First, place the actual first page object itself.
1145
37.7k
    qpdf.no_ci_stop_if(
1146
37.7k
        pages.empty(), "no pages found while calculating linearization data" //
1147
37.7k
    );
1148
37.7k
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
1149
37.7k
    qpdf.no_ci_stop_if(
1150
37.7k
        !lc_first_page_private.erase(first_page_og), "unable to linearize first page" //
1151
37.7k
    );
1152
37.7k
    m->c_linp.first_page_object = pages.at(0).getObjectID();
1153
37.7k
    m->part6.emplace_back(pages.at(0));
1154
1155
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1156
    // it except to the extent that it groups private and shared objects contiguously for the sake
1157
    // of hint tables.
1158
1159
120k
    for (auto const& og: lc_first_page_private) {
1160
120k
        m->part6.emplace_back(qpdf.getObject(og));
1161
120k
    }
1162
1163
37.7k
    for (auto const& og: lc_first_page_shared) {
1164
30.5k
        m->part6.emplace_back(qpdf.getObject(og));
1165
30.5k
    }
1166
1167
    // Place the outline dictionary if it goes in the first page section.
1168
37.7k
    if (outlines_in_first_page) {
1169
199
        pushOutlinesToPart(m->part6, lc_outlines, object_stream_data);
1170
199
    }
1171
1172
    // Fill in page offset hint table information for the first page. The PDF spec says that
1173
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1174
    // it fills in garbage values for all the shared object identifiers on the first page.
1175
1176
37.7k
    m->c_page_offset_data.entries.at(0).nobjects = toI(m->part6.size());
1177
1178
    // Part 7: other pages' private objects
1179
1180
    // For each page in order:
1181
60.0k
    for (size_t i = 1; i < npages; ++i) {
1182
        // Place this page's page object
1183
1184
22.2k
        QPDFObjGen page_og(pages.at(i).getObjGen());
1185
22.2k
        qpdf.no_ci_stop_if(
1186
22.2k
            !lc_other_page_private.erase(page_og),
1187
22.2k
            "unable to linearize page " + std::to_string(i) //
1188
22.2k
        );
1189
1190
22.2k
        m->part7.emplace_back(pages.at(i));
1191
1192
        // Place all non-shared objects referenced by this page, updating the page object count for
1193
        // the hint table.
1194
1195
22.2k
        m->c_page_offset_data.entries.at(i).nobjects = 1;
1196
1197
22.2k
        ObjUser ou(ObjUser::ou_page, i);
1198
22.2k
        qpdf.no_ci_stop_if(
1199
22.2k
            !m->obj_user_to_objects.contains(ou),
1200
22.2k
            "found unreferenced page while calculating linearization data" //
1201
22.2k
        );
1202
1203
286k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1204
286k
            if (lc_other_page_private.erase(og)) {
1205
31.1k
                m->part7.emplace_back(qpdf.getObject(og));
1206
31.1k
                ++m->c_page_offset_data.entries.at(i).nobjects;
1207
31.1k
            }
1208
286k
        }
1209
22.2k
    }
1210
    // That should have covered all part7 objects.
1211
37.7k
    util::assertion(
1212
37.7k
        lc_other_page_private.empty(),
1213
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_other_page_private is not empty "
1214
37.7k
        "after generation of part7" //
1215
37.7k
    );
1216
1217
    // Part 8: other pages' shared objects
1218
1219
    // Order is unimportant.
1220
37.7k
    for (auto const& og: lc_other_page_shared) {
1221
15.0k
        m->part8.emplace_back(qpdf.getObject(og));
1222
15.0k
    }
1223
1224
    // Part 9: other objects
1225
1226
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1227
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1228
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1229
    // we throw all remaining objects in arbitrary order.
1230
1231
    // Place the pages tree.
1232
37.7k
    std::set<QPDFObjGen> pages_ogs =
1233
37.7k
        m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1234
37.7k
    qpdf.no_ci_stop_if(
1235
37.7k
        pages_ogs.empty(), "found empty pages tree while calculating linearization data" //
1236
37.7k
    );
1237
60.5k
    for (auto const& og: pages_ogs) {
1238
60.5k
        if (lc_other.erase(og)) {
1239
47.0k
            m->part9.emplace_back(qpdf.getObject(og));
1240
47.0k
        }
1241
60.5k
    }
1242
1243
    // Place private thumbnail images in page order.  Slightly more information would be required if
1244
    // we were going to bother with thumbnail hint tables.
1245
93.4k
    for (size_t i = 0; i < npages; ++i) {
1246
55.7k
        QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1247
55.7k
        thumb = getUncompressedObject(thumb, object_stream_data);
1248
55.7k
        QPDFObjGen thumb_og(thumb.getObjGen());
1249
        // Output the thumbnail itself
1250
55.7k
        if (lc_thumbnail_private.erase(thumb_og) && !thumb.null()) {
1251
1.72k
            m->part9.emplace_back(thumb);
1252
53.9k
        } else {
1253
            // No internal error this time...there's nothing to stop this object from having
1254
            // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1255
            // there's nothing to prevent it from having been in some set other than
1256
            // lc_thumbnail_private.
1257
53.9k
        }
1258
55.7k
        std::set<QPDFObjGen>& ogs = m->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, i)];
1259
55.7k
        for (auto const& og: ogs) {
1260
20.1k
            if (lc_thumbnail_private.erase(og)) {
1261
5.05k
                m->part9.emplace_back(qpdf.getObject(og));
1262
5.05k
            }
1263
20.1k
        }
1264
55.7k
    }
1265
37.7k
    util::assertion(
1266
37.7k
        lc_thumbnail_private.empty(),
1267
37.7k
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1268
37.7k
        "empty after placing thumbnails" //
1269
37.7k
    );
1270
1271
    // Place shared thumbnail objects
1272
37.7k
    for (auto const& og: lc_thumbnail_shared) {
1273
3.54k
        m->part9.emplace_back(qpdf.getObject(og));
1274
3.54k
    }
1275
1276
    // Place outlines unless in first page
1277
37.7k
    if (!outlines_in_first_page) {
1278
33.9k
        pushOutlinesToPart(m->part9, lc_outlines, object_stream_data);
1279
33.9k
    }
1280
1281
    // Place all remaining objects
1282
46.4k
    for (auto const& og: lc_other) {
1283
46.4k
        m->part9.emplace_back(qpdf.getObject(og));
1284
46.4k
    }
1285
1286
    // Make sure we got everything exactly once.
1287
1288
37.7k
    size_t num_placed =
1289
37.7k
        m->part4.size() + m->part6.size() + m->part7.size() + m->part8.size() + m->part9.size();
1290
37.7k
    size_t num_wanted = m->object_to_obj_users.size();
1291
37.7k
    qpdf.no_ci_stop_if(
1292
        // This can happen with damaged files, e.g. if the root is part of the pages tree.
1293
37.7k
        num_placed != num_wanted,
1294
37.7k
        "QPDF::calculateLinearizationData: wrong number of objects placed (num_placed = " +
1295
37.7k
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted) +
1296
37.7k
            "\nIf the file did not generate any other warnings please report this as a bug." //
1297
37.7k
    );
1298
1299
    // Calculate shared object hint table information including references to shared objects from
1300
    // page offset hint data.
1301
1302
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1303
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1304
    // object number to index.  Then populate the shared object information for the pages.
1305
1306
    // Note that two objects never have the same object number, so we can map from object number
1307
    // only without regards to generation.
1308
37.7k
    std::map<int, int> obj_to_index;
1309
1310
37.7k
    m->c_shared_object_data.nshared_first_page = toI(m->part6.size());
1311
37.7k
    m->c_shared_object_data.nshared_total =
1312
37.7k
        m->c_shared_object_data.nshared_first_page + toI(m->part8.size());
1313
1314
37.7k
    std::vector<CHSharedObjectEntry>& shared = m->c_shared_object_data.entries;
1315
187k
    for (auto& oh: m->part6) {
1316
187k
        int obj = oh.getObjectID();
1317
187k
        obj_to_index[obj] = toI(shared.size());
1318
187k
        shared.emplace_back(obj);
1319
187k
    }
1320
37.7k
    QTC::TC("qpdf", "QPDF lin part 8 empty", m->part8.empty() ? 1 : 0);
1321
37.7k
    if (!m->part8.empty()) {
1322
463
        m->c_shared_object_data.first_shared_obj = m->part8.at(0).getObjectID();
1323
14.6k
        for (auto& oh: m->part8) {
1324
14.6k
            int obj = oh.getObjectID();
1325
14.6k
            obj_to_index[obj] = toI(shared.size());
1326
14.6k
            shared.emplace_back(obj);
1327
14.6k
        }
1328
463
    }
1329
37.7k
    qpdf.no_ci_stop_if(
1330
37.7k
        std::cmp_not_equal(
1331
37.7k
            m->c_shared_object_data.nshared_total, m->c_shared_object_data.entries.size()),
1332
37.7k
        "shared object hint table has wrong number of entries" //
1333
37.7k
    );
1334
1335
    // Now compute the list of shared objects for each page after the first page.
1336
1337
59.2k
    for (size_t i = 1; i < npages; ++i) {
1338
21.5k
        CHPageOffsetEntry& pe = m->c_page_offset_data.entries.at(i);
1339
21.5k
        ObjUser ou(ObjUser::ou_page, i);
1340
21.5k
        qpdf.no_ci_stop_if(
1341
21.5k
            !m->obj_user_to_objects.contains(ou),
1342
21.5k
            "found unreferenced page while calculating linearization data" //
1343
21.5k
        );
1344
1345
276k
        for (auto const& og: m->obj_user_to_objects[ou]) {
1346
276k
            if ((m->object_to_obj_users[og].size() > 1) && (obj_to_index.contains(og.getObj()))) {
1347
217k
                int idx = obj_to_index[og.getObj()];
1348
217k
                ++pe.nshared_objects;
1349
217k
                pe.shared_identifiers.push_back(idx);
1350
217k
            }
1351
276k
        }
1352
21.5k
    }
1353
37.7k
}
1354
1355
template <typename T>
1356
void
1357
Lin::pushOutlinesToPart(
1358
    std::vector<QPDFObjectHandle>& part,
1359
    std::set<QPDFObjGen>& lc_outlines,
1360
    T const& object_stream_data)
1361
34.1k
{
1362
34.1k
    QPDFObjectHandle root = qpdf.getRoot();
1363
34.1k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1364
34.1k
    if (outlines.null()) {
1365
33.0k
        return;
1366
33.0k
    }
1367
1.01k
    outlines = getUncompressedObject(outlines, object_stream_data);
1368
1.01k
    QPDFObjGen outlines_og(outlines.getObjGen());
1369
1.01k
    QTC::TC(
1370
1.01k
        "qpdf",
1371
1.01k
        "QPDF lin outlines in part",
1372
1.01k
        &part == &m->part6         ? 0
1373
1.01k
            : (&part == &m->part9) ? 1
1374
820
                                   : 9999); // can't happen
1375
1.01k
    if (lc_outlines.erase(outlines_og)) {
1376
        // Make sure outlines is in lc_outlines in case the file is damaged. in which case it may be
1377
        // included in an earlier part.
1378
870
        part.emplace_back(outlines);
1379
870
        m->c_outline_data.first_object = outlines_og.getObj();
1380
870
        m->c_outline_data.nobjects = 1;
1381
870
    }
1382
7.98k
    for (auto const& og: lc_outlines) {
1383
7.98k
        if (!m->c_outline_data.first_object) {
1384
45
            m->c_outline_data.first_object = og.getObj();
1385
45
        }
1386
7.98k
        part.emplace_back(qpdf.getObject(og));
1387
7.98k
        ++m->c_outline_data.nobjects;
1388
7.98k
    }
1389
1.01k
}
Unexecuted instantiation: void QPDF::Doc::Linearization::pushOutlinesToPart<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
void QPDF::Doc::Linearization::pushOutlinesToPart<QPDFWriter::ObjTable>(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, QPDFWriter::ObjTable const&)
Line
Count
Source
1361
34.1k
{
1362
34.1k
    QPDFObjectHandle root = qpdf.getRoot();
1363
34.1k
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1364
34.1k
    if (outlines.null()) {
1365
33.0k
        return;
1366
33.0k
    }
1367
1.01k
    outlines = getUncompressedObject(outlines, object_stream_data);
1368
1.01k
    QPDFObjGen outlines_og(outlines.getObjGen());
1369
1.01k
    QTC::TC(
1370
1.01k
        "qpdf",
1371
1.01k
        "QPDF lin outlines in part",
1372
1.01k
        &part == &m->part6         ? 0
1373
1.01k
            : (&part == &m->part9) ? 1
1374
820
                                   : 9999); // can't happen
1375
1.01k
    if (lc_outlines.erase(outlines_og)) {
1376
        // Make sure outlines is in lc_outlines in case the file is damaged, in which case it may be
1377
        // included in an earlier part.
1378
870
        part.emplace_back(outlines);
1379
870
        m->c_outline_data.first_object = outlines_og.getObj();
1380
870
        m->c_outline_data.nobjects = 1;
1381
870
    }
1382
7.98k
    for (auto const& og: lc_outlines) {
1383
7.98k
        if (!m->c_outline_data.first_object) {
1384
45
            m->c_outline_data.first_object = og.getObj();
1385
45
        }
1386
7.98k
        part.emplace_back(qpdf.getObject(og));
1387
7.98k
        ++m->c_outline_data.nobjects;
1388
7.98k
    }
1389
1.01k
}
1390
1391
void
1392
Lin::getLinearizedParts(
1393
    QPDFWriter::ObjTable const& obj,
1394
    std::vector<QPDFObjectHandle>& part4,
1395
    std::vector<QPDFObjectHandle>& part6,
1396
    std::vector<QPDFObjectHandle>& part7,
1397
    std::vector<QPDFObjectHandle>& part8,
1398
    std::vector<QPDFObjectHandle>& part9)
1399
37.7k
{
1400
37.7k
    calculateLinearizationData(obj);
1401
37.7k
    part4 = m->part4;
1402
37.7k
    part6 = m->part6;
1403
37.7k
    part7 = m->part7;
1404
37.7k
    part8 = m->part8;
1405
37.7k
    part9 = m->part9;
1406
37.7k
}
1407
1408
// Return the number of bits needed to represent val, i.e. the position of its highest set bit
// (0 for val == 0, 1 for 1, 2 for 2..3, 3 for 4..7, ...).
//
// The value is examined through its unsigned representation so that the loop is always bounded:
// with a signed operand, right-shifting a negative value never reaches zero where the shift is
// arithmetic. A negative val therefore yields the full width of unsigned int rather than
// recursing without end. Results for non-negative values are unchanged.
static inline int
nbits(int val)
{
    auto remaining = static_cast<unsigned int>(val);
    int count = 0;
    while (remaining != 0) {
        remaining >>= 1;
        ++count;
    }
    return count;
}
1413
1414
int
1415
Lin::outputLengthNextN(
1416
    int in_object, int n, QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1417
244k
{
1418
    // Figure out the length of a series of n consecutive objects in the output file starting with
1419
    // whatever object in_object from the input file mapped to.
1420
1421
244k
    int first = obj[in_object].renumber;
1422
244k
    int last = first + n;
1423
244k
    qpdf.no_ci_stop_if(
1424
244k
        first <= 0, "found object that is not renumbered while writing linearization data");
1425
244k
    qpdf_offset_t length = 0;
1426
642k
    for (int i = first; i < last; ++i) {
1427
398k
        auto l = new_obj[i].length;
1428
398k
        qpdf.no_ci_stop_if(
1429
398k
            l == 0, "found item with unknown length while writing linearization data" //
1430
398k
        );
1431
398k
        length += l;
1432
398k
    }
1433
244k
    return toI(length);
1434
244k
}
1435
1436
void
1437
Lin::calculateHPageOffset(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1438
30.1k
{
1439
    // Page Offset Hint Table
1440
1441
    // We are purposely leaving some values set to their initial zero values.
1442
1443
30.1k
    std::vector<QPDFObjectHandle> const& pages = qpdf.getAllPages();
1444
30.1k
    size_t npages = pages.size();
1445
30.1k
    CHPageOffset& cph = m->c_page_offset_data;
1446
30.1k
    std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1447
1448
    // Calculate minimum and maximum values for number of objects per page and page length.
1449
1450
30.1k
    int min_nobjects = std::numeric_limits<int>::max();
1451
30.1k
    int max_nobjects = 0;
1452
30.1k
    int min_length = std::numeric_limits<int>::max();
1453
30.1k
    int max_length = 0;
1454
30.1k
    int max_shared = 0;
1455
1456
30.1k
    HPageOffset& ph = m->page_offset_hints;
1457
30.1k
    std::vector<HPageOffsetEntry>& phe = ph.entries;
1458
    // npages is the size of the existing pages array.
1459
30.1k
    phe = std::vector<HPageOffsetEntry>(npages);
1460
1461
30.1k
    size_t i = 0;
1462
50.6k
    for (auto& phe_i: phe) {
1463
        // Calculate values for each page, assigning full values to the delta items.  They will be
1464
        // adjusted later.
1465
1466
        // Repeat calculations for page 0 so we can assign to phe[i] without duplicating those
1467
        // assignments.
1468
1469
50.6k
        int nobjects = cphe.at(i).nobjects;
1470
50.6k
        int length = outputLengthNextN(pages.at(i).getObjectID(), nobjects, new_obj, obj);
1471
50.6k
        int nshared = cphe.at(i).nshared_objects;
1472
1473
50.6k
        min_nobjects = std::min(min_nobjects, nobjects);
1474
50.6k
        max_nobjects = std::max(max_nobjects, nobjects);
1475
50.6k
        min_length = std::min(min_length, length);
1476
50.6k
        max_length = std::max(max_length, length);
1477
50.6k
        max_shared = std::max(max_shared, nshared);
1478
1479
50.6k
        phe_i.delta_nobjects = nobjects;
1480
50.6k
        phe_i.delta_page_length = length;
1481
50.6k
        phe_i.nshared_objects = nshared;
1482
50.6k
        ++i;
1483
50.6k
    }
1484
1485
30.1k
    ph.min_nobjects = min_nobjects;
1486
30.1k
    ph.first_page_offset = new_obj[obj[pages.at(0)].renumber].xref.getOffset();
1487
30.1k
    ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
1488
30.1k
    ph.min_page_length = min_length;
1489
30.1k
    ph.nbits_delta_page_length = nbits(max_length - min_length);
1490
30.1k
    ph.nbits_nshared_objects = nbits(max_shared);
1491
30.1k
    ph.nbits_shared_identifier = nbits(m->c_shared_object_data.nshared_total);
1492
30.1k
    ph.shared_denominator = 4; // doesn't matter
1493
1494
    // It isn't clear how to compute content offset and content length.  Since we are not
1495
    // interleaving page objects with the content stream, we'll use the same values for content
1496
    // length as page length.  We will use 0 as content offset because this is what Adobe does
1497
    // (implementation note 127) and pdlin as well.
1498
30.1k
    ph.nbits_delta_content_length = ph.nbits_delta_page_length;
1499
30.1k
    ph.min_content_length = ph.min_page_length;
1500
1501
30.1k
    i = 0;
1502
50.6k
    for (auto& phe_i: phe) {
1503
        // Adjust delta entries
1504
50.6k
        if (phe_i.delta_nobjects < min_nobjects || phe_i.delta_page_length < min_length) {
1505
0
            qpdf.stopOnError(
1506
0
                "found too small delta nobjects or delta page length while writing "
1507
0
                "linearization data");
1508
0
        }
1509
50.6k
        phe_i.delta_nobjects -= min_nobjects;
1510
50.6k
        phe_i.delta_page_length -= min_length;
1511
50.6k
        phe_i.delta_content_length = phe_i.delta_page_length;
1512
1513
50.6k
        auto& si = cphe.at(i).shared_identifiers;
1514
50.6k
        phe_i.shared_identifiers.insert(phe_i.shared_identifiers.end(), si.begin(), si.end());
1515
50.6k
        phe_i.shared_numerators.insert(phe_i.shared_numerators.end(), si.size(), 0);
1516
50.6k
        ++i;
1517
50.6k
    }
1518
30.1k
}
1519
1520
void
1521
Lin::calculateHSharedObject(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1522
30.1k
{
1523
30.1k
    CHSharedObject& cso = m->c_shared_object_data;
1524
30.1k
    std::vector<CHSharedObjectEntry>& csoe = cso.entries;
1525
30.1k
    HSharedObject& so = m->shared_object_hints;
1526
30.1k
    std::vector<HSharedObjectEntry>& soe = so.entries;
1527
30.1k
    soe.clear();
1528
1529
30.1k
    int min_length = outputLengthNextN(csoe.at(0).object, 1, new_obj, obj);
1530
30.1k
    int max_length = min_length;
1531
1532
193k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1533
        // Assign absolute numbers to deltas; adjust later
1534
163k
        int length = outputLengthNextN(csoe.at(i).object, 1, new_obj, obj);
1535
163k
        min_length = std::min(min_length, length);
1536
163k
        max_length = std::max(max_length, length);
1537
163k
        soe.emplace_back();
1538
163k
        soe.at(i).delta_group_length = length;
1539
163k
    }
1540
30.1k
    qpdf.no_ci_stop_if(
1541
30.1k
        soe.size() != toS(cso.nshared_total), "soe has wrong size after initialization" //
1542
30.1k
    );
1543
1544
30.1k
    so.nshared_total = cso.nshared_total;
1545
30.1k
    so.nshared_first_page = cso.nshared_first_page;
1546
30.1k
    if (so.nshared_total > so.nshared_first_page) {
1547
367
        so.first_shared_obj = obj[cso.first_shared_obj].renumber;
1548
367
        so.min_group_length = min_length;
1549
367
        so.first_shared_offset = new_obj[so.first_shared_obj].xref.getOffset();
1550
367
    }
1551
30.1k
    so.min_group_length = min_length;
1552
30.1k
    so.nbits_delta_group_length = nbits(max_length - min_length);
1553
1554
193k
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1555
        // Adjust deltas
1556
163k
        qpdf.no_ci_stop_if(
1557
163k
            soe.at(i).delta_group_length < min_length,
1558
163k
            "found too small group length while writing linearization data" //
1559
163k
        );
1560
1561
163k
        soe.at(i).delta_group_length -= min_length;
1562
163k
    }
1563
30.1k
}
1564
1565
void
1566
Lin::calculateHOutline(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1567
30.1k
{
1568
30.1k
    HGeneric& cho = m->c_outline_data;
1569
1570
30.1k
    if (cho.nobjects == 0) {
1571
29.3k
        return;
1572
29.3k
    }
1573
1574
814
    HGeneric& ho = m->outline_hints;
1575
1576
814
    ho.first_object = obj[cho.first_object].renumber;
1577
814
    ho.first_object_offset = new_obj[ho.first_object].xref.getOffset();
1578
814
    ho.nobjects = cho.nobjects;
1579
814
    ho.group_length = outputLengthNextN(cho.first_object, ho.nobjects, new_obj, obj);
1580
814
}
1581
1582
template <class T, class int_type>
1583
static void
1584
write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, int bits, int_type T::* field)
1585
241k
{
1586
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1587
1588
983k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1589
742k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1590
742k
    }
1591
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1592
    // start on a byte boundary.
1593
241k
    w.flush();
1594
241k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, int QPDF::HPageOffsetEntry::*)
Line
Count
Source
1585
60.3k
{
1586
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1587
1588
161k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1589
101k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1590
101k
    }
1591
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1592
    // start on a byte boundary.
1593
60.3k
    w.flush();
1594
60.3k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HPageOffsetEntry, long long>(BitWriter&, int, std::__1::vector<QPDF::HPageOffsetEntry, std::__1::allocator<QPDF::HPageOffsetEntry> >&, int, long long QPDF::HPageOffsetEntry::*)
Line
Count
Source
1585
90.4k
{
1586
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1587
1588
242k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1589
151k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1590
151k
    }
1591
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1592
    // start on a byte boundary.
1593
90.4k
    w.flush();
1594
90.4k
}
QPDF_linearization.cc:void write_vector_int<QPDF::HSharedObjectEntry, int>(BitWriter&, int, std::__1::vector<QPDF::HSharedObjectEntry, std::__1::allocator<QPDF::HSharedObjectEntry> >&, int, int QPDF::HSharedObjectEntry::*)
Line
Count
Source
1585
90.4k
{
1586
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1587
1588
579k
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1589
489k
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1590
489k
    }
1591
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1592
    // start on a byte boundary.
1593
90.4k
    w.flush();
1594
90.4k
}
1595
1596
template <class T>
1597
static void
1598
write_vector_vector(
1599
    BitWriter& w,
1600
    int nitems1,
1601
    std::vector<T>& vec1,
1602
    int T::* nitems2,
1603
    int bits,
1604
    std::vector<int> T::* vec2)
1605
60.3k
{
1606
    // nitems1 times, write nitems2 (from the ith element of vec1) items from the vec2 vector field
1607
    // of the ith item of vec1.
1608
161k
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
1609
500k
        for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2) {
1610
399k
            w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)), QIntC::to_size(bits));
1611
399k
        }
1612
101k
    }
1613
60.3k
    w.flush();
1614
60.3k
}
1615
1616
void
1617
Lin::writeHPageOffset(BitWriter& w)
1618
30.1k
{
1619
30.1k
    HPageOffset& t = m->page_offset_hints;
1620
1621
30.1k
    w.writeBitsInt(t.min_nobjects, 32);               // 1
1622
30.1k
    w.writeBits(toULL(t.first_page_offset), 32);      // 2
1623
30.1k
    w.writeBitsInt(t.nbits_delta_nobjects, 16);       // 3
1624
30.1k
    w.writeBitsInt(t.min_page_length, 32);            // 4
1625
30.1k
    w.writeBitsInt(t.nbits_delta_page_length, 16);    // 5
1626
30.1k
    w.writeBits(toULL(t.min_content_offset), 32);     // 6
1627
30.1k
    w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7
1628
30.1k
    w.writeBitsInt(t.min_content_length, 32);         // 8
1629
30.1k
    w.writeBitsInt(t.nbits_delta_content_length, 16); // 9
1630
30.1k
    w.writeBitsInt(t.nbits_nshared_objects, 16);      // 10
1631
30.1k
    w.writeBitsInt(t.nbits_shared_identifier, 16);    // 11
1632
30.1k
    w.writeBitsInt(t.nbits_shared_numerator, 16);     // 12
1633
30.1k
    w.writeBitsInt(t.shared_denominator, 16);         // 13
1634
1635
30.1k
    int nitems = toI(qpdf.getAllPages().size());
1636
30.1k
    std::vector<HPageOffsetEntry>& entries = t.entries;
1637
1638
30.1k
    write_vector_int(w, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
1639
30.1k
    write_vector_int(
1640
30.1k
        w, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
1641
30.1k
    write_vector_int(
1642
30.1k
        w, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
1643
30.1k
    write_vector_vector(
1644
30.1k
        w,
1645
30.1k
        nitems,
1646
30.1k
        entries,
1647
30.1k
        &HPageOffsetEntry::nshared_objects,
1648
30.1k
        t.nbits_shared_identifier,
1649
30.1k
        &HPageOffsetEntry::shared_identifiers);
1650
30.1k
    write_vector_vector(
1651
30.1k
        w,
1652
30.1k
        nitems,
1653
30.1k
        entries,
1654
30.1k
        &HPageOffsetEntry::nshared_objects,
1655
30.1k
        t.nbits_shared_numerator,
1656
30.1k
        &HPageOffsetEntry::shared_numerators);
1657
30.1k
    write_vector_int(
1658
30.1k
        w, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
1659
30.1k
    write_vector_int(
1660
30.1k
        w, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
1661
30.1k
}
1662
1663
void
1664
Lin::writeHSharedObject(BitWriter& w)
1665
30.1k
{
1666
30.1k
    HSharedObject& t = m->shared_object_hints;
1667
1668
30.1k
    w.writeBitsInt(t.first_shared_obj, 32);         // 1
1669
30.1k
    w.writeBits(toULL(t.first_shared_offset), 32);  // 2
1670
30.1k
    w.writeBitsInt(t.nshared_first_page, 32);       // 3
1671
30.1k
    w.writeBitsInt(t.nshared_total, 32);            // 4
1672
30.1k
    w.writeBitsInt(t.nbits_nobjects, 16);           // 5
1673
30.1k
    w.writeBitsInt(t.min_group_length, 32);         // 6
1674
30.1k
    w.writeBitsInt(t.nbits_delta_group_length, 16); // 7
1675
1676
30.1k
    QTC::TC(
1677
30.1k
        "qpdf",
1678
30.1k
        "QPDF lin write nshared_total > nshared_first_page",
1679
30.1k
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
1680
1681
30.1k
    int nitems = t.nshared_total;
1682
30.1k
    std::vector<HSharedObjectEntry>& entries = t.entries;
1683
1684
30.1k
    write_vector_int(
1685
30.1k
        w, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
1686
30.1k
    write_vector_int(w, nitems, entries, 1, &HSharedObjectEntry::signature_present);
1687
193k
    for (size_t i = 0; i < toS(nitems); ++i) {
1688
        // If signature were present, we'd have to write a 128-bit hash.
1689
163k
        if (entries.at(i).signature_present != 0) {
1690
0
            qpdf.stopOnError("found unexpected signature present while writing linearization data");
1691
0
        }
1692
163k
    }
1693
30.1k
    write_vector_int(w, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
1694
30.1k
}
1695
1696
void
1697
Lin::writeHGeneric(BitWriter& w, HGeneric& t)
1698
814
{
1699
814
    w.writeBitsInt(t.first_object, 32);            // 1
1700
814
    w.writeBits(toULL(t.first_object_offset), 32); // 2
1701
814
    w.writeBitsInt(t.nobjects, 32);                // 3
1702
814
    w.writeBitsInt(t.group_length, 32);            // 4
1703
814
}
1704
1705
void
1706
Lin::generateHintStream(
1707
    QPDFWriter::NewObjTable const& new_obj,
1708
    QPDFWriter::ObjTable const& obj,
1709
    std::string& hint_buffer,
1710
    int& S,
1711
    int& O,
1712
    bool compressed)
1713
30.1k
{
1714
    // Populate actual hint table values
1715
30.1k
    calculateHPageOffset(new_obj, obj);
1716
30.1k
    calculateHSharedObject(new_obj, obj);
1717
30.1k
    calculateHOutline(new_obj, obj);
1718
1719
    // Write the hint stream itself into a compressed memory buffer. Write through a counter so we
1720
    // can get offsets.
1721
30.1k
    pl::Count c(0, hint_buffer);
1722
30.1k
    BitWriter w(&c);
1723
1724
30.1k
    writeHPageOffset(w);
1725
30.1k
    S = toI(c.getCount());
1726
30.1k
    writeHSharedObject(w);
1727
30.1k
    O = 0;
1728
30.1k
    if (m->outline_hints.nobjects > 0) {
1729
814
        O = toI(c.getCount());
1730
814
        writeHGeneric(w, m->outline_hints);
1731
814
    }
1732
30.1k
    if (compressed) {
1733
30.1k
        hint_buffer = pl::pipe<Pl_Flate>(hint_buffer, Pl_Flate::a_deflate);
1734
30.1k
    }
1735
30.1k
}