Coverage Report

Created: 2025-12-05 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDF_linearization.cc
Line
Count
Source
1
// See doc/linearization.
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <qpdf/BitStream.hh>
6
#include <qpdf/BitWriter.hh>
7
#include <qpdf/InputSource_private.hh>
8
#include <qpdf/Pipeline_private.hh>
9
#include <qpdf/Pl_Buffer.hh>
10
#include <qpdf/Pl_Flate.hh>
11
#include <qpdf/Pl_String.hh>
12
#include <qpdf/QPDFExc.hh>
13
#include <qpdf/QPDFObjectHandle_private.hh>
14
#include <qpdf/QPDFWriter_private.hh>
15
#include <qpdf/QTC.hh>
16
#include <qpdf/QUtil.hh>
17
#include <qpdf/Util.hh>
18
19
#include <algorithm>
20
#include <cmath>
21
#include <cstring>
22
#include <utility>
23
24
using namespace qpdf;
25
using namespace std::literals;
26
27
using Lin = QPDF::Doc::Linearization;
28
29
template <class T, class int_type>
30
static void
31
load_vector_int(
32
    BitStream& bit_stream, int nitems, std::vector<T>& vec, int bits_wanted, int_type T::* field)
33
0
{
34
0
    bool append = vec.empty();
35
    // nitems times, read bits_wanted from the given bit stream, storing results in the ith vector
36
    // entry.
37
38
0
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
39
0
        if (append) {
40
0
            vec.push_back(T());
41
0
        }
42
0
        vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
43
0
    }
44
0
    util::assertion(
45
0
        std::cmp_equal(vec.size(), nitems), "vector has wrong size in load_vector_int" //
46
0
    );
47
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
48
    // start on a byte boundary.
49
0
    bit_stream.skipToNextByte();
50
0
}
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, int>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, int QPDF::Doc::Linearization::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, long long>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, long long QPDF::Doc::Linearization::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void load_vector_int<QPDF::Doc::Linearization::HSharedObjectEntry, int>(BitStream&, int, std::__1::vector<QPDF::Doc::Linearization::HSharedObjectEntry, std::__1::allocator<QPDF::Doc::Linearization::HSharedObjectEntry> >&, int, int QPDF::Doc::Linearization::HSharedObjectEntry::*)
51
52
template <class T>
53
static void
54
load_vector_vector(
55
    BitStream& bit_stream,
56
    int nitems1,
57
    std::vector<T>& vec1,
58
    int T::* nitems2,
59
    int bits_wanted,
60
    std::vector<int> T::* vec2)
61
0
{
62
    // nitems1 times, read nitems2 (from the ith element of vec1) items into the vec2 vector field
63
    // of the ith item of vec1.
64
0
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
65
0
        for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2) {
66
0
            (vec1.at(i1).*vec2).push_back(bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
67
0
        }
68
0
    }
69
0
    bit_stream.skipToNextByte();
70
0
}
71
72
Lin::ObjUser::ObjUser(user_e type) :
73
0
    ou_type(type)
74
0
{
75
0
    qpdf_expect(type == ou_root);
76
0
}
77
78
Lin::ObjUser::ObjUser(user_e type, size_t pageno) :
79
0
    ou_type(type),
80
0
    pageno(pageno)
81
0
{
82
0
    qpdf_expect(type == ou_page || type == ou_thumb);
83
0
}
84
85
Lin::ObjUser::ObjUser(user_e type, std::string const& key) :
86
0
    ou_type(type),
87
0
    key(key)
88
0
{
89
0
    qpdf_expect(type == ou_trailer_key || type == ou_root_key);
90
0
}
91
92
bool
93
Lin::ObjUser::operator<(ObjUser const& rhs) const
94
0
{
95
0
    if (ou_type < rhs.ou_type) {
96
0
        return true;
97
0
    }
98
0
    if (ou_type == rhs.ou_type) {
99
0
        if (pageno < rhs.pageno) {
100
0
            return true;
101
0
        }
102
0
        if (pageno == rhs.pageno) {
103
0
            return key < rhs.key;
104
0
        }
105
0
    }
106
0
    return false;
107
0
}
108
109
// One pending unit of work for updateObjectMaps: the object user being attributed, the object to
// visit, and whether that object is the starting point of the traversal.
Lin::UpdateObjectMapsFrame::UpdateObjectMapsFrame(
    ObjUser const& ou, QPDFObjectHandle oh, bool top) :
    ou(ou),
    oh(std::move(oh)), // by-value parameter: hand it over rather than copy it again
    top(top)
{
}
116
117
void
118
QPDF::optimize(
119
    std::map<int, int> const& object_stream_data,
120
    bool allow_changes,
121
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
122
0
{
123
0
    m->lin.optimize_internal(object_stream_data, allow_changes, skip_stream_parameters);
124
0
}
125
126
void
127
Lin::optimize(
128
    QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
129
0
{
130
0
    optimize_internal(obj, true, skip_stream_parameters);
131
0
}
132
133
template <typename T>
134
void
135
Lin::optimize_internal(
136
    T const& object_stream_data,
137
    bool allow_changes,
138
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
139
0
{
140
0
    if (!obj_user_to_objects_.empty()) {
141
        // already optimized
142
0
        return;
143
0
    }
144
145
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
146
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
147
0
    QPDFObjectHandle root = qpdf.getRoot();
148
0
    if (root.getKey("/Outlines").isDictionary()) {
149
0
        QPDFObjectHandle outlines = root.getKey("/Outlines");
150
0
        if (!outlines.isIndirect()) {
151
0
            root.replaceKey("/Outlines", qpdf.makeIndirectObject(outlines));
152
0
        }
153
0
    }
154
155
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
156
    // initializes m->all_pages.
157
0
    m->pages.pushInheritedAttributesToPage(allow_changes, false);
158
    // Traverse pages
159
160
0
    size_t n = 0;
161
0
    for (auto const& page: m->pages) {
162
0
        updateObjectMaps(ObjUser(ObjUser::ou_page, n), page, skip_stream_parameters);
163
0
        ++n;
164
0
    }
165
166
    // Traverse document-level items
167
0
    for (auto const& [key, value]: m->trailer.as_dictionary()) {
168
0
        if (key == "/Root") {
169
            // handled separately
170
0
        } else {
171
0
            if (!value.null()) {
172
0
                updateObjectMaps(
173
0
                    ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters);
174
0
            }
175
0
        }
176
0
    }
177
178
0
    for (auto const& [key, value]: root.as_dictionary()) {
179
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
180
        // we are going to disregard that specification for now.  There is loads of evidence that
181
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
182
        // certain not to cause any problems.
183
0
        if (!value.null()) {
184
0
            updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters);
185
0
        }
186
0
    }
187
188
0
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
189
0
    auto root_og = root.id_gen();
190
0
    obj_user_to_objects_[root_ou].insert(root_og);
191
0
    object_to_obj_users_[root_og].insert(root_ou);
192
193
0
    filterCompressedObjects(object_stream_data);
194
0
}
Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
195
196
void
197
Lin::updateObjectMaps(
198
    ObjUser const& first_ou,
199
    QPDFObjectHandle first_oh,
200
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
201
0
{
202
0
    QPDFObjGen::set visited;
203
0
    std::vector<UpdateObjectMapsFrame> pending;
204
0
    pending.emplace_back(first_ou, first_oh, true);
205
    // Traverse the object tree from this point taking care to avoid crossing page boundaries.
206
0
    std::unique_ptr<ObjUser> thumb_ou;
207
0
    while (!pending.empty()) {
208
0
        auto cur = pending.back();
209
0
        pending.pop_back();
210
211
0
        bool is_page_node = false;
212
213
0
        if (cur.oh.isDictionaryOfType("/Page")) {
214
0
            is_page_node = true;
215
0
            if (!cur.top) {
216
0
                continue;
217
0
            }
218
0
        }
219
220
0
        if (cur.oh.indirect()) {
221
0
            QPDFObjGen og(cur.oh.getObjGen());
222
0
            if (!visited.add(og)) {
223
0
                QTC::TC("qpdf", "QPDF opt loop detected");
224
0
                continue;
225
0
            }
226
0
            obj_user_to_objects_[cur.ou].insert(og);
227
0
            object_to_obj_users_[og].insert(cur.ou);
228
0
        }
229
230
0
        if (cur.oh.isArray()) {
231
0
            for (auto const& item: cur.oh.as_array()) {
232
0
                pending.emplace_back(cur.ou, item, false);
233
0
            }
234
0
        } else if (cur.oh.isDictionary() || cur.oh.isStream()) {
235
0
            QPDFObjectHandle dict = cur.oh;
236
0
            bool is_stream = cur.oh.isStream();
237
0
            int ssp = 0;
238
0
            if (is_stream) {
239
0
                dict = cur.oh.getDict();
240
0
                if (skip_stream_parameters) {
241
0
                    ssp = skip_stream_parameters(cur.oh);
242
0
                }
243
0
            }
244
245
0
            for (auto& [key, value]: dict.as_dictionary()) {
246
0
                if (value.null()) {
247
0
                    continue;
248
0
                }
249
250
0
                if (is_page_node && (key == "/Thumb")) {
251
                    // Traverse page thumbnail dictionaries as a special case. There can only ever
252
                    // be one /Thumb key on a page, and we see at most one page node per call.
253
0
                    thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno);
254
0
                    pending.emplace_back(*thumb_ou, dict.getKey(key), false);
255
0
                } else if (is_page_node && (key == "/Parent")) {
256
                    // Don't traverse back up the page tree
257
0
                } else if (
258
0
                    ((ssp >= 1) && (key == "/Length")) ||
259
0
                    ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) {
260
                    // Don't traverse into stream parameters that we are not going to write.
261
0
                } else {
262
0
                    pending.emplace_back(cur.ou, value, false);
263
0
                }
264
0
            }
265
0
        }
266
0
    }
267
0
}
268
269
void
270
Lin::filterCompressedObjects(std::map<int, int> const& object_stream_data)
271
0
{
272
0
    if (object_stream_data.empty()) {
273
0
        return;
274
0
    }
275
276
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
277
    // objects.  If something is a user of a compressed object, then it is really a user of the
278
    // object stream that contains it.
279
280
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
281
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
282
283
0
    for (auto const& [ou, ogs]: obj_user_to_objects_) {
284
0
        for (auto const& og: ogs) {
285
0
            auto i2 = object_stream_data.find(og.getObj());
286
0
            if (i2 == object_stream_data.end()) {
287
0
                t_obj_user_to_objects[ou].insert(og);
288
0
            } else {
289
0
                t_obj_user_to_objects[ou].insert({i2->second, 0});
290
0
            }
291
0
        }
292
0
    }
293
294
0
    for (auto const& [og, ous]: object_to_obj_users_) {
295
0
        for (auto const& ou: ous) {
296
0
            auto i2 = object_stream_data.find(og.getObj());
297
0
            if (i2 == object_stream_data.end()) {
298
0
                t_object_to_obj_users[og].insert(ou);
299
0
            } else {
300
0
                t_object_to_obj_users[{i2->second, 0}].insert(ou);
301
0
            }
302
0
        }
303
0
    }
304
305
0
    obj_user_to_objects_ = std::move(t_obj_user_to_objects);
306
0
    object_to_obj_users_ = std::move(t_object_to_obj_users);
307
0
}
308
309
void
310
Lin::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
311
0
{
312
0
    if (obj.getStreamsEmpty()) {
313
0
        return;
314
0
    }
315
316
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
317
    // objects.  If something is a user of a compressed object, then it is really a user of the
318
    // object stream that contains it.
319
320
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
321
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
322
323
0
    for (auto const& [ou, ogs]: obj_user_to_objects_) {
324
0
        for (auto const& og: ogs) {
325
0
            if (obj.contains(og)) {
326
0
                if (auto const& i2 = obj[og].object_stream; i2 <= 0) {
327
0
                    t_obj_user_to_objects[ou].insert(og);
328
0
                } else {
329
0
                    t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0));
330
0
                }
331
0
            }
332
0
        }
333
0
    }
334
335
0
    for (auto const& [og, ous]: object_to_obj_users_) {
336
0
        if (obj.contains(og)) {
337
            // Loop over obj_users.
338
0
            for (auto const& ou: ous) {
339
0
                if (auto i2 = obj[og].object_stream; i2 <= 0) {
340
0
                    t_object_to_obj_users[og].insert(ou);
341
0
                } else {
342
0
                    t_object_to_obj_users[{i2, 0}].insert(ou);
343
0
                }
344
0
            }
345
0
        }
346
0
    }
347
348
0
    obj_user_to_objects_ = std::move(t_obj_user_to_objects);
349
0
    object_to_obj_users_ = std::move(t_object_to_obj_users);
350
0
}
351
352
void
353
Lin::linearizationWarning(std::string_view msg)
354
0
{
355
0
    linearization_warnings_ = true;
356
0
    warn(qpdf_e_linearization, "", 0, std::string(msg));
357
0
}
358
359
bool
360
QPDF::checkLinearization()
361
0
{
362
0
    return m->lin.check();
363
0
}
364
365
bool
366
Lin::check()
367
0
{
368
0
    try {
369
0
        readLinearizationData();
370
0
        checkLinearizationInternal();
371
0
        return !linearization_warnings_;
372
0
    } catch (std::runtime_error& e) {
373
0
        linearizationWarning(
374
0
            "error encountered while checking linearization data: " + std::string(e.what()));
375
0
        return false;
376
0
    }
377
0
}
378
379
bool
380
QPDF::isLinearized()
381
0
{
382
0
    return m->lin.linearized();
383
0
}
384
385
bool
386
Lin::linearized()
387
0
{
388
    // If the first object in the file is a dictionary with a suitable /Linearized key and has an /L
389
    // key that accurately indicates the file size, initialize m->lindict and return true.
390
391
    // A linearized PDF spec's first object will be contained within the first 1024 bytes of the
392
    // file and will be a dictionary with a valid /Linearized key.  This routine looks for that and
393
    // does no additional validation.
394
395
    // The PDF spec says the linearization dictionary must be completely contained within the first
396
    // 1024 bytes of the file. Add a byte for a null terminator.
397
0
    auto buffer = m->file->read(1024, 0);
398
0
    size_t pos = 0;
399
0
    while (true) {
400
        // Find a digit or end of buffer
401
0
        pos = buffer.find_first_of("0123456789"sv, pos);
402
0
        if (pos == std::string::npos) {
403
0
            return false;
404
0
        }
405
        // Seek to the digit. Then skip over digits for a potential
406
        // next iteration.
407
0
        m->file->seek(toO(pos), SEEK_SET);
408
409
0
        auto t1 = m->objects.readToken(*m->file, 20);
410
0
        if (!(t1.isInteger() && m->objects.readToken(*m->file, 6).isInteger() &&
411
0
              m->objects.readToken(*m->file, 4).isWord("obj"))) {
412
0
            pos = buffer.find_first_not_of("0123456789"sv, pos);
413
0
            if (pos == std::string::npos) {
414
0
                return false;
415
0
            }
416
0
            continue;
417
0
        }
418
419
0
        Dictionary candidate = qpdf.getObject(toI(QUtil::string_to_ll(t1.getValue().data())), 0);
420
0
        auto linkey = candidate["/Linearized"];
421
0
        if (!(linkey.isNumber() && toI(floor(linkey.getNumericValue())) == 1)) {
422
0
            return false;
423
0
        }
424
425
0
        m->file->seek(0, SEEK_END);
426
0
        Integer L = candidate["/L"];
427
0
        if (L != m->file->tell()) {
428
0
            return false;
429
0
        }
430
0
        linp_.file_size = L;
431
0
        lindict_ = candidate;
432
0
        return true;
433
0
    }
434
0
}
435
436
void
437
Lin::readLinearizationData()
438
0
{
439
0
    util::assertion(
440
0
        linearized(), "called readLinearizationData for file that is not linearized" //
441
0
    );
442
443
    // This function throws an exception (which is trapped by checkLinearization()) for any errors
444
    // that prevent loading.
445
446
    // /L is read and stored in linp by isLinearized()
447
0
    Array H = lindict_["/H"]; // hint table offset/length for primary and overflow hint tables
448
0
    auto H_size = H.size();
449
0
    Integer H_0 = H[0]; // hint table offset
450
0
    Integer H_1 = H[1]; // hint table length
451
0
    Integer H_2 = H[2]; // hint table offset for overflow hint table
452
0
    Integer H_3 = H[3]; // hint table length for overflow hint table
453
0
    Integer O = lindict_["/O"];
454
0
    Integer E = lindict_["/E"];
455
0
    Integer N = lindict_["/N"];
456
0
    Integer T = lindict_["/T"];
457
0
    auto P_oh = lindict_["/P"];
458
0
    Integer P = P_oh; // first page number
459
0
    QTC::TC("qpdf", "QPDF P absent in lindict", P ? 0 : 1);
460
461
0
    no_ci_stop_if(
462
0
        !(H && O && E && N && T && (P || P_oh.null())),
463
0
        "some keys in linearization dictionary are of the wrong type",
464
0
        "linearization dictionary" //
465
0
    );
466
467
0
    no_ci_stop_if(
468
0
        !(H_size == 2 || H_size == 4),
469
0
        "H has the wrong number of items",
470
0
        "linearization dictionary" //
471
0
    );
472
473
0
    no_ci_stop_if(
474
0
        !(H_0 && H_1 && (H_size == 2 || (H_2 && H_3))),
475
0
        "some H items are of the wrong type",
476
0
        "linearization dictionary" //
477
0
    );
478
479
    // Store linearization parameter data
480
481
    // Various places in the code use linp.npages, which is initialized from N, to pre-allocate
482
    // memory, so make sure it's accurate and bail right now if it's not.
483
0
    no_ci_stop_if(
484
0
        N != pages.size(),
485
0
        "/N does not match number of pages",
486
0
        "linearization dictionary" //
487
0
    );
488
489
    // file_size initialized by isLinearized()
490
0
    linp_.first_page_object = O.value<int>();
491
0
    linp_.first_page_end = E;
492
0
    linp_.npages = N.value<size_t>();
493
0
    linp_.xref_zero_offset = T;
494
0
    linp_.first_page = P ? P.value<int>() : 0;
495
0
    linp_.H_offset = H_0;
496
0
    linp_.H_length = H_1;
497
498
    // Read hint streams
499
500
0
    Pl_Buffer pb("hint buffer");
501
0
    auto H0 = readHintStream(pb, H_0, H_1.value<size_t>());
502
0
    if (H_2) {
503
0
        (void)readHintStream(pb, H_2, H_3.value<size_t>());
504
0
    }
505
506
    // PDF 1.4 hint tables that we ignore:
507
508
    //  /T    thumbnail
509
    //  /A    thread information
510
    //  /E    named destination
511
    //  /V    interactive form
512
    //  /I    information dictionary
513
    //  /C    logical structure
514
    //  /L    page label
515
516
    // Individual hint table offsets
517
0
    Integer HS = H0["/S"]; // shared object
518
0
    Integer HO = H0["/O"]; // outline
519
520
0
    auto hbp = pb.getBufferSharedPointer();
521
0
    Buffer* hb = hbp.get();
522
0
    unsigned char const* h_buf = hb->getBuffer();
523
0
    size_t h_size = hb->getSize();
524
525
0
    readHPageOffset(BitStream(h_buf, h_size));
526
527
0
    size_t HSi = HS.value<size_t>();
528
0
    if (HSi < 0 || HSi >= h_size) {
529
0
        throw damagedPDF("linearization hint table", "/S (shared object) offset is out of bounds");
530
0
    }
531
0
    readHSharedObject(BitStream(h_buf + HSi, h_size - HSi));
532
533
0
    if (HO) {
534
0
        no_ci_stop_if(
535
0
            HO < 0 || HO >= h_size,
536
0
            "/O (outline) offset is out of bounds",
537
0
            "linearization dictionary" //
538
0
        );
539
0
        size_t HOi = HO.value<size_t>();
540
0
        readHGeneric(BitStream(h_buf + HO, h_size - HOi), outline_hints_);
541
0
    }
542
0
}
543
544
// Read the hint stream object located at offset, verify that offset + length falls within the
// recorded end-of-object range, pipe the stream data into pl, and return the stream dictionary.
// A non-stream object or a length mismatch is reported through no_ci_stop_if.
Dictionary
Lin::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
{
    auto H = m->objects.readObjectAtOffset(offset, "linearization hint stream", false);

    // By default the stream object itself marks where the hint table ends.
    ObjCache& stream_cache = m->obj_cache[H];
    qpdf_offset_t min_end_offset = stream_cache.end_before_space;
    qpdf_offset_t max_end_offset = stream_cache.end_after_space;
    no_ci_stop_if(
        !H.isStream(), "hint table is not a stream", "linearization dictionary" //
    );

    Dictionary Hdict = H.getDict();

    // Some versions of Acrobat make /Length indirect and place it immediately after the stream,
    // increasing length to cover it, even though the specification says all objects in the
    // linearization parameter dictionary must be direct.  We have to get the file position of the
    // end of length in this case.
    if (auto length_obj = Hdict["/Length"]; length_obj.indirect()) {
        ObjCache& length_cache = m->obj_cache[length_obj];
        min_end_offset = length_cache.end_before_space;
        max_end_offset = length_cache.end_after_space;
    } else {
        QTC::TC("qpdf", "QPDF hint table length direct");
    }

    qpdf_offset_t const computed_end = offset + toO(length);
    bool const in_range = computed_end >= min_end_offset && computed_end <= max_end_offset;
    no_ci_stop_if(
        !in_range,
        "hint table length mismatch (expected = " + std::to_string(computed_end) + "; actual = " +
            std::to_string(min_end_offset) + ".." + std::to_string(max_end_offset) + ")",
        "linearization dictionary" //
    );
    H.pipeStreamData(&pl, 0, qpdf_dl_specialized);
    return Hdict;
}
578
579
void
580
Lin::readHPageOffset(BitStream h)
581
0
{
582
    // All comments referring to the PDF spec refer to the spec for version 1.4.
583
584
0
    HPageOffset& t = page_offset_hints_;
585
586
0
    t.min_nobjects = h.getBitsInt(32);               // 1
587
0
    t.first_page_offset = h.getBitsInt(32);          // 2
588
0
    t.nbits_delta_nobjects = h.getBitsInt(16);       // 3
589
0
    t.min_page_length = h.getBitsInt(32);            // 4
590
0
    t.nbits_delta_page_length = h.getBitsInt(16);    // 5
591
0
    t.min_content_offset = h.getBitsInt(32);         // 6
592
0
    t.nbits_delta_content_offset = h.getBitsInt(16); // 7
593
0
    t.min_content_length = h.getBitsInt(32);         // 8
594
0
    t.nbits_delta_content_length = h.getBitsInt(16); // 9
595
0
    t.nbits_nshared_objects = h.getBitsInt(16);      // 10
596
0
    t.nbits_shared_identifier = h.getBitsInt(16);    // 11
597
0
    t.nbits_shared_numerator = h.getBitsInt(16);     // 12
598
0
    t.shared_denominator = h.getBitsInt(16);         // 13
599
600
0
    std::vector<HPageOffsetEntry>& entries = t.entries;
601
0
    entries.clear();
602
0
    int nitems = toI(linp_.npages);
603
0
    load_vector_int(h, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
604
0
    load_vector_int(
605
0
        h, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
606
0
    load_vector_int(
607
0
        h, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
608
0
    load_vector_vector(
609
0
        h,
610
0
        nitems,
611
0
        entries,
612
0
        &HPageOffsetEntry::nshared_objects,
613
0
        t.nbits_shared_identifier,
614
0
        &HPageOffsetEntry::shared_identifiers);
615
0
    load_vector_vector(
616
0
        h,
617
0
        nitems,
618
0
        entries,
619
0
        &HPageOffsetEntry::nshared_objects,
620
0
        t.nbits_shared_numerator,
621
0
        &HPageOffsetEntry::shared_numerators);
622
0
    load_vector_int(
623
0
        h, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
624
0
    load_vector_int(
625
0
        h, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
626
0
}
627
628
void
629
Lin::readHSharedObject(BitStream h)
630
0
{
631
0
    HSharedObject& t = shared_object_hints_;
632
633
0
    t.first_shared_obj = h.getBitsInt(32);         // 1
634
0
    t.first_shared_offset = h.getBitsInt(32);      // 2
635
0
    t.nshared_first_page = h.getBitsInt(32);       // 3
636
0
    t.nshared_total = h.getBitsInt(32);            // 4
637
0
    t.nbits_nobjects = h.getBitsInt(16);           // 5
638
0
    t.min_group_length = h.getBitsInt(32);         // 6
639
0
    t.nbits_delta_group_length = h.getBitsInt(16); // 7
640
641
0
    QTC::TC(
642
0
        "qpdf",
643
0
        "QPDF lin nshared_total > nshared_first_page",
644
0
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
645
646
0
    std::vector<HSharedObjectEntry>& entries = t.entries;
647
0
    entries.clear();
648
0
    int nitems = t.nshared_total;
649
0
    load_vector_int(
650
0
        h, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
651
0
    load_vector_int(h, nitems, entries, 1, &HSharedObjectEntry::signature_present);
652
0
    for (size_t i = 0; i < toS(nitems); ++i) {
653
0
        if (entries.at(i).signature_present) {
654
            // Skip 128-bit MD5 hash.  These are not supported by acrobat, so they should probably
655
            // never be there.  We have no test case for this.
656
0
            for (int j = 0; j < 4; ++j) {
657
0
                (void)h.getBits(32);
658
0
            }
659
0
        }
660
0
    }
661
0
    load_vector_int(h, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
662
0
}
663
664
void
665
Lin::readHGeneric(BitStream h, HGeneric& t)
666
0
{
667
0
    t.first_object = h.getBitsInt(32);        // 1
668
0
    t.first_object_offset = h.getBitsInt(32); // 2
669
0
    t.nobjects = h.getBitsInt(32);            // 3
670
0
    t.group_length = h.getBitsInt(32);        // 4
671
0
}
672
673
void
674
Lin::checkLinearizationInternal()
675
0
{
676
    // All comments referring to the PDF spec refer to the spec for version 1.4.
677
678
    // Check all values in linearization parameter dictionary
679
680
0
    LinParameters& p = linp_;
681
682
    // L: file size in bytes -- checked by isLinearized
683
684
    // O: object number of first page
685
0
    auto const& all_pages = pages.all();
686
0
    if (p.first_page_object != all_pages.at(0).getObjectID()) {
687
0
        linearizationWarning("first page object (/O) mismatch");
688
0
    }
689
690
    // N: number of pages
691
0
    size_t npages = all_pages.size();
692
0
    if (std::cmp_not_equal(p.npages, npages)) {
693
        // Not tested in the test suite
694
0
        linearizationWarning("page count (/N) mismatch");
695
0
    }
696
697
0
    int i = 0;
698
0
    for (auto const& page: all_pages) {
699
0
        if (m->xref_table[page].getType() == 2) {
700
0
            linearizationWarning(
701
0
                "page dictionary for page " + std::to_string(i) + " is compressed");
702
0
        }
703
0
        ++i;
704
0
    }
705
706
    // T: offset of whitespace character preceding xref entry for object 0
707
0
    m->file->seek(p.xref_zero_offset, SEEK_SET);
708
0
    while (true) {
709
0
        char ch;
710
0
        m->file->read(&ch, 1);
711
0
        if (!(ch == ' ' || ch == '\r' || ch == '\n')) {
712
0
            m->file->seek(-1, SEEK_CUR);
713
0
            break;
714
0
        }
715
0
    }
716
0
    if (m->file->tell() != objects.first_xref_item_offset()) {
717
0
        linearizationWarning(
718
0
            "space before first xref item (/T) mismatch (computed = " +
719
0
            std::to_string(objects.first_xref_item_offset()) +
720
0
            "; file = " + std::to_string(m->file->tell()));
721
0
    }
722
723
    // P: first page number -- Implementation note 124 says Acrobat ignores this value, so we will
724
    // too.
725
726
    // Check numbering of compressed objects in each xref section. For linearized files, all
727
    // compressed objects are supposed to be at the end of the containing xref section if any object
728
    // streams are in use.
729
730
0
    if (objects.uncompressed_after_compressed()) {
731
0
        linearizationWarning(
732
0
            "linearized file contains an uncompressed object after a compressed "
733
0
            "one in a cross-reference stream");
734
0
    }
735
736
    // Further checking requires optimization and order calculation. Don't allow optimization to
737
    // make changes.  If it has to, then the file is not properly linearized.  We use the xref table
738
    // to figure out which objects are compressed and which are uncompressed.
739
0
    { // local scope
740
0
        std::map<int, int> object_stream_data;
741
0
        for (auto const& [og, entry]: m->xref_table) {
742
0
            if (entry.getType() == 2) {
743
0
                object_stream_data[og.getObj()] = entry.getObjStreamNumber();
744
0
            }
745
0
        }
746
0
        optimize_internal(object_stream_data, false, nullptr);
747
0
        calculateLinearizationData(object_stream_data);
748
0
    }
749
750
    // E: offset of end of first page -- Implementation note 123 says Acrobat includes on extra
751
    // object here by mistake.  pdlin fails to place thumbnail images in section 9, so when
752
    // thumbnails are present, it also gets the wrong value for /E.  It also doesn't count outlines
753
    // here when it should even though it places them in part 6.  This code fails to put thread
754
    // information dictionaries in part 9, so it actually gets the wrong value for E when threads
755
    // are present.  In that case, it would probably agree with pdlin.  As of this writing, the test
756
    // suite doesn't contain any files with threads.
757
758
0
    no_ci_stop_if(
759
0
        part6_.empty(), "linearization part 6 unexpectedly empty" //
760
0
    );
761
0
    qpdf_offset_t min_E = -1;
762
0
    qpdf_offset_t max_E = -1;
763
0
    for (auto const& oh: part6_) {
764
0
        QPDFObjGen og(oh.getObjGen());
765
        // All objects have to have been dereferenced to be classified.
766
0
        util::assertion(m->obj_cache.contains(og), "linearization part6 object not in cache");
767
0
        ObjCache const& oc = m->obj_cache[og];
768
0
        min_E = std::max(min_E, oc.end_before_space);
769
0
        max_E = std::max(max_E, oc.end_after_space);
770
0
    }
771
0
    if (p.first_page_end < min_E || p.first_page_end > max_E) {
772
0
        linearizationWarning(
773
0
            "end of first page section (/E) mismatch: /E = " + std::to_string(p.first_page_end) +
774
0
            "; computed = " + std::to_string(min_E) + ".." + std::to_string(max_E));
775
0
    }
776
777
    // Check hint tables
778
779
0
    std::map<int, int> shared_idx_to_obj;
780
0
    checkHSharedObject(all_pages, shared_idx_to_obj);
781
0
    checkHPageOffset(all_pages, shared_idx_to_obj);
782
0
    checkHOutlines();
783
0
}
784
785
qpdf_offset_t
Lin::maxEnd(ObjUser const& ou)
{
    // Return the largest end_after_space among all cached objects recorded for the object user
    // `ou`, or 0 if the user has no objects.  Stops with an error if `ou` has no entry in the
    // object user table or if one of its objects is missing from the object cache.
    no_ci_stop_if(
        !obj_user_to_objects_.contains(ou),
        "no entry in object user table for requested object user" //
    );

    qpdf_offset_t furthest = 0;
    auto const& used_objects = obj_user_to_objects_[ou];
    for (auto const& og: used_objects) {
        no_ci_stop_if(
            !m->obj_cache.contains(og), "unknown object referenced in object user table" //
        );
        auto const candidate = m->obj_cache[og].end_after_space;
        if (furthest < candidate) {
            furthest = candidate;
        }
    }
    return furthest;
}
802
803
qpdf_offset_t
Lin::getLinearizationOffset(QPDFObjGen og)
{
    // Return the file offset to use for `og` in linearization checks.  For a type 1 xref entry
    // this is the entry's own offset.  For a type 2 (compressed) entry it is the offset of the
    // object stream that contains the object, looked up as (stream number, generation 0).
    //
    // The chain of containing streams is followed iteratively, and every visited object is
    // remembered, so that a malformed xref table in which type 2 entries refer to each other in a
    // cycle produces an error rather than unbounded recursion.  Stops with an error if an entry of
    // any type other than 1 or 2 is reached.
    std::set<QPDFObjGen> visited;
    while (true) {
        no_ci_stop_if(
            !visited.insert(og).second,
            "loop detected while resolving object stream offset for linearization" //
        );
        QPDFXRefEntry const& entry = m->xref_table[og];
        auto const typ = entry.getType();
        if (typ == 1) {
            return entry.getOffset();
        }
        no_ci_stop_if(
            typ != 2, "getLinearizationOffset called for xref entry not of type 1 or 2" //
        );
        // For compressed objects, continue with the object stream that contains them.
        og = QPDFObjGen(entry.getObjStreamNumber(), 0);
    }
}
817
818
QPDFObjectHandle
Lin::getUncompressedObject(QPDFObjectHandle& obj, std::map<int, int> const& object_stream_data)
{
    // object_stream_data maps an object id to the number of the object stream containing it.  If
    // `obj` is null or has no entry in the map, return it unchanged; otherwise return the
    // containing object stream (generation 0).
    if (obj.null()) {
        return obj;
    }
    auto const found = object_stream_data.find(obj.getObjectID());
    if (found == object_stream_data.end()) {
        return obj;
    }
    return qpdf.getObject(found->second, 0);
}
826
827
QPDFObjectHandle
Lin::getUncompressedObject(QPDFObjectHandle& oh, QPDFWriter::ObjTable const& obj)
{
    // Variant that consults the writer's object table.  `oh` is returned unchanged unless the
    // table has an entry for it with a positive object_stream value and `oh` is not null; in that
    // case the object stream with that number (generation 0) is returned instead.
    if (!obj.contains(oh)) {
        return oh;
    }
    auto const stream_id = obj[oh].object_stream;
    if (stream_id <= 0 || oh.null()) {
        return oh;
    }
    return qpdf.getObject(stream_id, 0);
}
837
838
int
839
Lin::lengthNextN(int first_object, int n)
840
0
{
841
0
    int length = 0;
842
0
    for (int i = 0; i < n; ++i) {
843
0
        QPDFObjGen og(first_object + i, 0);
844
0
        if (m->xref_table.contains(og)) {
845
0
            no_ci_stop_if(
846
0
                !m->obj_cache.contains(og),
847
0
                "found unknown object while calculating length for linearization data" //
848
0
            );
849
850
0
            length += toI(m->obj_cache[og].end_after_space - getLinearizationOffset(og));
851
0
        } else {
852
0
            linearizationWarning(
853
0
                "no xref table entry for " + std::to_string(first_object + i) + " 0");
854
0
        }
855
0
    }
856
0
    return length;
857
0
}
858
859
void
860
Lin::checkHPageOffset(
861
    std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& shared_idx_to_obj)
862
0
{
863
    // Implementation note 126 says Acrobat always sets delta_content_offset and
864
    // delta_content_length in the page offset header dictionary to 0.  It also states that
865
    // min_content_offset in the per-page information is always 0, which is an incorrect value.
866
867
    // Implementation note 127 explains that Acrobat always sets item 8 (min_content_length) to
868
    // zero, item 9 (nbits_delta_content_length) to the value of item 5 (nbits_delta_page_length),
869
    // and item 7 of each per-page hint table (delta_content_length) to item 2 (delta_page_length)
870
    // of that entry.  Acrobat ignores these values when reading files.
871
872
    // Empirically, it also seems that Acrobat sometimes puts items under a page's /Resources
873
    // dictionary in with shared objects even when they are private.
874
875
0
    size_t npages = pages.size();
876
0
    qpdf_offset_t table_offset = adjusted_offset(page_offset_hints_.first_page_offset);
877
0
    QPDFObjGen first_page_og(pages.at(0).getObjGen());
878
0
    if (!m->xref_table.contains(first_page_og)) {
879
0
        stopOnError("supposed first page object is not known");
880
0
    }
881
0
    qpdf_offset_t offset = getLinearizationOffset(first_page_og);
882
0
    if (table_offset != offset) {
883
0
        linearizationWarning("first page object offset mismatch");
884
0
    }
885
886
0
    for (size_t pageno = 0; pageno < npages; ++pageno) {
887
0
        QPDFObjGen page_og(pages.at(pageno).getObjGen());
888
0
        int first_object = page_og.getObj();
889
0
        if (!m->xref_table.contains(page_og)) {
890
0
            stopOnError("unknown object in page offset hint table");
891
0
        }
892
0
        offset = getLinearizationOffset(page_og);
893
894
0
        HPageOffsetEntry& he = page_offset_hints_.entries.at(pageno);
895
0
        CHPageOffsetEntry& ce = c_page_offset_data_.entries.at(pageno);
896
0
        int h_nobjects = he.delta_nobjects + page_offset_hints_.min_nobjects;
897
0
        if (h_nobjects != ce.nobjects) {
898
            // This happens with pdlin when there are thumbnails.
899
0
            linearizationWarning(
900
0
                "object count mismatch for page " + std::to_string(pageno) + ": hint table = " +
901
0
                std::to_string(h_nobjects) + "; computed = " + std::to_string(ce.nobjects));
902
0
        }
903
904
        // Use value for number of objects in hint table rather than computed value if there is a
905
        // discrepancy.
906
0
        int length = lengthNextN(first_object, h_nobjects);
907
0
        int h_length = toI(he.delta_page_length + page_offset_hints_.min_page_length);
908
0
        if (length != h_length) {
909
            // This condition almost certainly indicates a bad hint table or a bug in this code.
910
0
            linearizationWarning(
911
0
                "page length mismatch for page " + std::to_string(pageno) + ": hint table = " +
912
0
                std::to_string(h_length) + "; computed length = " + std::to_string(length) +
913
0
                " (offset = " + std::to_string(offset) + ")");
914
0
        }
915
916
0
        offset += h_length;
917
918
        // Translate shared object indexes to object numbers.
919
0
        std::set<int> hint_shared;
920
0
        std::set<int> computed_shared;
921
922
0
        if (pageno == 0 && he.nshared_objects > 0) {
923
            // pdlin and Acrobat both do this even though the spec states clearly and unambiguously
924
            // that they should not.
925
0
            linearizationWarning("page 0 has shared identifier entries");
926
0
        }
927
928
0
        for (size_t i = 0; i < toS(he.nshared_objects); ++i) {
929
0
            int idx = he.shared_identifiers.at(i);
930
0
            no_ci_stop_if(
931
0
                !shared_idx_to_obj.contains(idx),
932
0
                "unable to get object for item in shared objects hint table");
933
934
0
            hint_shared.insert(shared_idx_to_obj[idx]);
935
0
        }
936
937
0
        for (size_t i = 0; i < toS(ce.nshared_objects); ++i) {
938
0
            int idx = ce.shared_identifiers.at(i);
939
0
            no_ci_stop_if(
940
0
                idx >= c_shared_object_data_.nshared_total,
941
0
                "index out of bounds for shared object hint table" //
942
0
            );
943
944
0
            int obj = c_shared_object_data_.entries.at(toS(idx)).object;
945
0
            computed_shared.insert(obj);
946
0
        }
947
948
0
        for (int iter: hint_shared) {
949
0
            if (!computed_shared.contains(iter)) {
950
                // pdlin puts thumbnails here even though it shouldn't
951
0
                linearizationWarning(
952
0
                    "page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
953
0
                    ": in hint table but not computed list");
954
0
            }
955
0
        }
956
957
0
        for (int iter: computed_shared) {
958
0
            if (!hint_shared.contains(iter)) {
959
                // Acrobat does not put some things including at least built-in fonts and procsets
960
                // here, at least in some cases.
961
0
                linearizationWarning(
962
0
                    ("page " + std::to_string(pageno) + ": shared object " + std::to_string(iter) +
963
0
                     ": in computed list but not hint table"));
964
0
            }
965
0
        }
966
0
    }
967
0
}
968
969
void
970
Lin::checkHSharedObject(std::vector<QPDFObjectHandle> const& pages, std::map<int, int>& idx_to_obj)
971
0
{
972
    // Implementation note 125 says shared object groups always contain only one object.
973
    // Implementation note 128 says that Acrobat always nbits_nobjects to zero.  Implementation note
974
    // 130 says that Acrobat does not support more than one shared object per group.  These are all
975
    // consistent.
976
977
    // Implementation note 129 states that MD5 signatures are not implemented in Acrobat, so
978
    // signature_present must always be zero.
979
980
    // Implementation note 131 states that first_shared_obj and first_shared_offset have meaningless
981
    // values for single-page files.
982
983
    // Empirically, Acrobat and pdlin generate incorrect values for these whenever there are no
984
    // shared objects not referenced by the first page (i.e., nshared_total == nshared_first_page).
985
986
0
    HSharedObject& so = shared_object_hints_;
987
0
    if (so.nshared_total < so.nshared_first_page) {
988
0
        linearizationWarning("shared object hint table: ntotal < nfirst_page");
989
0
    } else {
990
        // The first nshared_first_page objects are consecutive objects starting with the first page
991
        // object.  The rest are consecutive starting from the first_shared_obj object.
992
0
        int cur_object = pages.at(0).getObjectID();
993
0
        for (int i = 0; i < so.nshared_total; ++i) {
994
0
            if (i == so.nshared_first_page) {
995
0
                QTC::TC("qpdf", "QPDF lin check shared past first page");
996
0
                if (part8_.empty()) {
997
0
                    linearizationWarning("part 8 is empty but nshared_total > nshared_first_page");
998
0
                } else {
999
0
                    int obj = part8_.at(0).getObjectID();
1000
0
                    if (obj != so.first_shared_obj) {
1001
0
                        linearizationWarning(
1002
0
                            "first shared object number mismatch: hint table = " +
1003
0
                            std::to_string(so.first_shared_obj) +
1004
0
                            "; computed = " + std::to_string(obj));
1005
0
                    }
1006
0
                }
1007
1008
0
                cur_object = so.first_shared_obj;
1009
1010
0
                QPDFObjGen og(cur_object, 0);
1011
0
                if (!m->xref_table.contains(og)) {
1012
0
                    stopOnError("unknown object in shared object hint table");
1013
0
                }
1014
0
                qpdf_offset_t offset = getLinearizationOffset(og);
1015
0
                qpdf_offset_t h_offset = adjusted_offset(so.first_shared_offset);
1016
0
                if (offset != h_offset) {
1017
0
                    linearizationWarning(
1018
0
                        "first shared object offset mismatch: hint table = " +
1019
0
                        std::to_string(h_offset) + "; computed = " + std::to_string(offset));
1020
0
                }
1021
0
            }
1022
1023
0
            idx_to_obj[i] = cur_object;
1024
0
            HSharedObjectEntry& se = so.entries.at(toS(i));
1025
0
            int nobjects = se.nobjects_minus_one + 1;
1026
0
            int length = lengthNextN(cur_object, nobjects);
1027
0
            int h_length = so.min_group_length + se.delta_group_length;
1028
0
            if (length != h_length) {
1029
0
                linearizationWarning(
1030
0
                    "shared object " + std::to_string(i) + " length mismatch: hint table = " +
1031
0
                    std::to_string(h_length) + "; computed = " + std::to_string(length));
1032
0
            }
1033
0
            cur_object += nobjects;
1034
0
        }
1035
0
    }
1036
0
}
1037
1038
void
1039
Lin::checkHOutlines()
1040
0
{
1041
    // Empirically, Acrobat generates the correct value for the object number but incorrectly stores
1042
    // the next object number's offset as the offset, at least when outlines appear in part 6.  It
1043
    // also generates an incorrect value for length (specifically, the length that would cover the
1044
    // correct number of objects from the wrong starting place).  pdlin appears to generate correct
1045
    // values in those cases.
1046
1047
0
    if (c_outline_data_.nobjects == outline_hints_.nobjects) {
1048
0
        if (c_outline_data_.nobjects == 0) {
1049
0
            return;
1050
0
        }
1051
1052
0
        if (c_outline_data_.first_object == outline_hints_.first_object) {
1053
            // Check length and offset.  Acrobat gets these wrong.
1054
0
            QPDFObjectHandle outlines = qpdf.getRoot().getKey("/Outlines");
1055
0
            if (!outlines.isIndirect()) {
1056
                // This case is not exercised in test suite since not permitted by the spec, but if
1057
                // this does occur, the code below would fail.
1058
0
                linearizationWarning("/Outlines key of root dictionary is not indirect");
1059
0
                return;
1060
0
            }
1061
0
            QPDFObjGen og(outlines.getObjGen());
1062
0
            no_ci_stop_if(
1063
0
                !m->xref_table.contains(og), "unknown object in outlines hint table" //
1064
0
            );
1065
0
            qpdf_offset_t offset = getLinearizationOffset(og);
1066
0
            ObjUser ou(ObjUser::ou_root_key, "/Outlines");
1067
0
            int length = toI(maxEnd(ou) - offset);
1068
0
            qpdf_offset_t table_offset = adjusted_offset(outline_hints_.first_object_offset);
1069
0
            if (offset != table_offset) {
1070
0
                linearizationWarning(
1071
0
                    "incorrect offset in outlines table: hint table = " +
1072
0
                    std::to_string(table_offset) + "; computed = " + std::to_string(offset));
1073
0
            }
1074
0
            int table_length = outline_hints_.group_length;
1075
0
            if (length != table_length) {
1076
0
                linearizationWarning(
1077
0
                    "incorrect length in outlines table: hint table = " +
1078
0
                    std::to_string(table_length) + "; computed = " + std::to_string(length));
1079
0
            }
1080
0
        } else {
1081
0
            linearizationWarning("incorrect first object number in outline hints table.");
1082
0
        }
1083
0
    } else {
1084
0
        linearizationWarning("incorrect object count in outline hint table");
1085
0
    }
1086
0
}
1087
1088
void
1089
QPDF::showLinearizationData()
1090
0
{
1091
0
    m->lin.show_data();
1092
0
}
1093
1094
void
1095
Lin::show_data()
1096
0
{
1097
0
    try {
1098
0
        readLinearizationData();
1099
0
        checkLinearizationInternal();
1100
0
        dumpLinearizationDataInternal();
1101
0
    } catch (QPDFExc& e) {
1102
0
        linearizationWarning(e.what());
1103
0
    }
1104
0
}
1105
1106
void
1107
Lin::dumpLinearizationDataInternal()
1108
0
{
1109
0
    auto& info = *cf.log()->getInfo();
1110
1111
0
    info << m->file->getName() << ": linearization data:\n\n";
1112
1113
0
    info << "file_size: " << linp_.file_size << "\n"
1114
0
         << "first_page_object: " << linp_.first_page_object << "\n"
1115
0
         << "first_page_end: " << linp_.first_page_end << "\n"
1116
0
         << "npages: " << linp_.npages << "\n"
1117
0
         << "xref_zero_offset: " << linp_.xref_zero_offset << "\n"
1118
0
         << "first_page: " << linp_.first_page << "\n"
1119
0
         << "H_offset: " << linp_.H_offset << "\n"
1120
0
         << "H_length: " << linp_.H_length << "\n"
1121
0
         << "\n";
1122
1123
0
    info << "Page Offsets Hint Table\n\n";
1124
0
    dumpHPageOffset();
1125
0
    info << "\nShared Objects Hint Table\n\n";
1126
0
    dumpHSharedObject();
1127
1128
0
    if (outline_hints_.nobjects > 0) {
1129
0
        info << "\nOutlines Hint Table\n\n";
1130
0
        dumpHGeneric(outline_hints_);
1131
0
    }
1132
0
}
1133
1134
qpdf_offset_t
Lin::adjusted_offset(qpdf_offset_t offset)
{
    // Hint table location values disregard the hint table itself, so every offset at or beyond
    // H_offset has to be increased by H_length.  Offsets before H_offset are returned unchanged.
    qpdf_offset_t result = offset;
    if (offset >= linp_.H_offset) {
        result += linp_.H_length;
    }
    return result;
}
1144
1145
void
1146
Lin::dumpHPageOffset()
1147
0
{
1148
0
    auto& info = *cf.log()->getInfo();
1149
0
    HPageOffset& t = page_offset_hints_;
1150
0
    info << "min_nobjects: " << t.min_nobjects << "\n"
1151
0
         << "first_page_offset: " << adjusted_offset(t.first_page_offset) << "\n"
1152
0
         << "nbits_delta_nobjects: " << t.nbits_delta_nobjects << "\n"
1153
0
         << "min_page_length: " << t.min_page_length << "\n"
1154
0
         << "nbits_delta_page_length: " << t.nbits_delta_page_length << "\n"
1155
0
         << "min_content_offset: " << t.min_content_offset << "\n"
1156
0
         << "nbits_delta_content_offset: " << t.nbits_delta_content_offset << "\n"
1157
0
         << "min_content_length: " << t.min_content_length << "\n"
1158
0
         << "nbits_delta_content_length: " << t.nbits_delta_content_length << "\n"
1159
0
         << "nbits_nshared_objects: " << t.nbits_nshared_objects << "\n"
1160
0
         << "nbits_shared_identifier: " << t.nbits_shared_identifier << "\n"
1161
0
         << "nbits_shared_numerator: " << t.nbits_shared_numerator << "\n"
1162
0
         << "shared_denominator: " << t.shared_denominator << "\n";
1163
1164
0
    for (size_t i1 = 0; i1 < linp_.npages; ++i1) {
1165
0
        HPageOffsetEntry& pe = t.entries.at(i1);
1166
0
        info << "Page " << i1 << ":\n"
1167
0
             << "  nobjects: " << pe.delta_nobjects + t.min_nobjects << "\n"
1168
0
             << "  length: " << pe.delta_page_length + t.min_page_length
1169
0
             << "\n"
1170
             // content offset is relative to page, not file
1171
0
             << "  content_offset: " << pe.delta_content_offset + t.min_content_offset << "\n"
1172
0
             << "  content_length: " << pe.delta_content_length + t.min_content_length << "\n"
1173
0
             << "  nshared_objects: " << pe.nshared_objects << "\n";
1174
0
        for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2) {
1175
0
            info << "    identifier " << i2 << ": " << pe.shared_identifiers.at(i2) << "\n";
1176
0
            info << "    numerator " << i2 << ": " << pe.shared_numerators.at(i2) << "\n";
1177
0
        }
1178
0
    }
1179
0
}
1180
1181
void
1182
Lin::dumpHSharedObject()
1183
0
{
1184
0
    auto& info = *cf.log()->getInfo();
1185
0
    HSharedObject& t = shared_object_hints_;
1186
0
    info << "first_shared_obj: " << t.first_shared_obj << "\n"
1187
0
         << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) << "\n"
1188
0
         << "nshared_first_page: " << t.nshared_first_page << "\n"
1189
0
         << "nshared_total: " << t.nshared_total << "\n"
1190
0
         << "nbits_nobjects: " << t.nbits_nobjects << "\n"
1191
0
         << "min_group_length: " << t.min_group_length << "\n"
1192
0
         << "nbits_delta_group_length: " << t.nbits_delta_group_length << "\n";
1193
1194
0
    for (size_t i = 0; i < toS(t.nshared_total); ++i) {
1195
0
        HSharedObjectEntry& se = t.entries.at(i);
1196
0
        info << "Shared Object " << i << ":\n"
1197
0
             << "  group length: " << se.delta_group_length + t.min_group_length << "\n";
1198
        // PDF spec says signature present nobjects_minus_one are always 0, so print them only if
1199
        // they have a non-zero value.
1200
0
        if (se.signature_present) {
1201
0
            info << "  signature present\n";
1202
0
        }
1203
0
        if (se.nobjects_minus_one != 0) {
1204
0
            info << "  nobjects: " << se.nobjects_minus_one + 1 << "\n";
1205
0
        }
1206
0
    }
1207
0
}
1208
1209
void
1210
Lin::dumpHGeneric(HGeneric& t)
1211
0
{
1212
0
    *cf.log()->getInfo() << "first_object: " << t.first_object << "\n"
1213
0
                         << "first_object_offset: " << adjusted_offset(t.first_object_offset)
1214
0
                         << "\n"
1215
0
                         << "nobjects: " << t.nobjects << "\n"
1216
0
                         << "group_length: " << t.group_length << "\n";
1217
0
}
1218
1219
template <typename T>
1220
void
1221
Lin::calculateLinearizationData(T const& object_stream_data)
1222
0
{
1223
    // This function calculates the ordering of objects, divides them into the appropriate parts,
1224
    // and computes some values for the linearization parameter dictionary and hint tables.  The
1225
    // file must be optimized (via calling optimize()) prior to calling this function.  Note that
1226
    // actual offsets and lengths are not computed here, but anything related to object ordering is.
1227
1228
0
    util::assertion(
1229
0
        !object_to_obj_users_.empty(),
1230
0
        "INTERNAL ERROR: QPDF::calculateLinearizationData called before optimize()" //
1231
0
    );
1232
    // Note that we can't call optimize here because we don't know whether it should be called
1233
    // with or without allow changes.
1234
1235
    // Separate objects into the categories sufficient for us to determine which part of the
1236
    // linearized file should contain the object.  This categorization is useful for other purposes
1237
    // as well.  Part numbers refer to version 1.4 of the PDF spec.
1238
1239
    // Parts 1, 3, 5, 10, and 11 don't contain any objects from the original file (except the
1240
    // trailer dictionary in part 11).
1241
1242
    // Part 4 is the document catalog (root) and the following root keys: /ViewerPreferences,
1243
    // /PageMode, /Threads, /OpenAction, /AcroForm, /Encrypt.  Note that Thread information
1244
    // dictionaries are supposed to appear in part 9, but we are disregarding that recommendation
1245
    // for now.
1246
1247
    // Part 6 is the first page section.  It includes all remaining objects referenced by the first
1248
    // page including shared objects but not including thumbnails.  Additionally, if /PageMode is
1249
    // /Outlines, then information from /Outlines also appears here.
1250
1251
    // Part 7 contains remaining objects private to pages other than the first page.
1252
1253
    // Part 8 contains all remaining shared objects except those that are shared only within
1254
    // thumbnails.
1255
1256
    // Part 9 contains all remaining objects.
1257
1258
    // We sort objects into the following categories:
1259
1260
    //   * open_document: part 4
1261
1262
    //   * first_page_private: part 6
1263
1264
    //   * first_page_shared: part 6
1265
1266
    //   * other_page_private: part 7
1267
1268
    //   * other_page_shared: part 8
1269
1270
    //   * thumbnail_private: part 9
1271
1272
    //   * thumbnail_shared: part 9
1273
1274
    //   * other: part 9
1275
1276
    //   * outlines: part 6 or 9
1277
1278
0
    part4_.clear();
1279
0
    part6_.clear();
1280
0
    part7_.clear();
1281
0
    part8_.clear();
1282
0
    part9_.clear();
1283
0
    c_linp_ = LinParameters();
1284
0
    c_page_offset_data_ = CHPageOffset();
1285
0
    c_shared_object_data_ = CHSharedObject();
1286
0
    c_outline_data_ = HGeneric();
1287
1288
0
    QPDFObjectHandle root = qpdf.getRoot();
1289
0
    bool outlines_in_first_page = false;
1290
0
    QPDFObjectHandle pagemode = root.getKey("/PageMode");
1291
0
    QTC::TC("qpdf", "QPDF categorize pagemode present", pagemode.isName() ? 1 : 0);
1292
0
    if (pagemode.isName()) {
1293
0
        if (pagemode.getName() == "/UseOutlines") {
1294
0
            if (root.hasKey("/Outlines")) {
1295
0
                outlines_in_first_page = true;
1296
0
            } else {
1297
0
                QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1298
0
            }
1299
0
        }
1300
0
        QTC::TC("qpdf", "QPDF categorize pagemode outlines", outlines_in_first_page ? 1 : 0);
1301
0
    }
1302
1303
0
    std::set<std::string> open_document_keys;
1304
0
    open_document_keys.insert("/ViewerPreferences");
1305
0
    open_document_keys.insert("/PageMode");
1306
0
    open_document_keys.insert("/Threads");
1307
0
    open_document_keys.insert("/OpenAction");
1308
0
    open_document_keys.insert("/AcroForm");
1309
1310
0
    std::set<QPDFObjGen> lc_open_document;
1311
0
    std::set<QPDFObjGen> lc_first_page_private;
1312
0
    std::set<QPDFObjGen> lc_first_page_shared;
1313
0
    std::set<QPDFObjGen> lc_other_page_private;
1314
0
    std::set<QPDFObjGen> lc_other_page_shared;
1315
0
    std::set<QPDFObjGen> lc_thumbnail_private;
1316
0
    std::set<QPDFObjGen> lc_thumbnail_shared;
1317
0
    std::set<QPDFObjGen> lc_other;
1318
0
    std::set<QPDFObjGen> lc_outlines;
1319
0
    std::set<QPDFObjGen> lc_root;
1320
1321
0
    for (auto& [og, ous]: object_to_obj_users_) {
1322
0
        bool in_open_document = false;
1323
0
        bool in_first_page = false;
1324
0
        int other_pages = 0;
1325
0
        int thumbs = 0;
1326
0
        int others = 0;
1327
0
        bool in_outlines = false;
1328
0
        bool is_root = false;
1329
1330
0
        for (auto const& ou: ous) {
1331
0
            switch (ou.ou_type) {
1332
0
            case ObjUser::ou_trailer_key:
1333
0
                if (ou.key == "/Encrypt") {
1334
0
                    in_open_document = true;
1335
0
                } else {
1336
0
                    ++others;
1337
0
                }
1338
0
                break;
1339
1340
0
            case ObjUser::ou_thumb:
1341
0
                ++thumbs;
1342
0
                break;
1343
1344
0
            case ObjUser::ou_root_key:
1345
0
                if (open_document_keys.contains(ou.key)) {
1346
0
                    in_open_document = true;
1347
0
                } else if (ou.key == "/Outlines") {
1348
0
                    in_outlines = true;
1349
0
                } else {
1350
0
                    ++others;
1351
0
                }
1352
0
                break;
1353
1354
0
            case ObjUser::ou_page:
1355
0
                if (ou.pageno == 0) {
1356
0
                    in_first_page = true;
1357
0
                } else {
1358
0
                    ++other_pages;
1359
0
                }
1360
0
                break;
1361
1362
0
            case ObjUser::ou_root:
1363
0
                is_root = true;
1364
0
                break;
1365
0
            }
1366
0
        }
1367
1368
0
        if (is_root) {
1369
0
            lc_root.insert(og);
1370
0
        } else if (in_outlines) {
1371
0
            lc_outlines.insert(og);
1372
0
        } else if (in_open_document) {
1373
0
            lc_open_document.insert(og);
1374
0
        } else if ((in_first_page) && (others == 0) && (other_pages == 0) && (thumbs == 0)) {
1375
0
            lc_first_page_private.insert(og);
1376
0
        } else if (in_first_page) {
1377
0
            lc_first_page_shared.insert(og);
1378
0
        } else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) {
1379
0
            lc_other_page_private.insert(og);
1380
0
        } else if (other_pages > 1) {
1381
0
            lc_other_page_shared.insert(og);
1382
0
        } else if ((thumbs == 1) && (others == 0)) {
1383
0
            lc_thumbnail_private.insert(og);
1384
0
        } else if (thumbs > 1) {
1385
0
            lc_thumbnail_shared.insert(og);
1386
0
        } else {
1387
0
            lc_other.insert(og);
1388
0
        }
1389
0
    }
1390
1391
    // Generate ordering for objects in the output file.  Sometimes we just dump right from a set
1392
    // into a vector.  Rather than optimizing this by going straight into the vector, we'll leave
1393
    // these phases separate for now.  That way, this section can be concerned only with ordering,
1394
    // and the above section can be concerned only with categorization.  Note that sets of
1395
    // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file, objects appear in sequence with
1396
    // the possible exception of hints tables which we won't see here anyway.  That means that
1397
    // running calculateLinearizationData() on a linearized file should give results identical to
1398
    // the original file ordering.
1399
1400
    // We seem to traverse the page tree a lot in this code, but we can address this for a future
1401
    // code optimization if necessary. Premature optimization is the root of all evil.
1402
0
    std::vector<QPDFObjectHandle> uc_pages;
1403
0
    { // local scope
1404
        // Map all page objects to the containing object stream.  This should be a no-op in a
1405
        // properly linearized file.
1406
0
        for (auto oh: pages) {
1407
0
            uc_pages.emplace_back(getUncompressedObject(oh, object_stream_data));
1408
0
        }
1409
0
    }
1410
0
    size_t npages = pages.size();
1411
1412
    // We will be initializing some values of the computed hint tables.  Specifically, we can
1413
    // initialize any items that deal with object numbers or counts but not any items that deal with
1414
    // lengths or offsets.  The code that writes linearized files will have to fill in these values
1415
    // during the first pass.  The validation code can compute them relatively easily given the rest
1416
    // of the information.
1417
1418
    // npages is the size of the existing pages vector, which has been created by traversing the
1419
    // pages tree, and as such is a reasonable size.
1420
0
    c_linp_.npages = npages;
1421
0
    c_page_offset_data_.entries = std::vector<CHPageOffsetEntry>(npages);
1422
1423
    // Part 4: open document objects.  We don't care about the order.
1424
1425
0
    no_ci_stop_if(
1426
0
        lc_root.size() != 1, "found other than one root while calculating linearization data" //
1427
0
    );
1428
1429
0
    part4_.emplace_back(qpdf.getObject(*(lc_root.begin())));
1430
0
    for (auto const& og: lc_open_document) {
1431
0
        part4_.emplace_back(qpdf.getObject(og));
1432
0
    }
1433
1434
    // Part 6: first page objects.  Note: implementation note 124 states that Acrobat always treats
1435
    // page 0 as the first page for linearization regardless of /OpenAction.  pdlin doesn't provide
1436
    // any option to set this and also disregards /OpenAction.  We will do the same.
1437
1438
    // First, place the actual first page object itself.
1439
0
    no_ci_stop_if(
1440
0
        pages.empty(), "no pages found while calculating linearization data" //
1441
0
    );
1442
0
    QPDFObjGen first_page_og(uc_pages.at(0).getObjGen());
1443
0
    no_ci_stop_if(
1444
0
        !lc_first_page_private.erase(first_page_og), "unable to linearize first page" //
1445
0
    );
1446
0
    c_linp_.first_page_object = uc_pages.at(0).getObjectID();
1447
0
    part6_.emplace_back(uc_pages.at(0));
1448
1449
    // The PDF spec "recommends" an order for the rest of the objects, but we are going to disregard
1450
    // it except to the extent that it groups private and shared objects contiguously for the sake
1451
    // of hint tables.
1452
1453
0
    for (auto const& og: lc_first_page_private) {
1454
0
        part6_.emplace_back(qpdf.getObject(og));
1455
0
    }
1456
1457
0
    for (auto const& og: lc_first_page_shared) {
1458
0
        part6_.emplace_back(qpdf.getObject(og));
1459
0
    }
1460
1461
    // Place the outline dictionary if it goes in the first page section.
1462
0
    if (outlines_in_first_page) {
1463
0
        pushOutlinesToPart(part6_, lc_outlines, object_stream_data);
1464
0
    }
1465
1466
    // Fill in page offset hint table information for the first page. The PDF spec says that
1467
    // nshared_objects should be zero for the first page.  pdlin does not appear to obey this, but
1468
    // it fills in garbage values for all the shared object identifiers on the first page.
1469
1470
0
    c_page_offset_data_.entries.at(0).nobjects = toI(part6_.size());
1471
1472
    // Part 7: other pages' private objects
1473
1474
    // For each page in order:
1475
0
    for (size_t i = 1; i < npages; ++i) {
1476
        // Place this page's page object
1477
1478
0
        QPDFObjGen page_og(uc_pages.at(i).getObjGen());
1479
0
        no_ci_stop_if(
1480
0
            !lc_other_page_private.erase(page_og),
1481
0
            "unable to linearize page " + std::to_string(i) //
1482
0
        );
1483
1484
0
        part7_.emplace_back(uc_pages.at(i));
1485
1486
        // Place all non-shared objects referenced by this page, updating the page object count for
1487
        // the hint table.
1488
1489
0
        c_page_offset_data_.entries.at(i).nobjects = 1;
1490
1491
0
        ObjUser ou(ObjUser::ou_page, i);
1492
0
        no_ci_stop_if(
1493
0
            !obj_user_to_objects_.contains(ou),
1494
0
            "found unreferenced page while calculating linearization data" //
1495
0
        );
1496
1497
0
        for (auto const& og: obj_user_to_objects_[ou]) {
1498
0
            if (lc_other_page_private.erase(og)) {
1499
0
                part7_.emplace_back(qpdf.getObject(og));
1500
0
                ++c_page_offset_data_.entries.at(i).nobjects;
1501
0
            }
1502
0
        }
1503
0
    }
1504
    // That should have covered all part7 objects.
1505
0
    util::assertion(
1506
0
        lc_other_page_private.empty(),
1507
0
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_other_page_private is not empty "
1508
0
        "after generation of part7" //
1509
0
    );
1510
1511
    // Part 8: other pages' shared objects
1512
1513
    // Order is unimportant.
1514
0
    for (auto const& og: lc_other_page_shared) {
1515
0
        part8_.emplace_back(qpdf.getObject(og));
1516
0
    }
1517
1518
    // Part 9: other objects
1519
1520
    // The PDF specification makes recommendations on ordering here. We follow them only to a
1521
    // limited extent.  Specifically, we put the pages tree first, then private thumbnail objects in
1522
    // page order, then shared thumbnail objects, and then outlines (unless in part 6).  After that,
1523
    // we throw all remaining objects in arbitrary order.
1524
1525
    // Place the pages tree.
1526
0
    auto& pages_ogs = obj_user_to_objects_[{ObjUser::ou_root_key, "/Pages"}];
1527
0
    no_ci_stop_if(
1528
0
        pages_ogs.empty(), "found empty pages tree while calculating linearization data" //
1529
0
    );
1530
0
    for (auto const& og: pages_ogs) {
1531
0
        if (lc_other.erase(og)) {
1532
0
            part9_.emplace_back(qpdf.getObject(og));
1533
0
        }
1534
0
    }
1535
1536
    // Place private thumbnail images in page order.  Slightly more information would be required if
1537
    // we were going to bother with thumbnail hint tables.
1538
0
    for (size_t i = 0; i < npages; ++i) {
1539
0
        QPDFObjectHandle thumb = uc_pages.at(i).getKey("/Thumb");
1540
0
        thumb = getUncompressedObject(thumb, object_stream_data);
1541
0
        QPDFObjGen thumb_og(thumb.getObjGen());
1542
        // Output the thumbnail itself
1543
0
        if (lc_thumbnail_private.erase(thumb_og) && !thumb.null()) {
1544
0
            part9_.emplace_back(thumb);
1545
0
        } else {
1546
            // No internal error this time...there's nothing to stop this object from having
1547
            // been referred to somewhere else outside of a page's /Thumb, and if it had been,
1548
            // there's nothing to prevent it from having been in some set other than
1549
            // lc_thumbnail_private.
1550
0
        }
1551
0
        for (auto const& og: obj_user_to_objects_[{ObjUser::ou_thumb, i}]) {
1552
0
            if (lc_thumbnail_private.erase(og)) {
1553
0
                part9_.emplace_back(qpdf.getObject(og));
1554
0
            }
1555
0
        }
1556
0
    }
1557
0
    util::assertion(
1558
0
        lc_thumbnail_private.empty(),
1559
0
        "INTERNAL ERROR: QPDF::calculateLinearizationData: lc_thumbnail_private not "
1560
0
        "empty after placing thumbnails" //
1561
0
    );
1562
1563
    // Place shared thumbnail objects
1564
0
    for (auto const& og: lc_thumbnail_shared) {
1565
0
        part9_.emplace_back(qpdf.getObject(og));
1566
0
    }
1567
1568
    // Place outlines unless in first page
1569
0
    if (!outlines_in_first_page) {
1570
0
        pushOutlinesToPart(part9_, lc_outlines, object_stream_data);
1571
0
    }
1572
1573
    // Place all remaining objects
1574
0
    for (auto const& og: lc_other) {
1575
0
        part9_.emplace_back(qpdf.getObject(og));
1576
0
    }
1577
1578
    // Make sure we got everything exactly once.
1579
1580
0
    size_t num_placed =
1581
0
        part4_.size() + part6_.size() + part7_.size() + part8_.size() + part9_.size();
1582
0
    size_t num_wanted = object_to_obj_users_.size();
1583
0
    no_ci_stop_if(
1584
        // This can happen with damaged files, e.g. if the root is part of the pages tree.
1585
0
        num_placed != num_wanted,
1586
0
        "QPDF::calculateLinearizationData: wrong number of objects placed (num_placed = " +
1587
0
            std::to_string(num_placed) + "; number of objects: " + std::to_string(num_wanted) +
1588
0
            "\nIf the file did not generate any other warnings please report this as a bug." //
1589
0
    );
1590
1591
    // Calculate shared object hint table information including references to shared objects from
1592
    // page offset hint data.
1593
1594
    // The shared object hint table consists of all part 6 (whether shared or not) in order followed
1595
    // by all part 8 objects in order.  Add the objects to shared object data keeping a map of
1596
    // object number to index.  Then populate the shared object information for the pages.
1597
1598
    // Note that two objects never have the same object number, so we can map from object number
1599
    // only without regards to generation.
1600
0
    std::map<int, int> obj_to_index;
1601
1602
0
    c_shared_object_data_.nshared_first_page = toI(part6_.size());
1603
0
    c_shared_object_data_.nshared_total =
1604
0
        c_shared_object_data_.nshared_first_page + toI(part8_.size());
1605
1606
0
    std::vector<CHSharedObjectEntry>& shared = c_shared_object_data_.entries;
1607
0
    for (auto& oh: part6_) {
1608
0
        int obj = oh.getObjectID();
1609
0
        obj_to_index[obj] = toI(shared.size());
1610
0
        shared.emplace_back(obj);
1611
0
    }
1612
0
    QTC::TC("qpdf", "QPDF lin part 8 empty", part8_.empty() ? 1 : 0);
1613
0
    if (!part8_.empty()) {
1614
0
        c_shared_object_data_.first_shared_obj = part8_.at(0).getObjectID();
1615
0
        for (auto& oh: part8_) {
1616
0
            int obj = oh.getObjectID();
1617
0
            obj_to_index[obj] = toI(shared.size());
1618
0
            shared.emplace_back(obj);
1619
0
        }
1620
0
    }
1621
0
    no_ci_stop_if(
1622
0
        std::cmp_not_equal(
1623
0
            c_shared_object_data_.nshared_total, c_shared_object_data_.entries.size()),
1624
0
        "shared object hint table has wrong number of entries" //
1625
0
    );
1626
1627
    // Now compute the list of shared objects for each page after the first page.
1628
1629
0
    for (size_t i = 1; i < npages; ++i) {
1630
0
        CHPageOffsetEntry& pe = c_page_offset_data_.entries.at(i);
1631
0
        ObjUser ou(ObjUser::ou_page, i);
1632
0
        no_ci_stop_if(
1633
0
            !obj_user_to_objects_.contains(ou),
1634
0
            "found unreferenced page while calculating linearization data" //
1635
0
        );
1636
1637
0
        for (auto const& og: obj_user_to_objects_[ou]) {
1638
0
            if (object_to_obj_users_[og].size() > 1 && obj_to_index.contains(og.getObj())) {
1639
0
                int idx = obj_to_index[og.getObj()];
1640
0
                ++pe.nshared_objects;
1641
0
                pe.shared_identifiers.push_back(idx);
1642
0
            }
1643
0
        }
1644
0
    }
1645
0
}
Unexecuted instantiation: void QPDF::Doc::Linearization::calculateLinearizationData<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
Unexecuted instantiation: void QPDF::Doc::Linearization::calculateLinearizationData<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&)
1646
1647
template <typename T>
1648
void
1649
Lin::pushOutlinesToPart(
1650
    std::vector<QPDFObjectHandle>& part,
1651
    std::set<QPDFObjGen>& lc_outlines,
1652
    T const& object_stream_data)
1653
0
{
1654
0
    QPDFObjectHandle root = qpdf.getRoot();
1655
0
    QPDFObjectHandle outlines = root.getKey("/Outlines");
1656
0
    if (outlines.null()) {
1657
0
        return;
1658
0
    }
1659
0
    outlines = getUncompressedObject(outlines, object_stream_data);
1660
0
    QPDFObjGen outlines_og(outlines.getObjGen());
1661
0
    QTC::TC(
1662
0
        "qpdf",
1663
0
        "QPDF lin outlines in part",
1664
0
        &part == &part6_         ? 0
1665
0
            : (&part == &part9_) ? 1
1666
0
                                 : 9999); // can't happen
1667
0
    if (lc_outlines.erase(outlines_og)) {
1668
        // Make sure outlines is in lc_outlines in case the file is damaged. in which case it may be
1669
        // included in an earlier part.
1670
0
        part.emplace_back(outlines);
1671
0
        c_outline_data_.first_object = outlines_og.getObj();
1672
0
        c_outline_data_.nobjects = 1;
1673
0
    }
1674
0
    for (auto const& og: lc_outlines) {
1675
0
        if (!c_outline_data_.first_object) {
1676
0
            c_outline_data_.first_object = og.getObj();
1677
0
        }
1678
0
        part.emplace_back(qpdf.getObject(og));
1679
0
        ++c_outline_data_.nobjects;
1680
0
    }
1681
0
}
Unexecuted instantiation: void QPDF::Doc::Linearization::pushOutlinesToPart<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&)
Unexecuted instantiation: void QPDF::Doc::Linearization::pushOutlinesToPart<QPDFWriter::ObjTable>(std::__1::vector<QPDFObjectHandle, std::__1::allocator<QPDFObjectHandle> >&, std::__1::set<QPDFObjGen, std::__1::less<QPDFObjGen>, std::__1::allocator<QPDFObjGen> >&, QPDFWriter::ObjTable const&)
1682
1683
void
1684
Lin::parts(
1685
    QPDFWriter::ObjTable const& obj,
1686
    std::vector<QPDFObjectHandle>& part4,
1687
    std::vector<QPDFObjectHandle>& part6,
1688
    std::vector<QPDFObjectHandle>& part7,
1689
    std::vector<QPDFObjectHandle>& part8,
1690
    std::vector<QPDFObjectHandle>& part9)
1691
0
{
1692
0
    calculateLinearizationData(obj);
1693
0
    part4 = part4_;
1694
0
    part6 = part6_;
1695
0
    part7 = part7_;
1696
0
    part8 = part8_;
1697
0
    part9 = part9_;
1698
0
}
1699
1700
// Return the number of bits needed to represent val, i.e. the position of its highest set bit
// (0 for val == 0, 1 for val == 1, 8 for val == 255, ...).
//
// Non-positive values yield 0. The previous recursive form never terminated for negative input
// because an arithmetic right shift of a negative value never reaches zero.
static inline int
nbits(int val)
{
    int bits = 0;
    for (; val > 0; val >>= 1) {
        ++bits;
    }
    return bits;
}
1705
1706
int
1707
Lin::outputLengthNextN(
1708
    int in_object, int n, QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1709
0
{
1710
    // Figure out the length of a series of n consecutive objects in the output file starting with
1711
    // whatever object in_object from the input file mapped to.
1712
1713
0
    int first = obj[in_object].renumber;
1714
0
    int last = first + n;
1715
0
    no_ci_stop_if(
1716
0
        first <= 0, "found object that is not renumbered while writing linearization data");
1717
0
    qpdf_offset_t length = 0;
1718
0
    for (int i = first; i < last; ++i) {
1719
0
        auto l = new_obj[i].length;
1720
0
        no_ci_stop_if(
1721
0
            l == 0, "found item with unknown length while writing linearization data" //
1722
0
        );
1723
0
        length += l;
1724
0
    }
1725
0
    return toI(length);
1726
0
}
1727
1728
void
1729
Lin::calculateHPageOffset(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1730
0
{
1731
    // Page Offset Hint Table
1732
1733
    // We are purposely leaving some values set to their initial zero values.
1734
1735
0
    auto const& all_pages = pages.all();
1736
0
    size_t npages = all_pages.size();
1737
0
    CHPageOffset& cph = c_page_offset_data_;
1738
0
    std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1739
1740
    // Calculate minimum and maximum values for number of objects per page and page length.
1741
1742
0
    int min_nobjects = std::numeric_limits<int>::max();
1743
0
    int max_nobjects = 0;
1744
0
    int min_length = std::numeric_limits<int>::max();
1745
0
    int max_length = 0;
1746
0
    int max_shared = 0;
1747
1748
0
    HPageOffset& ph = page_offset_hints_;
1749
0
    std::vector<HPageOffsetEntry>& phe = ph.entries;
1750
    // npages is the size of the existing pages array.
1751
0
    phe = std::vector<HPageOffsetEntry>(npages);
1752
1753
0
    size_t i = 0;
1754
0
    for (auto& phe_i: phe) {
1755
        // Calculate values for each page, assigning full values to the delta items.  They will be
1756
        // adjusted later.
1757
1758
        // Repeat calculations for page 0 so we can assign to phe[i] without duplicating those
1759
        // assignments.
1760
1761
0
        int nobjects = cphe.at(i).nobjects;
1762
0
        int length = outputLengthNextN(all_pages.at(i).getObjectID(), nobjects, new_obj, obj);
1763
0
        int nshared = cphe.at(i).nshared_objects;
1764
1765
0
        min_nobjects = std::min(min_nobjects, nobjects);
1766
0
        max_nobjects = std::max(max_nobjects, nobjects);
1767
0
        min_length = std::min(min_length, length);
1768
0
        max_length = std::max(max_length, length);
1769
0
        max_shared = std::max(max_shared, nshared);
1770
1771
0
        phe_i.delta_nobjects = nobjects;
1772
0
        phe_i.delta_page_length = length;
1773
0
        phe_i.nshared_objects = nshared;
1774
0
        ++i;
1775
0
    }
1776
1777
0
    ph.min_nobjects = min_nobjects;
1778
0
    ph.first_page_offset = new_obj[obj[all_pages.at(0)].renumber].xref.getOffset();
1779
0
    ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
1780
0
    ph.min_page_length = min_length;
1781
0
    ph.nbits_delta_page_length = nbits(max_length - min_length);
1782
0
    ph.nbits_nshared_objects = nbits(max_shared);
1783
0
    ph.nbits_shared_identifier = nbits(c_shared_object_data_.nshared_total);
1784
0
    ph.shared_denominator = 4; // doesn't matter
1785
1786
    // It isn't clear how to compute content offset and content length.  Since we are not
1787
    // interleaving page objects with the content stream, we'll use the same values for content
1788
    // length as page length.  We will use 0 as content offset because this is what Adobe does
1789
    // (implementation note 127) and pdlin as well.
1790
0
    ph.nbits_delta_content_length = ph.nbits_delta_page_length;
1791
0
    ph.min_content_length = ph.min_page_length;
1792
1793
0
    i = 0;
1794
0
    for (auto& phe_i: phe) {
1795
        // Adjust delta entries
1796
0
        if (phe_i.delta_nobjects < min_nobjects || phe_i.delta_page_length < min_length) {
1797
0
            stopOnError(
1798
0
                "found too small delta nobjects or delta page length while writing "
1799
0
                "linearization data");
1800
0
        }
1801
0
        phe_i.delta_nobjects -= min_nobjects;
1802
0
        phe_i.delta_page_length -= min_length;
1803
0
        phe_i.delta_content_length = phe_i.delta_page_length;
1804
1805
0
        auto& si = cphe.at(i).shared_identifiers;
1806
0
        phe_i.shared_identifiers.insert(phe_i.shared_identifiers.end(), si.begin(), si.end());
1807
0
        phe_i.shared_numerators.insert(phe_i.shared_numerators.end(), si.size(), 0);
1808
0
        ++i;
1809
0
    }
1810
0
}
1811
1812
void
1813
Lin::calculateHSharedObject(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1814
0
{
1815
0
    CHSharedObject& cso = c_shared_object_data_;
1816
0
    std::vector<CHSharedObjectEntry>& csoe = cso.entries;
1817
0
    HSharedObject& so = shared_object_hints_;
1818
0
    std::vector<HSharedObjectEntry>& soe = so.entries;
1819
0
    soe.clear();
1820
1821
0
    int min_length = outputLengthNextN(csoe.at(0).object, 1, new_obj, obj);
1822
0
    int max_length = min_length;
1823
1824
0
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1825
        // Assign absolute numbers to deltas; adjust later
1826
0
        int length = outputLengthNextN(csoe.at(i).object, 1, new_obj, obj);
1827
0
        min_length = std::min(min_length, length);
1828
0
        max_length = std::max(max_length, length);
1829
0
        soe.emplace_back();
1830
0
        soe.at(i).delta_group_length = length;
1831
0
    }
1832
0
    no_ci_stop_if(
1833
0
        soe.size() != toS(cso.nshared_total), "soe has wrong size after initialization" //
1834
0
    );
1835
1836
0
    so.nshared_total = cso.nshared_total;
1837
0
    so.nshared_first_page = cso.nshared_first_page;
1838
0
    if (so.nshared_total > so.nshared_first_page) {
1839
0
        so.first_shared_obj = obj[cso.first_shared_obj].renumber;
1840
0
        so.min_group_length = min_length;
1841
0
        so.first_shared_offset = new_obj[so.first_shared_obj].xref.getOffset();
1842
0
    }
1843
0
    so.min_group_length = min_length;
1844
0
    so.nbits_delta_group_length = nbits(max_length - min_length);
1845
1846
0
    for (size_t i = 0; i < toS(cso.nshared_total); ++i) {
1847
        // Adjust deltas
1848
0
        no_ci_stop_if(
1849
0
            soe.at(i).delta_group_length < min_length,
1850
0
            "found too small group length while writing linearization data" //
1851
0
        );
1852
1853
0
        soe.at(i).delta_group_length -= min_length;
1854
0
    }
1855
0
}
1856
1857
void
1858
Lin::calculateHOutline(QPDFWriter::NewObjTable const& new_obj, QPDFWriter::ObjTable const& obj)
1859
0
{
1860
0
    HGeneric& cho = c_outline_data_;
1861
1862
0
    if (cho.nobjects == 0) {
1863
0
        return;
1864
0
    }
1865
1866
0
    HGeneric& ho = outline_hints_;
1867
1868
0
    ho.first_object = obj[cho.first_object].renumber;
1869
0
    ho.first_object_offset = new_obj[ho.first_object].xref.getOffset();
1870
0
    ho.nobjects = cho.nobjects;
1871
0
    ho.group_length = outputLengthNextN(cho.first_object, ho.nobjects, new_obj, obj);
1872
0
}
1873
1874
template <class T, class int_type>
1875
static void
1876
write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, int bits, int_type T::* field)
1877
0
{
1878
    // nitems times, write bits bits from the given field of the ith vector to the given bit writer.
1879
1880
0
    for (size_t i = 0; i < QIntC::to_size(nitems); ++i) {
1881
0
        w.writeBits(QIntC::to_ulonglong(vec.at(i).*field), QIntC::to_size(bits));
1882
0
    }
1883
    // The PDF spec says that each hint table starts at a byte boundary.  Each "row" actually must
1884
    // start on a byte boundary.
1885
0
    w.flush();
1886
0
}
Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, int>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, int QPDF::Doc::Linearization::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HPageOffsetEntry, long long>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HPageOffsetEntry, std::__1::allocator<QPDF::Doc::Linearization::HPageOffsetEntry> >&, int, long long QPDF::Doc::Linearization::HPageOffsetEntry::*)
Unexecuted instantiation: QPDF_linearization.cc:void write_vector_int<QPDF::Doc::Linearization::HSharedObjectEntry, int>(BitWriter&, int, std::__1::vector<QPDF::Doc::Linearization::HSharedObjectEntry, std::__1::allocator<QPDF::Doc::Linearization::HSharedObjectEntry> >&, int, int QPDF::Doc::Linearization::HSharedObjectEntry::*)
1887
1888
template <class T>
1889
static void
1890
write_vector_vector(
1891
    BitWriter& w,
1892
    int nitems1,
1893
    std::vector<T>& vec1,
1894
    int T::* nitems2,
1895
    int bits,
1896
    std::vector<int> T::* vec2)
1897
0
{
1898
    // nitems1 times, write nitems2 (from the ith element of vec1) items from the vec2 vector field
1899
    // of the ith item of vec1.
1900
0
    for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1) {
1901
0
        for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2) {
1902
0
            w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)), QIntC::to_size(bits));
1903
0
        }
1904
0
    }
1905
0
    w.flush();
1906
0
}
1907
1908
void
1909
Lin::writeHPageOffset(BitWriter& w)
1910
0
{
1911
0
    HPageOffset& t = page_offset_hints_;
1912
1913
0
    w.writeBitsInt(t.min_nobjects, 32);               // 1
1914
0
    w.writeBits(toULL(t.first_page_offset), 32);      // 2
1915
0
    w.writeBitsInt(t.nbits_delta_nobjects, 16);       // 3
1916
0
    w.writeBitsInt(t.min_page_length, 32);            // 4
1917
0
    w.writeBitsInt(t.nbits_delta_page_length, 16);    // 5
1918
0
    w.writeBits(toULL(t.min_content_offset), 32);     // 6
1919
0
    w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7
1920
0
    w.writeBitsInt(t.min_content_length, 32);         // 8
1921
0
    w.writeBitsInt(t.nbits_delta_content_length, 16); // 9
1922
0
    w.writeBitsInt(t.nbits_nshared_objects, 16);      // 10
1923
0
    w.writeBitsInt(t.nbits_shared_identifier, 16);    // 11
1924
0
    w.writeBitsInt(t.nbits_shared_numerator, 16);     // 12
1925
0
    w.writeBitsInt(t.shared_denominator, 16);         // 13
1926
1927
0
    int nitems = toI(pages.size());
1928
0
    std::vector<HPageOffsetEntry>& entries = t.entries;
1929
1930
0
    write_vector_int(w, nitems, entries, t.nbits_delta_nobjects, &HPageOffsetEntry::delta_nobjects);
1931
0
    write_vector_int(
1932
0
        w, nitems, entries, t.nbits_delta_page_length, &HPageOffsetEntry::delta_page_length);
1933
0
    write_vector_int(
1934
0
        w, nitems, entries, t.nbits_nshared_objects, &HPageOffsetEntry::nshared_objects);
1935
0
    write_vector_vector(
1936
0
        w,
1937
0
        nitems,
1938
0
        entries,
1939
0
        &HPageOffsetEntry::nshared_objects,
1940
0
        t.nbits_shared_identifier,
1941
0
        &HPageOffsetEntry::shared_identifiers);
1942
0
    write_vector_vector(
1943
0
        w,
1944
0
        nitems,
1945
0
        entries,
1946
0
        &HPageOffsetEntry::nshared_objects,
1947
0
        t.nbits_shared_numerator,
1948
0
        &HPageOffsetEntry::shared_numerators);
1949
0
    write_vector_int(
1950
0
        w, nitems, entries, t.nbits_delta_content_offset, &HPageOffsetEntry::delta_content_offset);
1951
0
    write_vector_int(
1952
0
        w, nitems, entries, t.nbits_delta_content_length, &HPageOffsetEntry::delta_content_length);
1953
0
}
1954
1955
void
1956
Lin::writeHSharedObject(BitWriter& w)
1957
0
{
1958
0
    HSharedObject& t = shared_object_hints_;
1959
1960
0
    w.writeBitsInt(t.first_shared_obj, 32);         // 1
1961
0
    w.writeBits(toULL(t.first_shared_offset), 32);  // 2
1962
0
    w.writeBitsInt(t.nshared_first_page, 32);       // 3
1963
0
    w.writeBitsInt(t.nshared_total, 32);            // 4
1964
0
    w.writeBitsInt(t.nbits_nobjects, 16);           // 5
1965
0
    w.writeBitsInt(t.min_group_length, 32);         // 6
1966
0
    w.writeBitsInt(t.nbits_delta_group_length, 16); // 7
1967
1968
0
    QTC::TC(
1969
0
        "qpdf",
1970
0
        "QPDF lin write nshared_total > nshared_first_page",
1971
0
        (t.nshared_total > t.nshared_first_page) ? 1 : 0);
1972
1973
0
    int nitems = t.nshared_total;
1974
0
    std::vector<HSharedObjectEntry>& entries = t.entries;
1975
1976
0
    write_vector_int(
1977
0
        w, nitems, entries, t.nbits_delta_group_length, &HSharedObjectEntry::delta_group_length);
1978
0
    write_vector_int(w, nitems, entries, 1, &HSharedObjectEntry::signature_present);
1979
0
    for (size_t i = 0; i < toS(nitems); ++i) {
1980
        // If signature were present, we'd have to write a 128-bit hash.
1981
0
        if (entries.at(i).signature_present != 0) {
1982
0
            stopOnError("found unexpected signature present while writing linearization data");
1983
0
        }
1984
0
    }
1985
0
    write_vector_int(w, nitems, entries, t.nbits_nobjects, &HSharedObjectEntry::nobjects_minus_one);
1986
0
}
1987
1988
void
1989
Lin::writeHGeneric(BitWriter& w, HGeneric& t)
1990
0
{
1991
0
    w.writeBitsInt(t.first_object, 32);            // 1
1992
0
    w.writeBits(toULL(t.first_object_offset), 32); // 2
1993
0
    w.writeBitsInt(t.nobjects, 32);                // 3
1994
0
    w.writeBitsInt(t.group_length, 32);            // 4
1995
0
}
1996
1997
void
1998
Lin::generateHintStream(
1999
    QPDFWriter::NewObjTable const& new_obj,
2000
    QPDFWriter::ObjTable const& obj,
2001
    std::string& hint_buffer,
2002
    int& S,
2003
    int& O,
2004
    bool compressed)
2005
0
{
2006
    // Populate actual hint table values
2007
0
    calculateHPageOffset(new_obj, obj);
2008
0
    calculateHSharedObject(new_obj, obj);
2009
0
    calculateHOutline(new_obj, obj);
2010
2011
    // Write the hint stream itself into a compressed memory buffer. Write through a counter so we
2012
    // can get offsets.
2013
0
    pl::Count c(0, hint_buffer);
2014
0
    BitWriter w(&c);
2015
2016
0
    writeHPageOffset(w);
2017
0
    S = toI(c.getCount());
2018
0
    writeHSharedObject(w);
2019
0
    O = 0;
2020
0
    if (outline_hints_.nobjects > 0) {
2021
0
        O = toI(c.getCount());
2022
0
        writeHGeneric(w, outline_hints_);
2023
0
    }
2024
0
    if (compressed) {
2025
0
        hint_buffer = pl::pipe<Pl_Flate>(hint_buffer, Pl_Flate::a_deflate);
2026
0
    }
2027
0
}