Coverage Report

Created: 2025-10-12 07:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDF_optimization.cc
Line
Count
Source
1
// See the "Optimization" section of the manual.
2
3
#include <qpdf/QPDF_private.hh>
4
5
#include <qpdf/QPDFExc.hh>
6
#include <qpdf/QPDFObjectHandle_private.hh>
7
#include <qpdf/QPDFWriter_private.hh>
8
#include <qpdf/QTC.hh>
9
10
using Lin = QPDF::Doc::Linearization;
11
using Pages = QPDF::Doc::Pages;
12
13
QPDF::ObjUser::ObjUser(user_e type) :
14
0
    ou_type(type)
15
0
{
16
0
    qpdf_assert_debug(type == ou_root);
17
0
}
18
19
QPDF::ObjUser::ObjUser(user_e type, size_t pageno) :
20
0
    ou_type(type),
21
0
    pageno(pageno)
22
0
{
23
0
    qpdf_assert_debug((type == ou_page) || (type == ou_thumb));
24
0
}
25
26
QPDF::ObjUser::ObjUser(user_e type, std::string const& key) :
27
0
    ou_type(type),
28
0
    key(key)
29
0
{
30
0
    qpdf_assert_debug((type == ou_trailer_key) || (type == ou_root_key));
31
0
}
32
33
bool
34
QPDF::ObjUser::operator<(ObjUser const& rhs) const
35
0
{
36
0
    if (ou_type < rhs.ou_type) {
37
0
        return true;
38
0
    }
39
0
    if (ou_type == rhs.ou_type) {
40
0
        if (pageno < rhs.pageno) {
41
0
            return true;
42
0
        }
43
0
        if (pageno == rhs.pageno) {
44
0
            return key < rhs.key;
45
0
        }
46
0
    }
47
0
    return false;
48
0
}
49
50
// One unit of pending work for the traversal in updateObjectMaps: the user on whose behalf an
// object is visited, the object itself, and whether the object is the traversal's starting
// point (top) as opposed to something reached from it.
QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame(
    QPDF::ObjUser const& ou,
    QPDFObjectHandle oh,
    bool top) :
    ou(ou), oh(oh), top(top)
{
}
57
58
void
59
QPDF::optimize(
60
    std::map<int, int> const& object_stream_data,
61
    bool allow_changes,
62
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
63
0
{
64
0
    m->lin.optimize_internal(object_stream_data, allow_changes, skip_stream_parameters);
65
0
}
66
67
void
68
Lin::optimize(
69
    QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
70
0
{
71
0
    optimize_internal(obj, true, skip_stream_parameters);
72
0
}
73
74
template <typename T>
75
void
76
Lin::optimize_internal(
77
    T const& object_stream_data,
78
    bool allow_changes,
79
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
80
0
{
81
0
    if (!m->obj_user_to_objects.empty()) {
82
        // already optimized
83
0
        return;
84
0
    }
85
86
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
87
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
88
0
    QPDFObjectHandle root = qpdf.getRoot();
89
0
    if (root.getKey("/Outlines").isDictionary()) {
90
0
        QPDFObjectHandle outlines = root.getKey("/Outlines");
91
0
        if (!outlines.isIndirect()) {
92
0
            root.replaceKey("/Outlines", qpdf.makeIndirectObject(outlines));
93
0
        }
94
0
    }
95
96
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
97
    // initializes m->all_pages.
98
0
    m->pages.pushInheritedAttributesToPage(allow_changes, false);
99
100
    // Traverse pages
101
0
    size_t n = m->all_pages.size();
102
0
    for (size_t pageno = 0; pageno < n; ++pageno) {
103
0
        updateObjectMaps(
104
0
            ObjUser(ObjUser::ou_page, pageno), m->all_pages.at(pageno), skip_stream_parameters);
105
0
    }
106
107
    // Traverse document-level items
108
0
    for (auto const& [key, value]: m->trailer.as_dictionary()) {
109
0
        if (key == "/Root") {
110
            // handled separately
111
0
        } else {
112
0
            if (!value.null()) {
113
0
                updateObjectMaps(
114
0
                    ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters);
115
0
            }
116
0
        }
117
0
    }
118
119
0
    for (auto const& [key, value]: root.as_dictionary()) {
120
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
121
        // we are going to disregard that specification for now.  There is loads of evidence that
122
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
123
        // certain not to cause any problems.
124
0
        if (!value.null()) {
125
0
            updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters);
126
0
        }
127
0
    }
128
129
0
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
130
0
    auto root_og = QPDFObjGen(root.getObjGen());
131
0
    m->obj_user_to_objects[root_ou].insert(root_og);
132
0
    m->object_to_obj_users[root_og].insert(root_ou);
133
134
0
    filterCompressedObjects(object_stream_data);
135
0
}
Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
Unexecuted instantiation: void QPDF::Doc::Linearization::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
136
137
void
138
QPDF::pushInheritedAttributesToPage()
139
0
{
140
    // Public API should not have access to allow_changes.
141
0
    m->pages.pushInheritedAttributesToPage(true, false);
142
0
}
143
144
void
145
Pages::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
146
2.67k
{
147
    // Traverse pages tree pushing all inherited resources down to the page level.
148
149
    // The record of whether we've done this is cleared by updateAllPagesCache().  If we're warning
150
    // for skipped keys, re-traverse unconditionally.
151
2.67k
    if (m->pushed_inherited_attributes_to_pages && (!warn_skipped_keys)) {
152
0
        return;
153
0
    }
154
155
    // Calling getAllPages() resolves any duplicated page objects, repairs broken nodes, and detects
156
    // loops, so we don't have to do those activities here.
157
2.67k
    qpdf.getAllPages();
158
159
    // key_ancestors is a mapping of page attribute keys to a stack of Pages nodes that contain
160
    // values for them.
161
2.67k
    std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
162
2.67k
    pushInheritedAttributesToPageInternal(
163
2.67k
        m->trailer.getKey("/Root").getKey("/Pages"),
164
2.67k
        key_ancestors,
165
2.67k
        allow_changes,
166
2.67k
        warn_skipped_keys);
167
2.67k
    if (!key_ancestors.empty()) {
168
0
        throw std::logic_error(
169
0
            "key_ancestors not empty after pushing inherited attributes to pages");
170
0
    }
171
2.67k
    m->pushed_inherited_attributes_to_pages = true;
172
2.67k
    m->ever_pushed_inherited_attributes_to_pages = true;
173
2.67k
}
174
175
void
176
Pages ::pushInheritedAttributesToPageInternal(
177
    QPDFObjectHandle cur_pages,
178
    std::map<std::string, std::vector<QPDFObjectHandle>>& key_ancestors,
179
    bool allow_changes,
180
    bool warn_skipped_keys)
181
4.44k
{
182
    // Make a list of inheritable keys. Only the keys /MediaBox, /CropBox, /Resources, and /Rotate
183
    // are inheritable attributes. Push this object onto the stack of pages nodes that have values
184
    // for this attribute.
185
186
4.44k
    std::set<std::string> inheritable_keys;
187
16.8k
    for (auto const& key: cur_pages.getKeys()) {
188
16.8k
        if (key == "/MediaBox" || key == "/CropBox" || key == "/Resources" || key == "/Rotate") {
189
870
            if (!allow_changes) {
190
0
                throw QPDFExc(
191
0
                    qpdf_e_internal,
192
0
                    m->file->getName(),
193
0
                    m->last_object_description,
194
0
                    m->file->getLastOffset(),
195
0
                    "optimize detected an inheritable attribute when called in no-change mode");
196
0
            }
197
198
            // This is an inheritable resource
199
870
            inheritable_keys.insert(key);
200
870
            QPDFObjectHandle oh = cur_pages.getKey(key);
201
870
            QTC::TC("qpdf", "QPDF opt direct pages resource", oh.indirect() ? 0 : 1);
202
870
            if (!oh.indirect()) {
203
853
                if (!oh.isScalar()) {
204
                    // Replace shared direct object non-scalar resources with indirect objects to
205
                    // avoid copying large structures around.
206
410
                    cur_pages.replaceKey(key, qpdf.makeIndirectObject(oh));
207
410
                    oh = cur_pages.getKey(key);
208
443
                } else {
209
                    // It's okay to copy scalars.
210
443
                }
211
853
            }
212
870
            key_ancestors[key].push_back(oh);
213
870
            if (key_ancestors[key].size() > 1) {
214
98
            }
215
            // Remove this resource from this node.  It will be reattached at the page level.
216
870
            cur_pages.removeKey(key);
217
16.0k
        } else if (!(key == "/Type" || key == "/Parent" || key == "/Kids" || key == "/Count")) {
218
            // Warn when flattening, but not if the key is at the top level (i.e. "/Parent" not
219
            // set), as we don't change these; but flattening removes intermediate /Pages nodes.
220
4.59k
            if (warn_skipped_keys && cur_pages.hasKey("/Parent")) {
221
1.46k
                qpdf.warn(
222
1.46k
                    qpdf_e_pages,
223
1.46k
                    "Pages object: object " + cur_pages.id_gen().unparse(' '),
224
1.46k
                    0,
225
1.46k
                    ("Unknown key " + key +
226
1.46k
                     " in /Pages object is being discarded as a result of flattening the /Pages "
227
1.46k
                     "tree"));
228
1.46k
            }
229
4.59k
        }
230
16.8k
    }
231
232
    // Process descendant nodes. This method does not perform loop detection because all code paths
233
    // that lead here follow a call to getAllPages, which already throws an exception in the event
234
    // of a loop in the pages tree.
235
31.6k
    for (auto& kid: cur_pages.getKey("/Kids").aitems()) {
236
31.6k
        if (kid.isDictionaryOfType("/Pages")) {
237
1.76k
            pushInheritedAttributesToPageInternal(
238
1.76k
                kid, key_ancestors, allow_changes, warn_skipped_keys);
239
29.8k
        } else {
240
            // Add all available inheritable attributes not present in this object to this object.
241
29.8k
            for (auto const& iter: key_ancestors) {
242
2.27k
                std::string const& key = iter.first;
243
2.27k
                if (!kid.hasKey(key)) {
244
1.90k
                    kid.replaceKey(key, iter.second.back());
245
1.90k
                } else {
246
369
                    QTC::TC("qpdf", "QPDF opt page resource hides ancestor");
247
369
                }
248
2.27k
            }
249
29.8k
        }
250
31.6k
    }
251
252
    // For each inheritable key, pop the stack.  If the stack becomes empty, remove it from the map.
253
    // That way, the invariant that the list of keys in key_ancestors is exactly those keys for
254
    // which inheritable attributes are available.
255
256
4.44k
    if (!inheritable_keys.empty()) {
257
605
        for (auto const& key: inheritable_keys) {
258
605
            key_ancestors[key].pop_back();
259
605
            if (key_ancestors[key].empty()) {
260
521
                key_ancestors.erase(key);
261
521
            }
262
605
        }
263
4.06k
    } else {
264
4.06k
        QTC::TC("qpdf", "QPDF opt no inheritable keys");
265
4.06k
    }
266
4.44k
}
267
268
void
269
Lin::updateObjectMaps(
270
    ObjUser const& first_ou,
271
    QPDFObjectHandle first_oh,
272
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
273
0
{
274
0
    QPDFObjGen::set visited;
275
0
    std::vector<UpdateObjectMapsFrame> pending;
276
0
    pending.emplace_back(first_ou, first_oh, true);
277
    // Traverse the object tree from this point taking care to avoid crossing page boundaries.
278
0
    std::unique_ptr<ObjUser> thumb_ou;
279
0
    while (!pending.empty()) {
280
0
        auto cur = pending.back();
281
0
        pending.pop_back();
282
283
0
        bool is_page_node = false;
284
285
0
        if (cur.oh.isDictionaryOfType("/Page")) {
286
0
            is_page_node = true;
287
0
            if (!cur.top) {
288
0
                continue;
289
0
            }
290
0
        }
291
292
0
        if (cur.oh.isIndirect()) {
293
0
            QPDFObjGen og(cur.oh.getObjGen());
294
0
            if (!visited.add(og)) {
295
0
                QTC::TC("qpdf", "QPDF opt loop detected");
296
0
                continue;
297
0
            }
298
0
            m->obj_user_to_objects[cur.ou].insert(og);
299
0
            m->object_to_obj_users[og].insert(cur.ou);
300
0
        }
301
302
0
        if (cur.oh.isArray()) {
303
0
            for (auto const& item: cur.oh.as_array()) {
304
0
                pending.emplace_back(cur.ou, item, false);
305
0
            }
306
0
        } else if (cur.oh.isDictionary() || cur.oh.isStream()) {
307
0
            QPDFObjectHandle dict = cur.oh;
308
0
            bool is_stream = cur.oh.isStream();
309
0
            int ssp = 0;
310
0
            if (is_stream) {
311
0
                dict = cur.oh.getDict();
312
0
                if (skip_stream_parameters) {
313
0
                    ssp = skip_stream_parameters(cur.oh);
314
0
                }
315
0
            }
316
317
0
            for (auto& [key, value]: dict.as_dictionary()) {
318
0
                if (value.null()) {
319
0
                    continue;
320
0
                }
321
322
0
                if (is_page_node && (key == "/Thumb")) {
323
                    // Traverse page thumbnail dictionaries as a special case. There can only ever
324
                    // be one /Thumb key on a page, and we see at most one page node per call.
325
0
                    thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno);
326
0
                    pending.emplace_back(*thumb_ou, dict.getKey(key), false);
327
0
                } else if (is_page_node && (key == "/Parent")) {
328
                    // Don't traverse back up the page tree
329
0
                } else if (
330
0
                    ((ssp >= 1) && (key == "/Length")) ||
331
0
                    ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) {
332
                    // Don't traverse into stream parameters that we are not going to write.
333
0
                } else {
334
0
                    pending.emplace_back(cur.ou, value, false);
335
0
                }
336
0
            }
337
0
        }
338
0
    }
339
0
}
340
341
void
342
Lin::filterCompressedObjects(std::map<int, int> const& object_stream_data)
343
0
{
344
0
    if (object_stream_data.empty()) {
345
0
        return;
346
0
    }
347
348
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
349
    // objects.  If something is a user of a compressed object, then it is really a user of the
350
    // object stream that contains it.
351
352
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
353
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
354
355
0
    for (auto const& i1: m->obj_user_to_objects) {
356
0
        ObjUser const& ou = i1.first;
357
        // Loop over objects.
358
0
        for (auto const& og: i1.second) {
359
0
            auto i2 = object_stream_data.find(og.getObj());
360
0
            if (i2 == object_stream_data.end()) {
361
0
                t_obj_user_to_objects[ou].insert(og);
362
0
            } else {
363
0
                t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0));
364
0
            }
365
0
        }
366
0
    }
367
368
0
    for (auto const& i1: m->object_to_obj_users) {
369
0
        QPDFObjGen const& og = i1.first;
370
        // Loop over obj_users.
371
0
        for (auto const& ou: i1.second) {
372
0
            auto i2 = object_stream_data.find(og.getObj());
373
0
            if (i2 == object_stream_data.end()) {
374
0
                t_object_to_obj_users[og].insert(ou);
375
0
            } else {
376
0
                t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou);
377
0
            }
378
0
        }
379
0
    }
380
381
0
    m->obj_user_to_objects = t_obj_user_to_objects;
382
0
    m->object_to_obj_users = t_object_to_obj_users;
383
0
}
384
385
void
386
Lin::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
387
0
{
388
0
    if (obj.getStreamsEmpty()) {
389
0
        return;
390
0
    }
391
392
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
393
    // objects.  If something is a user of a compressed object, then it is really a user of the
394
    // object stream that contains it.
395
396
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
397
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
398
399
0
    for (auto const& i1: m->obj_user_to_objects) {
400
0
        ObjUser const& ou = i1.first;
401
        // Loop over objects.
402
0
        for (auto const& og: i1.second) {
403
0
            if (obj.contains(og)) {
404
0
                if (auto const& i2 = obj[og].object_stream; i2 <= 0) {
405
0
                    t_obj_user_to_objects[ou].insert(og);
406
0
                } else {
407
0
                    t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0));
408
0
                }
409
0
            }
410
0
        }
411
0
    }
412
413
0
    for (auto const& i1: m->object_to_obj_users) {
414
0
        QPDFObjGen const& og = i1.first;
415
0
        if (obj.contains(og)) {
416
            // Loop over obj_users.
417
0
            for (auto const& ou: i1.second) {
418
0
                if (auto i2 = obj[og].object_stream; i2 <= 0) {
419
0
                    t_object_to_obj_users[og].insert(ou);
420
0
                } else {
421
0
                    t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou);
422
0
                }
423
0
            }
424
0
        }
425
0
    }
426
427
0
    m->obj_user_to_objects = t_obj_user_to_objects;
428
0
    m->object_to_obj_users = t_object_to_obj_users;
429
0
}