Coverage Report

Created: 2025-07-11 06:58

/src/qpdf/libqpdf/QPDF_optimization.cc
Line
Count
Source (jump to first uncovered line)
1
// See the "Optimization" section of the manual.
2
3
#include <qpdf/assert_debug.h>

#include <qpdf/QPDF_private.hh>

#include <utility>

#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDFObjectHandle_private.hh>
#include <qpdf/QPDFWriter_private.hh>
#include <qpdf/QTC.hh>
11
12
// Construct a user record that is identified by its type alone. The debug assertion restricts
// this form to ou_root.
QPDF::ObjUser::ObjUser(user_e type) :
    ou_type(type)
{
    qpdf_assert_debug(ou_type == ou_root);
}
17
18
// Construct a user record that is identified by a type plus a page number. The debug assertion
// restricts this form to ou_page and ou_thumb.
QPDF::ObjUser::ObjUser(user_e type, int pageno) :
    ou_type(type),
    pageno(pageno)
{
    qpdf_assert_debug((ou_type == ou_page) || (ou_type == ou_thumb));
}
24
25
// Construct a user record that is identified by a type plus a dictionary key. The debug assertion
// restricts this form to ou_trailer_key and ou_root_key.
QPDF::ObjUser::ObjUser(user_e type, std::string const& key) :
    ou_type(type),
    key(key)
{
    qpdf_assert_debug((ou_type == ou_trailer_key) || (ou_type == ou_root_key));
}
31
32
bool
33
QPDF::ObjUser::operator<(ObjUser const& rhs) const
34
0
{
35
0
    if (ou_type < rhs.ou_type) {
36
0
        return true;
37
0
    }
38
0
    if (ou_type == rhs.ou_type) {
39
0
        if (pageno < rhs.pageno) {
40
0
            return true;
41
0
        }
42
0
        if (pageno == rhs.pageno) {
43
0
            return key < rhs.key;
44
0
        }
45
0
    }
46
0
    return false;
47
0
}
48
49
// One pending unit of work for updateObjectMaps(): the user on whose behalf we are traversing,
// the object to visit, and whether that object is the starting point of the traversal.
// NOTE(review): the member declarations are not in this file; if `ou` is stored as a reference,
// the referenced ObjUser must outlive the frame -- confirm against the class definition.
QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame(
    QPDF::ObjUser const& ou, QPDFObjectHandle oh, bool top) :
    ou(ou),
    oh(oh),
    top(top)
{
}
56
57
void
58
QPDF::optimize(
59
    std::map<int, int> const& object_stream_data,
60
    bool allow_changes,
61
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
62
0
{
63
0
    optimize_internal(object_stream_data, allow_changes, skip_stream_parameters);
64
0
}
65
66
void
67
QPDF::optimize(
68
    QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
69
0
{
70
0
    optimize_internal(obj, true, skip_stream_parameters);
71
0
}
72
73
template <typename T>
74
void
75
QPDF::optimize_internal(
76
    T const& object_stream_data,
77
    bool allow_changes,
78
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
79
0
{
80
0
    if (!m->obj_user_to_objects.empty()) {
81
        // already optimized
82
0
        return;
83
0
    }
84
85
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
86
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
87
0
    QPDFObjectHandle root = getRoot();
88
0
    if (root.getKey("/Outlines").isDictionary()) {
89
0
        QPDFObjectHandle outlines = root.getKey("/Outlines");
90
0
        if (!outlines.isIndirect()) {
91
0
            QTC::TC("qpdf", "QPDF_optimization indirect outlines");
92
0
            root.replaceKey("/Outlines", makeIndirectObject(outlines));
93
0
        }
94
0
    }
95
96
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
97
    // initializes m->all_pages.
98
0
    pushInheritedAttributesToPage(allow_changes, false);
99
100
    // Traverse pages
101
0
    int n = toI(m->all_pages.size());
102
0
    for (int pageno = 0; pageno < n; ++pageno) {
103
0
        updateObjectMaps(
104
0
            ObjUser(ObjUser::ou_page, pageno),
105
0
            m->all_pages.at(toS(pageno)),
106
0
            skip_stream_parameters);
107
0
    }
108
109
    // Traverse document-level items
110
0
    for (auto const& [key, value]: m->trailer.as_dictionary()) {
111
0
        if (key == "/Root") {
112
            // handled separately
113
0
        } else {
114
0
            if (!value.null()) {
115
0
                updateObjectMaps(
116
0
                    ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters);
117
0
            }
118
0
        }
119
0
    }
120
121
0
    for (auto const& [key, value]: root.as_dictionary()) {
122
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
123
        // we are going to disregard that specification for now.  There is loads of evidence that
124
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
125
        // certain not to cause any problems.
126
0
        if (!value.null()) {
127
0
            updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters);
128
0
        }
129
0
    }
130
131
0
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
132
0
    auto root_og = QPDFObjGen(root.getObjGen());
133
0
    m->obj_user_to_objects[root_ou].insert(root_og);
134
0
    m->object_to_obj_users[root_og].insert(root_ou);
135
136
0
    filterCompressedObjects(object_stream_data);
137
0
}
Unexecuted instantiation: void QPDF::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
Unexecuted instantiation: void QPDF::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
138
139
void
140
QPDF::pushInheritedAttributesToPage()
141
0
{
142
    // Public API should not have access to allow_changes.
143
0
    pushInheritedAttributesToPage(true, false);
144
0
}
145
146
void
147
QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
148
0
{
149
    // Traverse pages tree pushing all inherited resources down to the page level.
150
151
    // The record of whether we've done this is cleared by updateAllPagesCache().  If we're warning
152
    // for skipped keys, re-traverse unconditionally.
153
0
    if (m->pushed_inherited_attributes_to_pages && (!warn_skipped_keys)) {
154
0
        return;
155
0
    }
156
157
    // Calling getAllPages() resolves any duplicated page objects, repairs broken nodes, and detects
158
    // loops, so we don't have to do those activities here.
159
0
    getAllPages();
160
161
    // key_ancestors is a mapping of page attribute keys to a stack of Pages nodes that contain
162
    // values for them.
163
0
    std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
164
0
    pushInheritedAttributesToPageInternal(
165
0
        m->trailer.getKey("/Root").getKey("/Pages"),
166
0
        key_ancestors,
167
0
        allow_changes,
168
0
        warn_skipped_keys);
169
0
    if (!key_ancestors.empty()) {
170
0
        throw std::logic_error(
171
0
            "key_ancestors not empty after pushing inherited attributes to pages");
172
0
    }
173
0
    m->pushed_inherited_attributes_to_pages = true;
174
0
    m->ever_pushed_inherited_attributes_to_pages = true;
175
0
}
176
177
void
178
QPDF::pushInheritedAttributesToPageInternal(
179
    QPDFObjectHandle cur_pages,
180
    std::map<std::string, std::vector<QPDFObjectHandle>>& key_ancestors,
181
    bool allow_changes,
182
    bool warn_skipped_keys)
183
0
{
184
    // Make a list of inheritable keys. Only the keys /MediaBox, /CropBox, /Resources, and /Rotate
185
    // are inheritable attributes. Push this object onto the stack of pages nodes that have values
186
    // for this attribute.
187
188
0
    std::set<std::string> inheritable_keys;
189
0
    for (auto const& key: cur_pages.getKeys()) {
190
0
        if ((key == "/MediaBox") || (key == "/CropBox") || (key == "/Resources") ||
191
0
            (key == "/Rotate")) {
192
0
            if (!allow_changes) {
193
0
                throw QPDFExc(
194
0
                    qpdf_e_internal,
195
0
                    m->file->getName(),
196
0
                    m->last_object_description,
197
0
                    m->file->getLastOffset(),
198
0
                    "optimize detected an inheritable attribute when called in no-change mode");
199
0
            }
200
201
            // This is an inheritable resource
202
0
            inheritable_keys.insert(key);
203
0
            QPDFObjectHandle oh = cur_pages.getKey(key);
204
0
            QTC::TC("qpdf", "QPDF opt direct pages resource", oh.isIndirect() ? 0 : 1);
205
0
            if (!oh.isIndirect()) {
206
0
                if (!oh.isScalar()) {
207
                    // Replace shared direct object non-scalar resources with indirect objects to
208
                    // avoid copying large structures around.
209
0
                    cur_pages.replaceKey(key, makeIndirectObject(oh));
210
0
                    oh = cur_pages.getKey(key);
211
0
                } else {
212
                    // It's okay to copy scalars.
213
0
                    QTC::TC("qpdf", "QPDF opt inherited scalar");
214
0
                }
215
0
            }
216
0
            key_ancestors[key].push_back(oh);
217
0
            if (key_ancestors[key].size() > 1) {
218
0
                QTC::TC("qpdf", "QPDF opt key ancestors depth > 1");
219
0
            }
220
            // Remove this resource from this node.  It will be reattached at the page level.
221
0
            cur_pages.removeKey(key);
222
0
        } else if (!((key == "/Type") || (key == "/Parent") || (key == "/Kids") ||
223
0
                     (key == "/Count"))) {
224
            // Warn when flattening, but not if the key is at the top level (i.e. "/Parent" not
225
            // set), as we don't change these; but flattening removes intermediate /Pages nodes.
226
0
            if ((warn_skipped_keys) && (cur_pages.hasKey("/Parent"))) {
227
0
                QTC::TC("qpdf", "QPDF unknown key not inherited");
228
0
                setLastObjectDescription("Pages object", cur_pages.getObjGen());
229
0
                warn(
230
0
                    qpdf_e_pages,
231
0
                    m->last_object_description,
232
0
                    0,
233
0
                    ("Unknown key " + key +
234
0
                     " in /Pages object is being discarded as a result of flattening the /Pages "
235
0
                     "tree"));
236
0
            }
237
0
        }
238
0
    }
239
240
    // Process descendant nodes. This method does not perform loop detection because all code paths
241
    // that lead here follow a call to getAllPages, which already throws an exception in the event
242
    // of a loop in the pages tree.
243
0
    for (auto& kid: cur_pages.getKey("/Kids").aitems()) {
244
0
        if (kid.isDictionaryOfType("/Pages")) {
245
0
            pushInheritedAttributesToPageInternal(
246
0
                kid, key_ancestors, allow_changes, warn_skipped_keys);
247
0
        } else {
248
            // Add all available inheritable attributes not present in this object to this object.
249
0
            for (auto const& iter: key_ancestors) {
250
0
                std::string const& key = iter.first;
251
0
                if (!kid.hasKey(key)) {
252
0
                    QTC::TC("qpdf", "QPDF opt resource inherited");
253
0
                    kid.replaceKey(key, iter.second.back());
254
0
                } else {
255
0
                    QTC::TC("qpdf", "QPDF opt page resource hides ancestor");
256
0
                }
257
0
            }
258
0
        }
259
0
    }
260
261
    // For each inheritable key, pop the stack.  If the stack becomes empty, remove it from the map.
262
    // That way, the invariant that the list of keys in key_ancestors is exactly those keys for
263
    // which inheritable attributes are available.
264
265
0
    if (!inheritable_keys.empty()) {
266
0
        QTC::TC("qpdf", "QPDF opt inheritable keys");
267
0
        for (auto const& key: inheritable_keys) {
268
0
            key_ancestors[key].pop_back();
269
0
            if (key_ancestors[key].empty()) {
270
0
                QTC::TC("qpdf", "QPDF opt erase empty key ancestor");
271
0
                key_ancestors.erase(key);
272
0
            }
273
0
        }
274
0
    } else {
275
0
        QTC::TC("qpdf", "QPDF opt no inheritable keys");
276
0
    }
277
0
}
278
279
void
280
QPDF::updateObjectMaps(
281
    ObjUser const& first_ou,
282
    QPDFObjectHandle first_oh,
283
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
284
0
{
285
0
    QPDFObjGen::set visited;
286
0
    std::vector<UpdateObjectMapsFrame> pending;
287
0
    pending.emplace_back(first_ou, first_oh, true);
288
    // Traverse the object tree from this point taking care to avoid crossing page boundaries.
289
0
    std::unique_ptr<ObjUser> thumb_ou;
290
0
    while (!pending.empty()) {
291
0
        auto cur = pending.back();
292
0
        pending.pop_back();
293
294
0
        bool is_page_node = false;
295
296
0
        if (cur.oh.isDictionaryOfType("/Page")) {
297
0
            is_page_node = true;
298
0
            if (!cur.top) {
299
0
                continue;
300
0
            }
301
0
        }
302
303
0
        if (cur.oh.isIndirect()) {
304
0
            QPDFObjGen og(cur.oh.getObjGen());
305
0
            if (!visited.add(og)) {
306
0
                QTC::TC("qpdf", "QPDF opt loop detected");
307
0
                continue;
308
0
            }
309
0
            m->obj_user_to_objects[cur.ou].insert(og);
310
0
            m->object_to_obj_users[og].insert(cur.ou);
311
0
        }
312
313
0
        if (cur.oh.isArray()) {
314
0
            for (auto const& item: cur.oh.as_array()) {
315
0
                pending.emplace_back(cur.ou, item, false);
316
0
            }
317
0
        } else if (cur.oh.isDictionary() || cur.oh.isStream()) {
318
0
            QPDFObjectHandle dict = cur.oh;
319
0
            bool is_stream = cur.oh.isStream();
320
0
            int ssp = 0;
321
0
            if (is_stream) {
322
0
                dict = cur.oh.getDict();
323
0
                if (skip_stream_parameters) {
324
0
                    ssp = skip_stream_parameters(cur.oh);
325
0
                }
326
0
            }
327
328
0
            for (auto& [key, value]: dict.as_dictionary()) {
329
0
                if (value.null()) {
330
0
                    continue;
331
0
                }
332
333
0
                if (is_page_node && (key == "/Thumb")) {
334
                    // Traverse page thumbnail dictionaries as a special case. There can only ever
335
                    // be one /Thumb key on a page, and we see at most one page node per call.
336
0
                    thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno);
337
0
                    pending.emplace_back(*thumb_ou, dict.getKey(key), false);
338
0
                } else if (is_page_node && (key == "/Parent")) {
339
                    // Don't traverse back up the page tree
340
0
                } else if (
341
0
                    ((ssp >= 1) && (key == "/Length")) ||
342
0
                    ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) {
343
                    // Don't traverse into stream parameters that we are not going to write.
344
0
                } else {
345
0
                    pending.emplace_back(cur.ou, value, false);
346
0
                }
347
0
            }
348
0
        }
349
0
    }
350
0
}
351
352
void
353
QPDF::filterCompressedObjects(std::map<int, int> const& object_stream_data)
354
0
{
355
0
    if (object_stream_data.empty()) {
356
0
        return;
357
0
    }
358
359
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
360
    // objects.  If something is a user of a compressed object, then it is really a user of the
361
    // object stream that contains it.
362
363
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
364
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
365
366
0
    for (auto const& i1: m->obj_user_to_objects) {
367
0
        ObjUser const& ou = i1.first;
368
        // Loop over objects.
369
0
        for (auto const& og: i1.second) {
370
0
            auto i2 = object_stream_data.find(og.getObj());
371
0
            if (i2 == object_stream_data.end()) {
372
0
                t_obj_user_to_objects[ou].insert(og);
373
0
            } else {
374
0
                t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0));
375
0
            }
376
0
        }
377
0
    }
378
379
0
    for (auto const& i1: m->object_to_obj_users) {
380
0
        QPDFObjGen const& og = i1.first;
381
        // Loop over obj_users.
382
0
        for (auto const& ou: i1.second) {
383
0
            auto i2 = object_stream_data.find(og.getObj());
384
0
            if (i2 == object_stream_data.end()) {
385
0
                t_object_to_obj_users[og].insert(ou);
386
0
            } else {
387
0
                t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou);
388
0
            }
389
0
        }
390
0
    }
391
392
0
    m->obj_user_to_objects = t_obj_user_to_objects;
393
0
    m->object_to_obj_users = t_object_to_obj_users;
394
0
}
395
396
void
397
QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
398
0
{
399
0
    if (obj.getStreamsEmpty()) {
400
0
        return;
401
0
    }
402
403
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
404
    // objects.  If something is a user of a compressed object, then it is really a user of the
405
    // object stream that contains it.
406
407
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
408
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
409
410
0
    for (auto const& i1: m->obj_user_to_objects) {
411
0
        ObjUser const& ou = i1.first;
412
        // Loop over objects.
413
0
        for (auto const& og: i1.second) {
414
0
            if (obj.contains(og)) {
415
0
                if (auto const& i2 = obj[og].object_stream; i2 <= 0) {
416
0
                    t_obj_user_to_objects[ou].insert(og);
417
0
                } else {
418
0
                    t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0));
419
0
                }
420
0
            }
421
0
        }
422
0
    }
423
424
0
    for (auto const& i1: m->object_to_obj_users) {
425
0
        QPDFObjGen const& og = i1.first;
426
0
        if (obj.contains(og)) {
427
            // Loop over obj_users.
428
0
            for (auto const& ou: i1.second) {
429
0
                if (auto i2 = obj[og].object_stream; i2 <= 0) {
430
0
                    t_object_to_obj_users[og].insert(ou);
431
0
                } else {
432
0
                    t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou);
433
0
                }
434
0
            }
435
0
        }
436
0
    }
437
438
0
    m->obj_user_to_objects = t_obj_user_to_objects;
439
0
    m->object_to_obj_users = t_object_to_obj_users;
440
0
}