Coverage Report

Created: 2025-08-28 06:36

/src/qpdf/libqpdf/QPDF_optimization.cc
Line
Count
Source (jump to first uncovered line)
1
// See the "Optimization" section of the manual.
2
3
#include <qpdf/assert_debug.h>
4
5
#include <qpdf/QPDF_private.hh>
6
7
#include <qpdf/QPDFExc.hh>
8
#include <qpdf/QPDFObjectHandle_private.hh>
9
#include <qpdf/QPDFWriter_private.hh>
10
#include <qpdf/QTC.hh>
11
12
// Build an ObjUser that carries only a user type. No page number or key is supplied through this
// overload, and qpdf_assert_debug checks that the type is ou_root.
QPDF::ObjUser::ObjUser(user_e type) : ou_type(type)
{
    qpdf_assert_debug(type == ou_root);
}
17
18
// Build an ObjUser that is tied to a page number. qpdf_assert_debug checks that the type is one
// of the two page-scoped kinds: ou_page or ou_thumb.
QPDF::ObjUser::ObjUser(user_e type, size_t pageno) : ou_type(type), pageno(pageno)
{
    qpdf_assert_debug((type == ou_page) || (type == ou_thumb));
}
24
25
// Build an ObjUser that is identified by a dictionary key. qpdf_assert_debug checks that the type
// is one of the two key-scoped kinds: ou_trailer_key or ou_root_key.
QPDF::ObjUser::ObjUser(user_e type, std::string const& key) : ou_type(type), key(key)
{
    qpdf_assert_debug((type == ou_trailer_key) || (type == ou_root_key));
}
31
32
bool
33
QPDF::ObjUser::operator<(ObjUser const& rhs) const
34
9.15M
{
35
9.15M
    if (ou_type < rhs.ou_type) {
36
455k
        return true;
37
455k
    }
38
8.70M
    if (ou_type == rhs.ou_type) {
39
8.22M
        if (pageno < rhs.pageno) {
40
1.68M
            return true;
41
1.68M
        }
42
6.53M
        if (pageno == rhs.pageno) {
43
4.73M
            return key < rhs.key;
44
4.73M
        }
45
6.53M
    }
46
2.28M
    return false;
47
8.70M
}
48
49
// One pending step of the updateObjectMaps traversal: the user the object is attributed to, the
// object itself, and whether this frame is the starting point of the traversal.
QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame(
    QPDF::ObjUser const& ou, QPDFObjectHandle oh, bool top) :
    ou(ou), oh(oh), top(top)
{
}
56
57
void
58
QPDF::optimize(
59
    std::map<int, int> const& object_stream_data,
60
    bool allow_changes,
61
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
62
0
{
63
0
    optimize_internal(object_stream_data, allow_changes, skip_stream_parameters);
64
0
}
65
66
void
67
QPDF::optimize(
68
    QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
69
9.60k
{
70
9.60k
    optimize_internal(obj, true, skip_stream_parameters);
71
9.60k
}
72
73
template <typename T>
74
void
75
QPDF::optimize_internal(
76
    T const& object_stream_data,
77
    bool allow_changes,
78
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
79
9.60k
{
80
9.60k
    if (!m->obj_user_to_objects.empty()) {
81
        // already optimized
82
0
        return;
83
0
    }
84
85
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
86
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
87
9.60k
    QPDFObjectHandle root = getRoot();
88
9.60k
    if (root.getKey("/Outlines").isDictionary()) {
89
199
        QPDFObjectHandle outlines = root.getKey("/Outlines");
90
199
        if (!outlines.isIndirect()) {
91
28
            QTC::TC("qpdf", "QPDF_optimization indirect outlines");
92
28
            root.replaceKey("/Outlines", makeIndirectObject(outlines));
93
28
        }
94
199
    }
95
96
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
97
    // initializes m->all_pages.
98
9.60k
    pushInheritedAttributesToPage(allow_changes, false);
99
100
    // Traverse pages
101
9.60k
    size_t n = m->all_pages.size();
102
26.3k
    for (size_t pageno = 0; pageno < n; ++pageno) {
103
16.7k
        updateObjectMaps(
104
16.7k
            ObjUser(ObjUser::ou_page, pageno), m->all_pages.at(pageno), skip_stream_parameters);
105
16.7k
    }
106
107
    // Traverse document-level items
108
25.8k
    for (auto const& [key, value]: m->trailer.as_dictionary()) {
109
25.8k
        if (key == "/Root") {
110
            // handled separately
111
16.2k
        } else {
112
16.2k
            if (!value.null()) {
113
12.4k
                updateObjectMaps(
114
12.4k
                    ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters);
115
12.4k
            }
116
16.2k
        }
117
25.8k
    }
118
119
45.0k
    for (auto const& [key, value]: root.as_dictionary()) {
120
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
121
        // we are going to disregard that specification for now.  There is loads of evidence that
122
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
123
        // certain not to cause any problems.
124
45.0k
        if (!value.null()) {
125
38.4k
            updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters);
126
38.4k
        }
127
45.0k
    }
128
129
9.60k
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
130
9.60k
    auto root_og = QPDFObjGen(root.getObjGen());
131
9.60k
    m->obj_user_to_objects[root_ou].insert(root_og);
132
9.60k
    m->object_to_obj_users[root_og].insert(root_ou);
133
134
9.60k
    filterCompressedObjects(object_stream_data);
135
9.60k
}
Unexecuted instantiation: void QPDF::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
void QPDF::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
Line
Count
Source
79
9.60k
{
80
9.60k
    if (!m->obj_user_to_objects.empty()) {
81
        // already optimized
82
0
        return;
83
0
    }
84
85
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
86
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
87
9.60k
    QPDFObjectHandle root = getRoot();
88
9.60k
    if (root.getKey("/Outlines").isDictionary()) {
89
199
        QPDFObjectHandle outlines = root.getKey("/Outlines");
90
199
        if (!outlines.isIndirect()) {
91
28
            QTC::TC("qpdf", "QPDF_optimization indirect outlines");
92
28
            root.replaceKey("/Outlines", makeIndirectObject(outlines));
93
28
        }
94
199
    }
95
96
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
97
    // initializes m->all_pages.
98
9.60k
    pushInheritedAttributesToPage(allow_changes, false);
99
100
    // Traverse pages
101
9.60k
    size_t n = m->all_pages.size();
102
26.3k
    for (size_t pageno = 0; pageno < n; ++pageno) {
103
16.7k
        updateObjectMaps(
104
16.7k
            ObjUser(ObjUser::ou_page, pageno), m->all_pages.at(pageno), skip_stream_parameters);
105
16.7k
    }
106
107
    // Traverse document-level items
108
25.8k
    for (auto const& [key, value]: m->trailer.as_dictionary()) {
109
25.8k
        if (key == "/Root") {
110
            // handled separately
111
16.2k
        } else {
112
16.2k
            if (!value.null()) {
113
12.4k
                updateObjectMaps(
114
12.4k
                    ObjUser(ObjUser::ou_trailer_key, key), value, skip_stream_parameters);
115
12.4k
            }
116
16.2k
        }
117
25.8k
    }
118
119
45.0k
    for (auto const& [key, value]: root.as_dictionary()) {
120
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
121
        // we are going to disregard that specification for now.  There is loads of evidence that
122
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
123
        // certain not to cause any problems.
124
45.0k
        if (!value.null()) {
125
38.4k
            updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), value, skip_stream_parameters);
126
38.4k
        }
127
45.0k
    }
128
129
9.60k
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
130
9.60k
    auto root_og = QPDFObjGen(root.getObjGen());
131
9.60k
    m->obj_user_to_objects[root_ou].insert(root_og);
132
9.60k
    m->object_to_obj_users[root_og].insert(root_ou);
133
134
9.60k
    filterCompressedObjects(object_stream_data);
135
9.60k
}
136
137
void
138
QPDF::pushInheritedAttributesToPage()
139
0
{
140
    // Public API should not have access to allow_changes.
141
0
    pushInheritedAttributesToPage(true, false);
142
0
}
143
144
void
145
QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
146
15.8k
{
147
    // Traverse pages tree pushing all inherited resources down to the page level.
148
149
    // The record of whether we've done this is cleared by updateAllPagesCache().  If we're warning
150
    // for skipped keys, re-traverse unconditionally.
151
15.8k
    if (m->pushed_inherited_attributes_to_pages && (!warn_skipped_keys)) {
152
5.33k
        return;
153
5.33k
    }
154
155
    // Calling getAllPages() resolves any duplicated page objects, repairs broken nodes, and detects
156
    // loops, so we don't have to do those activities here.
157
10.5k
    getAllPages();
158
159
    // key_ancestors is a mapping of page attribute keys to a stack of Pages nodes that contain
160
    // values for them.
161
10.5k
    std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
162
10.5k
    pushInheritedAttributesToPageInternal(
163
10.5k
        m->trailer.getKey("/Root").getKey("/Pages"),
164
10.5k
        key_ancestors,
165
10.5k
        allow_changes,
166
10.5k
        warn_skipped_keys);
167
10.5k
    if (!key_ancestors.empty()) {
168
0
        throw std::logic_error(
169
0
            "key_ancestors not empty after pushing inherited attributes to pages");
170
0
    }
171
10.5k
    m->pushed_inherited_attributes_to_pages = true;
172
10.5k
    m->ever_pushed_inherited_attributes_to_pages = true;
173
10.5k
}
174
175
void
176
QPDF::pushInheritedAttributesToPageInternal(
177
    QPDFObjectHandle cur_pages,
178
    std::map<std::string, std::vector<QPDFObjectHandle>>& key_ancestors,
179
    bool allow_changes,
180
    bool warn_skipped_keys)
181
11.1k
{
182
    // Make a list of inheritable keys. Only the keys /MediaBox, /CropBox, /Resources, and /Rotate
183
    // are inheritable attributes. Push this object onto the stack of pages nodes that have values
184
    // for this attribute.
185
186
11.1k
    std::set<std::string> inheritable_keys;
187
44.3k
    for (auto const& key: cur_pages.getKeys()) {
188
44.3k
        if ((key == "/MediaBox") || (key == "/CropBox") || (key == "/Resources") ||
189
44.3k
            (key == "/Rotate")) {
190
3.28k
            if (!allow_changes) {
191
0
                throw QPDFExc(
192
0
                    qpdf_e_internal,
193
0
                    m->file->getName(),
194
0
                    m->last_object_description,
195
0
                    m->file->getLastOffset(),
196
0
                    "optimize detected an inheritable attribute when called in no-change mode");
197
0
            }
198
199
            // This is an inheritable resource
200
3.28k
            inheritable_keys.insert(key);
201
3.28k
            QPDFObjectHandle oh = cur_pages.getKey(key);
202
3.28k
            QTC::TC("qpdf", "QPDF opt direct pages resource", oh.isIndirect() ? 0 : 1);
203
3.28k
            if (!oh.isIndirect()) {
204
2.94k
                if (!oh.isScalar()) {
205
                    // Replace shared direct object non-scalar resources with indirect objects to
206
                    // avoid copying large structures around.
207
2.83k
                    cur_pages.replaceKey(key, makeIndirectObject(oh));
208
2.83k
                    oh = cur_pages.getKey(key);
209
2.83k
                } else {
210
                    // It's okay to copy scalars.
211
118
                    QTC::TC("qpdf", "QPDF opt inherited scalar");
212
118
                }
213
2.94k
            }
214
3.28k
            key_ancestors[key].push_back(oh);
215
3.28k
            if (key_ancestors[key].size() > 1) {
216
77
                QTC::TC("qpdf", "QPDF opt key ancestors depth > 1");
217
77
            }
218
            // Remove this resource from this node.  It will be reattached at the page level.
219
3.28k
            cur_pages.removeKey(key);
220
41.0k
        } else if (!((key == "/Type") || (key == "/Parent") || (key == "/Kids") ||
221
41.0k
                     (key == "/Count"))) {
222
            // Warn when flattening, but not if the key is at the top level (i.e. "/Parent" not
223
            // set), as we don't change these; but flattening removes intermediate /Pages nodes.
224
9.97k
            if ((warn_skipped_keys) && (cur_pages.hasKey("/Parent"))) {
225
433
                QTC::TC("qpdf", "QPDF unknown key not inherited");
226
433
                setLastObjectDescription("Pages object", cur_pages.getObjGen());
227
433
                warn(
228
433
                    qpdf_e_pages,
229
433
                    m->last_object_description,
230
433
                    0,
231
433
                    ("Unknown key " + key +
232
433
                     " in /Pages object is being discarded as a result of flattening the /Pages "
233
433
                     "tree"));
234
433
            }
235
9.97k
        }
236
44.3k
    }
237
238
    // Process descendant nodes. This method does not perform loop detection because all code paths
239
    // that lead here follow a call to getAllPages, which already throws an exception in the event
240
    // of a loop in the pages tree.
241
99.5k
    for (auto& kid: cur_pages.getKey("/Kids").aitems()) {
242
99.5k
        if (kid.isDictionaryOfType("/Pages")) {
243
604
            pushInheritedAttributesToPageInternal(
244
604
                kid, key_ancestors, allow_changes, warn_skipped_keys);
245
98.9k
        } else {
246
            // Add all available inheritable attributes not present in this object to this object.
247
98.9k
            for (auto const& iter: key_ancestors) {
248
13.0k
                std::string const& key = iter.first;
249
13.0k
                if (!kid.hasKey(key)) {
250
5.37k
                    QTC::TC("qpdf", "QPDF opt resource inherited");
251
5.37k
                    kid.replaceKey(key, iter.second.back());
252
7.65k
                } else {
253
7.65k
                    QTC::TC("qpdf", "QPDF opt page resource hides ancestor");
254
7.65k
                }
255
13.0k
            }
256
98.9k
        }
257
99.5k
    }
258
259
    // For each inheritable key, pop the stack.  If the stack becomes empty, remove it from the map.
260
    // That way, the invariant that the list of keys in key_ancestors is exactly those keys for
261
    // which inheritable attributes are available.
262
263
11.1k
    if (!inheritable_keys.empty()) {
264
2.65k
        QTC::TC("qpdf", "QPDF opt inheritable keys");
265
3.10k
        for (auto const& key: inheritable_keys) {
266
3.10k
            key_ancestors[key].pop_back();
267
3.10k
            if (key_ancestors[key].empty()) {
268
3.02k
                QTC::TC("qpdf", "QPDF opt erase empty key ancestor");
269
3.02k
                key_ancestors.erase(key);
270
3.02k
            }
271
3.10k
        }
272
8.50k
    } else {
273
8.50k
        QTC::TC("qpdf", "QPDF opt no inheritable keys");
274
8.50k
    }
275
11.1k
}
276
277
void
278
QPDF::updateObjectMaps(
279
    ObjUser const& first_ou,
280
    QPDFObjectHandle first_oh,
281
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
282
67.6k
{
283
67.6k
    QPDFObjGen::set visited;
284
67.6k
    std::vector<UpdateObjectMapsFrame> pending;
285
67.6k
    pending.emplace_back(first_ou, first_oh, true);
286
    // Traverse the object tree from this point taking care to avoid crossing page boundaries.
287
67.6k
    std::unique_ptr<ObjUser> thumb_ou;
288
3.03M
    while (!pending.empty()) {
289
2.96M
        auto cur = pending.back();
290
2.96M
        pending.pop_back();
291
292
2.96M
        bool is_page_node = false;
293
294
2.96M
        if (cur.oh.isDictionaryOfType("/Page")) {
295
99.8k
            is_page_node = true;
296
99.8k
            if (!cur.top) {
297
81.1k
                continue;
298
81.1k
            }
299
99.8k
        }
300
301
2.88M
        if (cur.oh.isIndirect()) {
302
441k
            QPDFObjGen og(cur.oh.getObjGen());
303
441k
            if (!visited.add(og)) {
304
76.5k
                QTC::TC("qpdf", "QPDF opt loop detected");
305
76.5k
                continue;
306
76.5k
            }
307
365k
            m->obj_user_to_objects[cur.ou].insert(og);
308
365k
            m->object_to_obj_users[og].insert(cur.ou);
309
365k
        }
310
311
2.80M
        if (cur.oh.isArray()) {
312
2.10M
            for (auto const& item: cur.oh.as_array()) {
313
2.10M
                pending.emplace_back(cur.ou, item, false);
314
2.10M
            }
315
2.68M
        } else if (cur.oh.isDictionary() || cur.oh.isStream()) {
316
230k
            QPDFObjectHandle dict = cur.oh;
317
230k
            bool is_stream = cur.oh.isStream();
318
230k
            int ssp = 0;
319
230k
            if (is_stream) {
320
55.7k
                dict = cur.oh.getDict();
321
55.7k
                if (skip_stream_parameters) {
322
55.7k
                    ssp = skip_stream_parameters(cur.oh);
323
55.7k
                }
324
55.7k
            }
325
326
1.02M
            for (auto& [key, value]: dict.as_dictionary()) {
327
1.02M
                if (value.null()) {
328
152k
                    continue;
329
152k
                }
330
331
874k
                if (is_page_node && (key == "/Thumb")) {
332
                    // Traverse page thumbnail dictionaries as a special case. There can only ever
333
                    // be one /Thumb key on a page, and we see at most one page node per call.
334
1.14k
                    thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno);
335
1.14k
                    pending.emplace_back(*thumb_ou, dict.getKey(key), false);
336
873k
                } else if (is_page_node && (key == "/Parent")) {
337
                    // Don't traverse back up the page tree
338
860k
                } else if (
339
860k
                    ((ssp >= 1) && (key == "/Length")) ||
340
860k
                    ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) {
341
                    // Don't traverse into stream parameters that we are not going to write.
342
793k
                } else {
343
793k
                    pending.emplace_back(cur.ou, value, false);
344
793k
                }
345
874k
            }
346
230k
        }
347
2.80M
    }
348
67.6k
}
349
350
void
351
QPDF::filterCompressedObjects(std::map<int, int> const& object_stream_data)
352
0
{
353
0
    if (object_stream_data.empty()) {
354
0
        return;
355
0
    }
356
357
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
358
    // objects.  If something is a user of a compressed object, then it is really a user of the
359
    // object stream that contains it.
360
361
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
362
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
363
364
0
    for (auto const& i1: m->obj_user_to_objects) {
365
0
        ObjUser const& ou = i1.first;
366
        // Loop over objects.
367
0
        for (auto const& og: i1.second) {
368
0
            auto i2 = object_stream_data.find(og.getObj());
369
0
            if (i2 == object_stream_data.end()) {
370
0
                t_obj_user_to_objects[ou].insert(og);
371
0
            } else {
372
0
                t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0));
373
0
            }
374
0
        }
375
0
    }
376
377
0
    for (auto const& i1: m->object_to_obj_users) {
378
0
        QPDFObjGen const& og = i1.first;
379
        // Loop over obj_users.
380
0
        for (auto const& ou: i1.second) {
381
0
            auto i2 = object_stream_data.find(og.getObj());
382
0
            if (i2 == object_stream_data.end()) {
383
0
                t_object_to_obj_users[og].insert(ou);
384
0
            } else {
385
0
                t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou);
386
0
            }
387
0
        }
388
0
    }
389
390
0
    m->obj_user_to_objects = t_obj_user_to_objects;
391
0
    m->object_to_obj_users = t_object_to_obj_users;
392
0
}
393
394
void
395
QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
396
9.57k
{
397
9.57k
    if (obj.getStreamsEmpty()) {
398
704
        return;
399
704
    }
400
401
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
402
    // objects.  If something is a user of a compressed object, then it is really a user of the
403
    // object stream that contains it.
404
405
8.87k
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
406
8.87k
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
407
408
47.5k
    for (auto const& i1: m->obj_user_to_objects) {
409
47.5k
        ObjUser const& ou = i1.first;
410
        // Loop over objects.
411
357k
        for (auto const& og: i1.second) {
412
357k
            if (obj.contains(og)) {
413
355k
                if (auto const& i2 = obj[og].object_stream; i2 <= 0) {
414
135k
                    t_obj_user_to_objects[ou].insert(og);
415
220k
                } else {
416
220k
                    t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0));
417
220k
                }
418
355k
            }
419
357k
        }
420
47.5k
    }
421
422
173k
    for (auto const& i1: m->object_to_obj_users) {
423
173k
        QPDFObjGen const& og = i1.first;
424
173k
        if (obj.contains(og)) {
425
            // Loop over obj_users.
426
355k
            for (auto const& ou: i1.second) {
427
355k
                if (auto i2 = obj[og].object_stream; i2 <= 0) {
428
135k
                    t_object_to_obj_users[og].insert(ou);
429
220k
                } else {
430
220k
                    t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou);
431
220k
                }
432
355k
            }
433
172k
        }
434
173k
    }
435
436
8.87k
    m->obj_user_to_objects = t_obj_user_to_objects;
437
8.87k
    m->object_to_obj_users = t_object_to_obj_users;
438
8.87k
}