Coverage Report

Created: 2024-09-08 06:05

/src/qpdf/libqpdf/QPDF_optimization.cc
Line
Count
Source (jump to first uncovered line)
1
// See the "Optimization" section of the manual.
2
3
#include <qpdf/assert_debug.h>
4
5
#include <qpdf/QPDF.hh>
6
7
#include <qpdf/QPDFExc.hh>
8
#include <qpdf/QPDFWriter_private.hh>
9
#include <qpdf/QPDF_Array.hh>
10
#include <qpdf/QPDF_Dictionary.hh>
11
#include <qpdf/QTC.hh>
12
13
// Default constructor: produces a placeholder user tagged ou_bad with page number 0.
QPDF::ObjUser::ObjUser() : ou_type(ou_bad), pageno(0) {}
18
19
// Construct a user that carries neither a page number nor a key. The debug assertion restricts
// this overload to ou_root.
QPDF::ObjUser::ObjUser(user_e type) : ou_type(type), pageno(0)
{
    qpdf_assert_debug(type == ou_root);
}
25
26
// Construct a user identified by a page number. The debug assertion restricts this overload to
// ou_page and ou_thumb.
QPDF::ObjUser::ObjUser(user_e type, int pageno) : ou_type(type), pageno(pageno)
{
    qpdf_assert_debug((type == ou_page) || (type == ou_thumb));
}
32
33
// Construct a user identified by a dictionary key; the page number is left at 0. The debug
// assertion restricts this overload to ou_trailer_key and ou_root_key.
QPDF::ObjUser::ObjUser(user_e type, std::string const& key) : ou_type(type), pageno(0), key(key)
{
    qpdf_assert_debug((type == ou_trailer_key) || (type == ou_root_key));
}
40
41
bool
42
QPDF::ObjUser::operator<(ObjUser const& rhs) const
43
0
{
44
0
    if (this->ou_type < rhs.ou_type) {
45
0
        return true;
46
0
    } else if (this->ou_type == rhs.ou_type) {
47
0
        if (this->pageno < rhs.pageno) {
48
0
            return true;
49
0
        } else if (this->pageno == rhs.pageno) {
50
0
            return (this->key < rhs.key);
51
0
        }
52
0
    }
53
54
0
    return false;
55
0
}
56
57
// One pending work item for updateObjectMaps(): the user on whose behalf we are traversing, the
// object to visit, and whether this object is the starting point of the traversal.
QPDF::UpdateObjectMapsFrame::UpdateObjectMapsFrame(
    QPDF::ObjUser const& ou, QPDFObjectHandle oh, bool top) :
    ou(ou), oh(oh), top(top)
{
}
64
65
void
66
QPDF::optimize(
67
    std::map<int, int> const& object_stream_data,
68
    bool allow_changes,
69
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
70
0
{
71
0
    optimize_internal(object_stream_data, allow_changes, skip_stream_parameters);
72
0
}
73
74
void
75
QPDF::optimize(
76
    QPDFWriter::ObjTable const& obj, std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
77
0
{
78
0
    optimize_internal(obj, true, skip_stream_parameters);
79
0
}
80
81
template <typename T>
82
void
83
QPDF::optimize_internal(
84
    T const& object_stream_data,
85
    bool allow_changes,
86
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
87
0
{
88
0
    if (!m->obj_user_to_objects.empty()) {
89
        // already optimized
90
0
        return;
91
0
    }
92
93
    // The PDF specification indicates that /Outlines is supposed to be an indirect reference. Force
94
    // it to be so if it exists and is direct.  (This has been seen in the wild.)
95
0
    QPDFObjectHandle root = getRoot();
96
0
    if (root.getKey("/Outlines").isDictionary()) {
97
0
        QPDFObjectHandle outlines = root.getKey("/Outlines");
98
0
        if (!outlines.isIndirect()) {
99
0
            QTC::TC("qpdf", "QPDF_optimization indirect outlines");
100
0
            root.replaceKey("/Outlines", makeIndirectObject(outlines));
101
0
        }
102
0
    }
103
104
    // Traverse pages tree pushing all inherited resources down to the page level.  This also
105
    // initializes m->all_pages.
106
0
    pushInheritedAttributesToPage(allow_changes, false);
107
108
    // Traverse pages
109
0
    int n = toI(m->all_pages.size());
110
0
    for (int pageno = 0; pageno < n; ++pageno) {
111
0
        updateObjectMaps(
112
0
            ObjUser(ObjUser::ou_page, pageno),
113
0
            m->all_pages.at(toS(pageno)),
114
0
            skip_stream_parameters);
115
0
    }
116
117
    // Traverse document-level items
118
0
    for (auto const& key: m->trailer.getKeys()) {
119
0
        if (key == "/Root") {
120
            // handled separately
121
0
        } else {
122
0
            updateObjectMaps(
123
0
                ObjUser(ObjUser::ou_trailer_key, key),
124
0
                m->trailer.getKey(key),
125
0
                skip_stream_parameters);
126
0
        }
127
0
    }
128
129
0
    for (auto const& key: root.getKeys()) {
130
        // Technically, /I keys from /Thread dictionaries are supposed to be handled separately, but
131
        // we are going to disregard that specification for now.  There is loads of evidence that
132
        // pdlin and Acrobat both disregard things like this from time to time, so this is almost
133
        // certain not to cause any problems.
134
0
        updateObjectMaps(
135
0
            ObjUser(ObjUser::ou_root_key, key), root.getKey(key), skip_stream_parameters);
136
0
    }
137
138
0
    ObjUser root_ou = ObjUser(ObjUser::ou_root);
139
0
    auto root_og = QPDFObjGen(root.getObjGen());
140
0
    m->obj_user_to_objects[root_ou].insert(root_og);
141
0
    m->object_to_obj_users[root_og].insert(root_ou);
142
143
0
    filterCompressedObjects(object_stream_data);
144
0
}
Unexecuted instantiation: void QPDF::optimize_internal<std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > >(std::__1::map<int, int, std::__1::less<int>, std::__1::allocator<std::__1::pair<int const, int> > > const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
Unexecuted instantiation: void QPDF::optimize_internal<QPDFWriter::ObjTable>(QPDFWriter::ObjTable const&, bool, std::__1::function<int (QPDFObjectHandle&)>)
145
146
void
147
QPDF::pushInheritedAttributesToPage()
148
0
{
149
    // Public API should not have access to allow_changes.
150
0
    pushInheritedAttributesToPage(true, false);
151
0
}
152
153
void
154
QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
155
970
{
156
    // Traverse pages tree pushing all inherited resources down to the page level.
157
158
    // The record of whether we've done this is cleared by updateAllPagesCache().  If we're warning
159
    // for skipped keys, re-traverse unconditionally.
160
970
    if (m->pushed_inherited_attributes_to_pages && (!warn_skipped_keys)) {
161
0
        return;
162
0
    }
163
164
    // Calling getAllPages() resolves any duplicated page objects, repairs broken nodes, and detects
165
    // loops, so we don't have to do those activities here.
166
970
    getAllPages();
167
168
    // key_ancestors is a mapping of page attribute keys to a stack of Pages nodes that contain
169
    // values for them.
170
970
    std::map<std::string, std::vector<QPDFObjectHandle>> key_ancestors;
171
970
    pushInheritedAttributesToPageInternal(
172
970
        m->trailer.getKey("/Root").getKey("/Pages"),
173
970
        key_ancestors,
174
970
        allow_changes,
175
970
        warn_skipped_keys);
176
970
    if (!key_ancestors.empty()) {
177
0
        throw std::logic_error("key_ancestors not empty after"
178
0
                               " pushing inherited attributes to pages");
179
0
    }
180
970
    m->pushed_inherited_attributes_to_pages = true;
181
970
    m->ever_pushed_inherited_attributes_to_pages = true;
182
970
}
183
184
void
185
QPDF::pushInheritedAttributesToPageInternal(
186
    QPDFObjectHandle cur_pages,
187
    std::map<std::string, std::vector<QPDFObjectHandle>>& key_ancestors,
188
    bool allow_changes,
189
    bool warn_skipped_keys)
190
1.16k
{
191
    // Make a list of inheritable keys. Only the keys /MediaBox, /CropBox, /Resources, and /Rotate
192
    // are inheritable attributes. Push this object onto the stack of pages nodes that have values
193
    // for this attribute.
194
195
1.16k
    std::set<std::string> inheritable_keys;
196
7.35k
    for (auto const& key: cur_pages.getKeys()) {
197
7.35k
        if ((key == "/MediaBox") || (key == "/CropBox") || (key == "/Resources") ||
198
7.35k
            (key == "/Rotate")) {
199
568
            if (!allow_changes) {
200
0
                throw QPDFExc(
201
0
                    qpdf_e_internal,
202
0
                    m->file->getName(),
203
0
                    m->last_object_description,
204
0
                    m->file->getLastOffset(),
205
0
                    "optimize detected an inheritable attribute when called in no-change mode");
206
0
            }
207
208
            // This is an inheritable resource
209
568
            inheritable_keys.insert(key);
210
568
            QPDFObjectHandle oh = cur_pages.getKey(key);
211
568
            QTC::TC("qpdf", "QPDF opt direct pages resource", oh.isIndirect() ? 0 : 1);
212
568
            if (!oh.isIndirect()) {
213
533
                if (!oh.isScalar()) {
214
                    // Replace shared direct object non-scalar resources with indirect objects to
215
                    // avoid copying large structures around.
216
486
                    cur_pages.replaceKey(key, makeIndirectObject(oh));
217
486
                    oh = cur_pages.getKey(key);
218
486
                } else {
219
                    // It's okay to copy scalars.
220
47
                    QTC::TC("qpdf", "QPDF opt inherited scalar");
221
47
                }
222
533
            }
223
568
            key_ancestors[key].push_back(oh);
224
568
            if (key_ancestors[key].size() > 1) {
225
31
                QTC::TC("qpdf", "QPDF opt key ancestors depth > 1");
226
31
            }
227
            // Remove this resource from this node.  It will be reattached at the page level.
228
568
            cur_pages.removeKey(key);
229
6.78k
        } else if (!((key == "/Type") || (key == "/Parent") || (key == "/Kids") ||
230
6.78k
                     (key == "/Count"))) {
231
            // Warn when flattening, but not if the key is at the top level (i.e. "/Parent" not
232
            // set), as we don't change these; but flattening removes intermediate /Pages nodes.
233
3.40k
            if ((warn_skipped_keys) && (cur_pages.hasKey("/Parent"))) {
234
888
                QTC::TC("qpdf", "QPDF unknown key not inherited");
235
888
                setLastObjectDescription("Pages object", cur_pages.getObjGen());
236
888
                warn(
237
888
                    qpdf_e_pages,
238
888
                    m->last_object_description,
239
888
                    0,
240
888
                    ("Unknown key " + key +
241
888
                     " in /Pages object is being discarded as a result of flattening the /Pages "
242
888
                     "tree"));
243
888
            }
244
3.40k
        }
245
7.35k
    }
246
247
    // Process descendant nodes. This method does not perform loop detection because all code paths
248
    // that lead here follow a call to getAllPages, which already throws an exception in the event
249
    // of a loop in the pages tree.
250
38.1k
    for (auto& kid: cur_pages.getKey("/Kids").aitems()) {
251
38.1k
        if (kid.isDictionaryOfType("/Pages")) {
252
193
            pushInheritedAttributesToPageInternal(
253
193
                kid, key_ancestors, allow_changes, warn_skipped_keys);
254
37.9k
        } else {
255
            // Add all available inheritable attributes not present in this object to this object.
256
37.9k
            for (auto const& iter: key_ancestors) {
257
4.22k
                std::string const& key = iter.first;
258
4.22k
                if (!kid.hasKey(key)) {
259
2.92k
                    QTC::TC("qpdf", "QPDF opt resource inherited");
260
2.92k
                    kid.replaceKey(key, iter.second.back());
261
2.92k
                } else {
262
1.30k
                    QTC::TC("qpdf", "QPDF opt page resource hides ancestor");
263
1.30k
                }
264
4.22k
            }
265
37.9k
        }
266
38.1k
    }
267
268
    // For each inheritable key, pop the stack.  If the stack becomes empty, remove it from the map.
269
    // That way, the invariant that the list of keys in key_ancestors is exactly those keys for
270
    // which inheritable attributes are available.
271
272
1.16k
    if (!inheritable_keys.empty()) {
273
126
        QTC::TC("qpdf", "QPDF opt inheritable keys");
274
229
        for (auto const& key: inheritable_keys) {
275
229
            key_ancestors[key].pop_back();
276
229
            if (key_ancestors[key].empty()) {
277
203
                QTC::TC("qpdf", "QPDF opt erase empty key ancestor");
278
203
                key_ancestors.erase(key);
279
203
            }
280
229
        }
281
1.03k
    } else {
282
1.03k
        QTC::TC("qpdf", "QPDF opt no inheritable keys");
283
1.03k
    }
284
1.16k
}
285
286
void
287
QPDF::updateObjectMaps(
288
    ObjUser const& first_ou,
289
    QPDFObjectHandle first_oh,
290
    std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
291
0
{
292
0
    QPDFObjGen::set visited;
293
0
    std::vector<UpdateObjectMapsFrame> pending;
294
0
    pending.emplace_back(first_ou, first_oh, true);
295
    // Traverse the object tree from this point taking care to avoid crossing page boundaries.
296
0
    std::unique_ptr<ObjUser> thumb_ou;
297
0
    while (!pending.empty()) {
298
0
        auto cur = pending.back();
299
0
        pending.pop_back();
300
301
0
        bool is_page_node = false;
302
303
0
        if (cur.oh.isDictionaryOfType("/Page")) {
304
0
            is_page_node = true;
305
0
            if (!cur.top) {
306
0
                continue;
307
0
            }
308
0
        }
309
310
0
        if (cur.oh.isIndirect()) {
311
0
            QPDFObjGen og(cur.oh.getObjGen());
312
0
            if (!visited.add(og)) {
313
0
                QTC::TC("qpdf", "QPDF opt loop detected");
314
0
                continue;
315
0
            }
316
0
            m->obj_user_to_objects[cur.ou].insert(og);
317
0
            m->object_to_obj_users[og].insert(cur.ou);
318
0
        }
319
320
0
        if (cur.oh.isArray()) {
321
0
            int n = cur.oh.getArrayNItems();
322
0
            for (int i = 0; i < n; ++i) {
323
0
                pending.emplace_back(cur.ou, cur.oh.getArrayItem(i), false);
324
0
            }
325
0
        } else if (cur.oh.isDictionary() || cur.oh.isStream()) {
326
0
            QPDFObjectHandle dict = cur.oh;
327
0
            bool is_stream = cur.oh.isStream();
328
0
            int ssp = 0;
329
0
            if (is_stream) {
330
0
                dict = cur.oh.getDict();
331
0
                if (skip_stream_parameters) {
332
0
                    ssp = skip_stream_parameters(cur.oh);
333
0
                }
334
0
            }
335
336
0
            for (auto const& key: dict.getKeys()) {
337
0
                if (is_page_node && (key == "/Thumb")) {
338
                    // Traverse page thumbnail dictionaries as a special case. There can only ever
339
                    // be one /Thumb key on a page, and we see at most one page node per call.
340
0
                    thumb_ou = std::make_unique<ObjUser>(ObjUser::ou_thumb, cur.ou.pageno);
341
0
                    pending.emplace_back(*thumb_ou, dict.getKey(key), false);
342
0
                } else if (is_page_node && (key == "/Parent")) {
343
                    // Don't traverse back up the page tree
344
0
                } else if (
345
0
                    ((ssp >= 1) && (key == "/Length")) ||
346
0
                    ((ssp >= 2) && ((key == "/Filter") || (key == "/DecodeParms")))) {
347
                    // Don't traverse into stream parameters that we are not going to write.
348
0
                } else {
349
0
                    pending.emplace_back(cur.ou, dict.getKey(key), false);
350
0
                }
351
0
            }
352
0
        }
353
0
    }
354
0
}
355
356
void
357
QPDF::filterCompressedObjects(std::map<int, int> const& object_stream_data)
358
0
{
359
0
    if (object_stream_data.empty()) {
360
0
        return;
361
0
    }
362
363
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
364
    // objects.  If something is a user of a compressed object, then it is really a user of the
365
    // object stream that contains it.
366
367
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
368
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
369
370
0
    for (auto const& i1: m->obj_user_to_objects) {
371
0
        ObjUser const& ou = i1.first;
372
        // Loop over objects.
373
0
        for (auto const& og: i1.second) {
374
0
            auto i2 = object_stream_data.find(og.getObj());
375
0
            if (i2 == object_stream_data.end()) {
376
0
                t_obj_user_to_objects[ou].insert(og);
377
0
            } else {
378
0
                t_obj_user_to_objects[ou].insert(QPDFObjGen(i2->second, 0));
379
0
            }
380
0
        }
381
0
    }
382
383
0
    for (auto const& i1: m->object_to_obj_users) {
384
0
        QPDFObjGen const& og = i1.first;
385
        // Loop over obj_users.
386
0
        for (auto const& ou: i1.second) {
387
0
            auto i2 = object_stream_data.find(og.getObj());
388
0
            if (i2 == object_stream_data.end()) {
389
0
                t_object_to_obj_users[og].insert(ou);
390
0
            } else {
391
0
                t_object_to_obj_users[QPDFObjGen(i2->second, 0)].insert(ou);
392
0
            }
393
0
        }
394
0
    }
395
396
0
    m->obj_user_to_objects = t_obj_user_to_objects;
397
0
    m->object_to_obj_users = t_object_to_obj_users;
398
0
}
399
400
void
401
QPDF::filterCompressedObjects(QPDFWriter::ObjTable const& obj)
402
0
{
403
0
    if (obj.getStreamsEmpty()) {
404
0
        return;
405
0
    }
406
407
    // Transform object_to_obj_users and obj_user_to_objects so that they refer only to uncompressed
408
    // objects.  If something is a user of a compressed object, then it is really a user of the
409
    // object stream that contains it.
410
411
0
    std::map<ObjUser, std::set<QPDFObjGen>> t_obj_user_to_objects;
412
0
    std::map<QPDFObjGen, std::set<ObjUser>> t_object_to_obj_users;
413
414
0
    for (auto const& i1: m->obj_user_to_objects) {
415
0
        ObjUser const& ou = i1.first;
416
        // Loop over objects.
417
0
        for (auto const& og: i1.second) {
418
0
            if (obj.contains(og)) {
419
0
                if (auto const& i2 = obj[og].object_stream; i2 <= 0) {
420
0
                    t_obj_user_to_objects[ou].insert(og);
421
0
                } else {
422
0
                    t_obj_user_to_objects[ou].insert(QPDFObjGen(i2, 0));
423
0
                }
424
0
            }
425
0
        }
426
0
    }
427
428
0
    for (auto const& i1: m->object_to_obj_users) {
429
0
        QPDFObjGen const& og = i1.first;
430
0
        if (obj.contains(og)) {
431
            // Loop over obj_users.
432
0
            for (auto const& ou: i1.second) {
433
0
                if (auto i2 = obj[og].object_stream; i2 <= 0) {
434
0
                    t_object_to_obj_users[og].insert(ou);
435
0
                } else {
436
0
                    t_object_to_obj_users[QPDFObjGen(i2, 0)].insert(ou);
437
0
                }
438
0
            }
439
0
        }
440
0
    }
441
442
0
    m->obj_user_to_objects = t_obj_user_to_objects;
443
0
    m->object_to_obj_users = t_object_to_obj_users;
444
0
}