Coverage Report

Created: 2025-10-10 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDF_pages.cc
Line
Count
Source
1
#include <qpdf/QPDF_private.hh>
2
3
#include <qpdf/QPDFExc.hh>
4
#include <qpdf/QPDFObjectHandle_private.hh>
5
#include <qpdf/QTC.hh>
6
#include <qpdf/QUtil.hh>
7
8
// In support of page manipulation APIs, these methods internally maintain state about pages in a
9
// pair of data structures: all_pages, which is a vector of page objects, and pageobj_to_pages_pos,
10
// which maps a page object to its position in the all_pages array. Unfortunately, the getAllPages()
11
// method returns a const reference to all_pages and has been in the public API long before the
12
// introduction of mutation APIs, so we're pretty much stuck with it. Anyway, there are lots of
13
// calls to it in the library, so the efficiency of having it cached is probably worth keeping it.
14
// At one point, I had partially implemented a helper class specifically for the pages tree, but
15
// once you work in all the logic that handles repairing the /Type keys of page tree nodes (both
16
// /Pages and /Page) and deal with duplicate pages, it's just as complex and less efficient than
17
// what's here. So, in spite of the fact that a const reference is returned, the current code is
18
// fine and does not need to be replaced. A partial implementation of QPDFPagesTree is in github in
19
// attic in case there is ever a reason to resurrect it. There are additional notes in
20
// README-maintainer, which also refers to this comment.
21
22
// The goal of this code is to ensure that the all_pages vector, which users may have a reference
23
// to, and the pageobj_to_pages_pos map, which users will not have access to, remain consistent
24
// outside of any call to the library.  As long as users only touch the /Pages structure through
25
// page-specific API calls, they never have to worry about anything, and this will also stay
26
// consistent.  If a user touches anything about the /Pages structure outside of these calls (such
27
// as by directly looking up and manipulating the underlying objects), they can call
28
// updateAllPagesCache() to bring things back in sync.
29
30
// If the user doesn't ever use the page manipulation APIs, then qpdf leaves the /Pages structure
31
// alone.  If the user does use the APIs, then we push all inheritable objects down and flatten the
32
// /Pages tree.  This makes it easier for us to keep /Pages, all_pages, and pageobj_to_pages_pos
33
// internally consistent at all times.
34
35
// Responsibility for keeping all_pages, pageobj_to_pages_pos, and the Pages structure consistent
36
// should remain in as few places as possible.  As of initial writing, only flattenPagesTree,
37
// insertPage, and removePage, along with methods they call, are concerned with it.  Everything else
38
// goes through one of those methods.
39
40
using Pages = QPDF::Doc::Pages;
41
42
std::vector<QPDFObjectHandle> const&
43
QPDF::getAllPages()
44
26.6k
{
45
    // Note that pushInheritedAttributesToPage may also be used to initialize m->all_pages.
46
26.6k
    if (m->all_pages.empty() && !m->invalid_page_found) {
47
15.7k
        m->ever_called_get_all_pages = true;
48
15.7k
        auto root = getRoot();
49
15.7k
        QPDFObjGen::set visited;
50
15.7k
        QPDFObjGen::set seen;
51
15.7k
        QPDFObjectHandle pages = root.getKey("/Pages");
52
15.7k
        bool warned = false;
53
15.7k
        bool changed_pages = false;
54
15.9k
        while (pages.isDictionary() && pages.hasKey("/Parent")) {
55
302
            if (!seen.add(pages)) {
56
                // loop -- will be detected again and reported later
57
66
                break;
58
66
            }
59
            // Files have been found in the wild where /Pages in the catalog points to the first
60
            // page. Try to work around this and similar cases with this heuristic.
61
236
            if (!warned) {
62
173
                root.warn(
63
173
                    "document page tree root (root -> /Pages) doesn't point"
64
173
                    " to the root of the page tree; attempting to correct");
65
173
                warned = true;
66
173
            }
67
236
            changed_pages = true;
68
236
            pages = pages.getKey("/Parent");
69
236
        }
70
15.7k
        if (changed_pages) {
71
172
            root.replaceKey("/Pages", pages);
72
172
        }
73
15.7k
        seen.clear();
74
15.7k
        if (!pages.hasKey("/Kids")) {
75
            // Ensure we actually found a /Pages object.
76
142
            throw QPDFExc(
77
142
                qpdf_e_pages, m->file->getName(), "", 0, "root of pages tree has no /Kids array");
78
142
        }
79
15.6k
        try {
80
15.6k
            m->pages.getAllPagesInternal(pages, visited, seen, false, false);
81
15.6k
        } catch (...) {
82
102
            m->all_pages.clear();
83
102
            m->invalid_page_found = false;
84
102
            throw;
85
102
        }
86
10.9k
        if (m->invalid_page_found) {
87
2.49k
            m->pages.flattenPagesTree();
88
2.49k
            m->invalid_page_found = false;
89
2.49k
        }
90
10.9k
    }
91
21.8k
    return m->all_pages;
92
26.6k
}
93
94
// Recursively walk the page tree starting at cur_node, appending every accepted leaf page to
// m->all_pages and repairing problems (with warnings) along the way.
//
//   cur_node  - the /Pages node to process; its /Type is forced to /Pages if it is anything else.
//   visited   - nodes (and /Kids objects) already traversed; a repeat throws QPDFExc.
//   seen      - leaf pages already recorded; used to detect a page that appears more than once.
//   media_box - true if an ancestor already supplies a rectangle /MediaBox.
//   resources - true if an ancestor already supplies a dictionary /Resources.
//
// Throws QPDFExc (qpdf_e_pages) when a loop is detected. Sets m->invalid_page_found whenever a kid
// is skipped (non-dictionary kid, duplicate page or page with too many errors when
// m->reconstructed_xref is set).
void
95
Pages::getAllPagesInternal(
96
    QPDFObjectHandle cur_node,
97
    QPDFObjGen::set& visited,
98
    QPDFObjGen::set& seen,
99
    bool media_box,
100
    bool resources)
101
12.4k
{
102
12.4k
    // Reaching the same node a second time means the tree contains a cycle.
    if (!visited.add(cur_node)) {
103
49
        throw QPDFExc(
104
49
            qpdf_e_pages,
105
49
            m->file->getName(),
106
49
            "object " + cur_node.getObjGen().unparse(' '),
107
49
            0,
108
49
            "Loop detected in /Pages structure (getAllPages)");
109
49
    }
110
12.3k
    if (!cur_node.isDictionaryOfType("/Pages")) {
111
        // During fuzzing files were encountered where the root object appeared in the pages tree.
112
        // Unconditionally setting the /Type to /Pages could cause problems, but trying to
113
        // accommodate the possibility may be excessive.
114
9.92k
        cur_node.warn("/Type key should be /Pages but is not; overriding");
115
9.92k
        cur_node.replaceKey("/Type", "/Pages"_qpdf);
116
9.92k
    }
117
12.3k
    // Once a node supplies a rectangle /MediaBox, the flag stays set for everything below it.
    if (!media_box) {
118
12.1k
        media_box = cur_node.getKey("/MediaBox").isRectangle();
119
12.1k
        QTC::TC("qpdf", "QPDF inherit mediabox", media_box ? 0 : 1);
120
12.1k
    }
121
12.3k
    // Same for a dictionary /Resources.
    if (!resources) {
122
12.1k
        resources = cur_node.getKey("/Resources").isDictionary();
123
12.1k
    }
124
12.3k
    auto kids = cur_node.getKey("/Kids");
125
12.3k
    // The /Kids object is added to the same visited set; a repeat is reported as a loop.
    if (!visited.add(kids)) {
126
8
        throw QPDFExc(
127
8
            qpdf_e_pages,
128
8
            m->file->getName(),
129
8
            "object " + cur_node.getObjGen().unparse(' '),
130
8
            0,
131
8
            "Loop detected in /Pages structure (getAllPages)");
132
8
    }
133
12.3k
    // i is the zero-based index of the current kid; it is used only in warning messages.
    int i = -1;
134
49.0k
    for (auto& kid: kids.as_array()) {
135
49.0k
        ++i;
136
49.0k
        // Number of repairs applied to this kid; checked against a threshold at the end of the
        // loop body.
        int errors = 0;
137
138
49.0k
        if (!kid.isDictionary()) {
139
32.1k
            kid.warn("Pages tree includes non-dictionary object; ignoring");
140
32.1k
            m->invalid_page_found = true;
141
32.1k
            continue;
142
32.1k
        }
143
16.8k
        if (!kid.isIndirect()) {
144
1.60k
            cur_node.warn(
145
1.60k
                "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect");
146
1.60k
            // NOTE(review): kid is bound by reference, so this assignment presumably replaces the
            // entry in the /Kids array -- confirm against as_array() semantics.
            kid = qpdf.makeIndirectObject(kid);
147
1.60k
            ++errors;
148
1.60k
        }
149
16.8k
        // A kid that has /Kids is treated as an interior node and recursed into; anything else is
        // treated as a leaf page.
        if (kid.hasKey("/Kids")) {
150
1.37k
            getAllPagesInternal(kid, visited, seen, media_box, resources);
151
15.4k
        } else {
152
15.4k
            if (!media_box && !kid.getKey("/MediaBox").isRectangle()) {
153
6.80k
                kid.warn(
154
6.80k
                    "kid " + std::to_string(i) +
155
6.80k
                    " (from 0) MediaBox is undefined; setting to letter / ANSI A");
156
6.80k
                kid.replaceKey(
157
6.80k
                    "/MediaBox",
158
6.80k
                    QPDFObjectHandle::newArray(QPDFObjectHandle::Rectangle(0, 0, 612, 792)));
159
6.80k
                ++errors;
160
6.80k
            }
161
15.4k
            if (!resources) {
162
13.3k
                auto res = kid.getKey("/Resources");
163
164
13.3k
                if (!res.isDictionary()) {
165
8.54k
                    ++errors;
166
8.54k
                    kid.warn(
167
8.54k
                        "kid " + std::to_string(i) +
168
8.54k
                        " (from 0) Resources is missing or invalid; repairing");
169
8.54k
                    kid.replaceKey("/Resources", QPDFObjectHandle::newDictionary());
170
8.54k
                }
171
13.3k
            }
172
15.4k
            auto annots = kid.getKey("/Annots");
173
15.4k
            if (!annots.null()) {
174
1.44k
                if (!annots.isArray()) {
175
44
                    kid.warn(
176
44
                        "kid " + std::to_string(i) + " (from 0) Annots is not an array; removing");
177
44
                    kid.removeKey("/Annots");
178
44
                    ++errors;
179
1.39k
                } else {
180
1.39k
                    // Duplicate annotations are only reported and counted here; the array itself
                    // is left unchanged.
                    QPDFObjGen::set seen_annots;
181
42.2k
                    for (auto& annot: annots.as_array()) {
182
42.2k
                        if (!seen_annots.add(annot)) {
183
853
                            kid.warn(
184
853
                                "kid " + std::to_string(i) +
185
853
                                " (from 0) Annots has duplicate entry for annotation " +
186
853
                                annot.id_gen().unparse(' '));
187
853
                            ++errors;
188
853
                        }
189
42.2k
                    }
190
1.39k
                }
191
1.44k
            }
192
193
15.4k
            if (!seen.add(kid)) {
194
                // Make a copy of the page. This does the same as shallowCopyPage in
195
                // QPDFPageObjectHelper.
196
2.10k
                if (!m->reconstructed_xref) {
197
37
                    cur_node.warn(
198
37
                        "kid " + std::to_string(i) +
199
37
                        " (from 0) appears more than once in the pages tree;"
200
37
                        " creating a new page object as a copy");
201
                    // This needs to be fixed. shallowCopy does not necessarily produce a valid
202
                    // page.
203
37
                    kid = qpdf.makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
204
37
                    seen.add(kid);
205
2.07k
                } else {
206
2.07k
                    cur_node.warn(
207
2.07k
                        "kid " + std::to_string(i) +
208
2.07k
                        " (from 0) appears more than once in the pages tree; ignoring duplicate");
209
2.07k
                    m->invalid_page_found = true;
210
2.07k
                    kid = QPDFObjectHandle::newNull();
211
2.07k
                    continue;
212
2.07k
                }
213
37
                // NOTE(review): this point is reached only when m->reconstructed_xref is false
                // (the other branch continues), while errors is read below only when it is true,
                // so the increment currently has no effect. Confirm whether this check was meant
                // to sit outside the duplicate-page branch.
                if (!kid.getKey("/Parent").isSameObjectAs(cur_node)) {
214
                    // Consider fixing and adding an information message.
215
17
                    ++errors;
216
17
                }
217
37
            }
218
13.3k
            if (!kid.isDictionaryOfType("/Page")) {
219
6.41k
                kid.warn("/Type key should be /Page but is not; overriding");
220
6.41k
                kid.replaceKey("/Type", "/Page"_qpdf);
221
6.41k
                ++errors;
222
6.41k
            }
223
13.3k
            // When m->reconstructed_xref is set, a kid that needed more than two repairs is
            // dropped instead of being kept as a page.
            if (m->reconstructed_xref && errors > 2) {
224
777
                cur_node.warn(
225
777
                    "kid " + std::to_string(i) + " (from 0) has too many errors; ignoring page");
226
777
                m->invalid_page_found = true;
227
777
                kid = QPDFObjectHandle::newNull();
228
777
                continue;
229
777
            }
230
12.6k
            m->all_pages.emplace_back(kid);
231
12.6k
        }
232
16.8k
    }
233
12.3k
}
234
235
void
236
QPDF::updateAllPagesCache()
237
0
{
238
    // Force regeneration of the pages cache.  We force immediate recalculation of all_pages since
239
    // users may have references to it that they got from calls to getAllPages().  We can defer
240
    // recalculation of pageobj_to_pages_pos until needed.
241
0
    m->all_pages.clear();
242
0
    m->pageobj_to_pages_pos.clear();
243
0
    m->pushed_inherited_attributes_to_pages = false;
244
0
    getAllPages();
245
0
}
246
247
void
248
Pages::flattenPagesTree()
249
2.49k
{
250
    // If not already done, flatten the /Pages structure and initialize pageobj_to_pages_pos.
251
252
2.49k
    if (!m->pageobj_to_pages_pos.empty()) {
253
0
        return;
254
0
    }
255
256
    // Push inherited objects down to the /Page level.  As a side effect m->all_pages will also be
257
    // generated.
258
2.49k
    pushInheritedAttributesToPage(true, true);
259
260
2.49k
    QPDFObjectHandle pages = qpdf.getRoot().getKey("/Pages");
261
262
2.49k
    size_t const len = m->all_pages.size();
263
5.42k
    for (size_t pos = 0; pos < len; ++pos) {
264
        // Populate pageobj_to_pages_pos and fix parent pointer. There should be no duplicates at
265
        // this point because pushInheritedAttributesToPage calls getAllPages which resolves
266
        // duplicates.
267
2.93k
        insertPageobjToPage(m->all_pages.at(pos), toI(pos), true);
268
2.93k
        m->all_pages.at(pos).replaceKey("/Parent", pages);
269
2.93k
    }
270
271
2.49k
    pages.replaceKey("/Kids", QPDFObjectHandle::newArray(m->all_pages));
272
    // /Count has not changed
273
2.49k
    if (pages.getKey("/Count").getUIntValue() != len) {
274
1.17k
        if (m->invalid_page_found && pages.getKey("/Count").getUIntValue() > len) {
275
1.12k
            pages.replaceKey("/Count", QPDFObjectHandle::newInteger(toI(len)));
276
1.12k
        } else {
277
49
            throw std::runtime_error("/Count is wrong after flattening pages tree");
278
49
        }
279
1.17k
    }
280
2.49k
}
281
282
void
283
Pages::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, bool check_duplicate)
284
2.93k
{
285
2.93k
    QPDFObjGen og(obj.getObjGen());
286
2.93k
    if (check_duplicate) {
287
2.93k
        if (!m->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second) {
288
            // The library never calls insertPageobjToPage in a way that causes this to happen.
289
0
            throw QPDFExc(
290
0
                qpdf_e_pages,
291
0
                m->file->getName(),
292
0
                "page " + std::to_string(pos) + " (numbered from zero): object " + og.unparse(' '),
293
0
                0,
294
0
                "duplicate page reference found; this would cause loss of data");
295
0
        }
296
2.93k
    } else {
297
0
        m->pageobj_to_pages_pos[og] = pos;
298
0
    }
299
2.93k
}
300
301
void
302
Pages::insertPage(QPDFObjectHandle newpage, int pos)
303
0
{
304
    // pos is numbered from 0, so pos = 0 inserts at the beginning and pos = npages adds to the end.
305
306
0
    flattenPagesTree();
307
308
0
    if (!newpage.isIndirect()) {
309
0
        newpage = qpdf.makeIndirectObject(newpage);
310
0
    } else if (newpage.getOwningQPDF() != &qpdf) {
311
0
        newpage.getQPDF().pushInheritedAttributesToPage();
312
0
        newpage = qpdf.copyForeignObject(newpage);
313
0
    } else {
314
0
        QTC::TC("qpdf", "QPDF insert indirect page");
315
0
    }
316
317
0
    if (pos < 0 || toS(pos) > m->all_pages.size()) {
318
0
        throw std::runtime_error("QPDF::insertPage called with pos out of range");
319
0
    }
320
321
0
    QTC::TC(
322
0
        "qpdf",
323
0
        "QPDF insert page",
324
0
        (pos == 0) ? 0 :                            // insert at beginning
325
0
            (pos == toI(m->all_pages.size())) ? 1   // at end
326
0
                                              : 2); // insert in middle
327
328
0
    auto og = newpage.getObjGen();
329
0
    if (m->pageobj_to_pages_pos.contains(og)) {
330
0
        newpage = qpdf.makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy());
331
0
    }
332
333
0
    QPDFObjectHandle pages = qpdf.getRoot().getKey("/Pages");
334
0
    QPDFObjectHandle kids = pages.getKey("/Kids");
335
336
0
    newpage.replaceKey("/Parent", pages);
337
0
    kids.insertItem(pos, newpage);
338
0
    int npages = static_cast<int>(kids.size());
339
0
    pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
340
0
    m->all_pages.insert(m->all_pages.begin() + pos, newpage);
341
0
    for (int i = pos + 1; i < npages; ++i) {
342
0
        insertPageobjToPage(m->all_pages.at(toS(i)), i, false);
343
0
    }
344
0
    insertPageobjToPage(newpage, pos, true);
345
0
}
346
347
void
348
QPDF::removePage(QPDFObjectHandle page)
349
0
{
350
0
    int pos = findPage(page); // also ensures flat /Pages
351
0
    QTC::TC(
352
0
        "qpdf",
353
0
        "QPDF remove page",
354
0
        (pos == 0) ? 0 :                                // remove at beginning
355
0
            (pos == toI(m->all_pages.size() - 1)) ? 1   // end
356
0
                                                  : 2); // remove in middle
357
358
0
    QPDFObjectHandle pages = getRoot().getKey("/Pages");
359
0
    QPDFObjectHandle kids = pages.getKey("/Kids");
360
361
0
    kids.eraseItem(pos);
362
0
    int npages = static_cast<int>(kids.size());
363
0
    pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
364
0
    m->all_pages.erase(m->all_pages.begin() + pos);
365
0
    m->pageobj_to_pages_pos.erase(page.getObjGen());
366
0
    for (int i = pos; i < npages; ++i) {
367
0
        m->pages.insertPageobjToPage(m->all_pages.at(toS(i)), i, false);
368
0
    }
369
0
}
370
371
void
372
QPDF::addPageAt(QPDFObjectHandle newpage, bool before, QPDFObjectHandle refpage)
373
0
{
374
0
    int refpos = findPage(refpage);
375
0
    if (!before) {
376
0
        ++refpos;
377
0
    }
378
0
    m->pages.insertPage(newpage, refpos);
379
0
}
380
381
void
382
QPDF::addPage(QPDFObjectHandle newpage, bool first)
383
0
{
384
0
    if (first) {
385
0
        m->pages.insertPage(newpage, 0);
386
0
    } else {
387
0
        m->pages.insertPage(
388
0
            newpage, getRoot().getKey("/Pages").getKey("/Count").getIntValueAsInt());
389
0
    }
390
0
}
391
392
int
393
QPDF::findPage(QPDFObjectHandle& page)
394
0
{
395
0
    return findPage(page.getObjGen());
396
0
}
397
398
int
399
QPDF::findPage(QPDFObjGen og)
400
0
{
401
0
    m->pages.flattenPagesTree();
402
0
    auto it = m->pageobj_to_pages_pos.find(og);
403
0
    if (it == m->pageobj_to_pages_pos.end()) {
404
0
        throw QPDFExc(
405
0
            qpdf_e_pages,
406
0
            m->file->getName(),
407
0
            "page object: object " + og.unparse(' '),
408
0
            0,
409
0
            "page object not referenced in /Pages tree");
410
0
    }
411
0
    return (*it).second;
412
0
}