Coverage Report

Created: 2025-06-22 06:28

/src/qpdf/libqpdf/QPDF_pages.cc
Line
Count
Source (jump to first uncovered line)
1
#include <qpdf/QPDF_private.hh>
2
3
#include <qpdf/QPDFExc.hh>
4
#include <qpdf/QPDFObjectHandle_private.hh>
5
#include <qpdf/QTC.hh>
6
#include <qpdf/QUtil.hh>
7
8
// In support of page manipulation APIs, these methods internally maintain state about pages in a
9
// pair of data structures: all_pages, which is a vector of page objects, and pageobj_to_pages_pos,
10
// which maps a page object to its position in the all_pages array. Unfortunately, the getAllPages()
11
// method returns a const reference to all_pages and has been in the public API long before the
12
// introduction of mutation APIs, so we're pretty much stuck with it. Anyway, there are lots of
13
// calls to it in the library, so the efficiency of having it cached is probably worth keeping it.
14
// At one point, I had partially implemented a helper class specifically for the pages tree, but
15
// once you work in all the logic that handles repairing the /Type keys of page tree nodes (both
16
// /Pages and /Page) and deal with duplicate pages, it's just as complex and less efficient than
17
// what's here. So, in spite of the fact that a const reference is returned, the current code is
18
// fine and does not need to be replaced. A partial implementation of QPDFPagesTree is in github in
19
// attic in case there is ever a reason to resurrect it. There are additional notes in
20
// README-maintainer, which also refers to this comment.
21
22
// The goal of this code is to ensure that the all_pages vector, which users may have a reference
23
// to, and the pageobj_to_pages_pos map, which users will not have access to, remain consistent
24
// outside of any call to the library.  As long as users only touch the /Pages structure through
25
// page-specific API calls, they never have to worry about anything, and this will also stay
26
// consistent.  If a user touches anything about the /Pages structure outside of these calls (such
27
// as by directly looking up and manipulating the underlying objects), they can call
28
// updateAllPagesCache() to bring things back in sync.
29
30
// If the user doesn't ever use the page manipulation APIs, then qpdf leaves the /Pages structure
31
// alone.  If the user does use the APIs, then we push all inheritable objects down and flatten the
32
// /Pages tree.  This makes it easier for us to keep /Pages, all_pages, and pageobj_to_pages_pos
33
// internally consistent at all times.
34
35
// Responsibility for keeping all_pages, pageobj_to_pages_pos, and the Pages structure consistent
36
// should remain in as few places as possible.  As of initial writing, only flattenPagesTree,
37
// insertPage, and removePage, along with methods they call, are concerned with it.  Everything else
38
// goes through one of those methods.
39
40
std::vector<QPDFObjectHandle> const&
41
QPDF::getAllPages()
42
43.0k
{
43
    // Note that pushInheritedAttributesToPage may also be used to initialize m->all_pages.
44
43.0k
    if (m->all_pages.empty() && !m->invalid_page_found) {
45
14.2k
        m->ever_called_get_all_pages = true;
46
14.2k
        auto root = getRoot();
47
14.2k
        QPDFObjGen::set visited;
48
14.2k
        QPDFObjGen::set seen;
49
14.2k
        QPDFObjectHandle pages = root.getKey("/Pages");
50
14.2k
        bool warned = false;
51
14.2k
        bool changed_pages = false;
52
15.3k
        while (pages.isDictionary() && pages.hasKey("/Parent")) {
53
1.62k
            if (!seen.add(pages)) {
54
                // loop -- will be detected again and reported later
55
493
                break;
56
493
            }
57
            // Files have been found in the wild where /Pages in the catalog points to the first
58
            // page. Try to work around this and similar cases with this heuristic.
59
1.13k
            if (!warned) {
60
722
                root.warnIfPossible(
61
722
                    "document page tree root (root -> /Pages) doesn't point"
62
722
                    " to the root of the page tree; attempting to correct");
63
722
                warned = true;
64
722
            }
65
1.13k
            changed_pages = true;
66
1.13k
            pages = pages.getKey("/Parent");
67
1.13k
        }
68
14.2k
        if (changed_pages) {
69
720
            root.replaceKey("/Pages", pages);
70
720
        }
71
14.2k
        seen.clear();
72
14.2k
        if (!pages.hasKey("/Kids")) {
73
            // Ensure we actually found a /Pages object.
74
212
            throw QPDFExc(
75
212
                qpdf_e_pages, m->file->getName(), "", 0, "root of pages tree has no /Kids array");
76
212
        }
77
14.0k
        try {
78
14.0k
            getAllPagesInternal(pages, visited, seen, false, false);
79
14.0k
        } catch (...) {
80
82
            m->all_pages.clear();
81
82
            m->invalid_page_found = false;
82
82
            throw;
83
82
        }
84
9.75k
        if (m->invalid_page_found) {
85
4.67k
            flattenPagesTree();
86
4.67k
            m->invalid_page_found = false;
87
4.67k
        }
88
9.75k
    }
89
38.5k
    return m->all_pages;
90
43.0k
}
91
92
void
93
QPDF::getAllPagesInternal(
94
    QPDFObjectHandle cur_node,
95
    QPDFObjGen::set& visited,
96
    QPDFObjGen::set& seen,
97
    bool media_box,
98
    bool resources)
99
10.9k
{
100
10.9k
    if (!visited.add(cur_node)) {
101
27
        throw QPDFExc(
102
27
            qpdf_e_pages,
103
27
            m->file->getName(),
104
27
            "object " + cur_node.getObjGen().unparse(' '),
105
27
            0,
106
27
            "Loop detected in /Pages structure (getAllPages)");
107
27
    }
108
10.9k
    if (!cur_node.isDictionaryOfType("/Pages")) {
109
        // During fuzzing files were encountered where the root object appeared in the pages tree.
110
        // Unconditionally setting the /Type to /Pages could cause problems, but trying to
111
        // accommodate the possibility may be excessive.
112
6.01k
        cur_node.warnIfPossible("/Type key should be /Pages but is not; overriding");
113
6.01k
        cur_node.replaceKey("/Type", "/Pages"_qpdf);
114
6.01k
    }
115
10.9k
    if (!media_box) {
116
10.8k
        media_box = cur_node.getKey("/MediaBox").isRectangle();
117
10.8k
        QTC::TC("qpdf", "QPDF inherit mediabox", media_box ? 0 : 1);
118
10.8k
    }
119
10.9k
    if (!resources) {
120
10.8k
        resources = cur_node.getKey("/Resources").isDictionary();
121
10.8k
    }
122
10.9k
    auto kids = cur_node.getKey("/Kids");
123
10.9k
    if (!visited.add(kids)) {
124
12
        throw QPDFExc(
125
12
            qpdf_e_pages,
126
12
            m->file->getName(),
127
12
            "object " + cur_node.getObjGen().unparse(' '),
128
12
            0,
129
12
            "Loop detected in /Pages structure (getAllPages)");
130
12
    }
131
10.9k
    int i = -1;
132
136k
    for (auto& kid: kids.as_array()) {
133
136k
        ++i;
134
136k
        int errors = 0;
135
136
136k
        if (!kid.isDictionary()) {
137
101k
            kid.warnIfPossible("Pages tree includes non-dictionary object; ignoring");
138
101k
            m->invalid_page_found = true;
139
101k
            continue;
140
101k
        }
141
35.3k
        if (!kid.isIndirect()) {
142
599
            QTC::TC("qpdf", "QPDF handle direct page object");
143
599
            cur_node.warnIfPossible(
144
599
                "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect");
145
599
            kid = makeIndirectObject(kid);
146
599
            ++errors;
147
599
        }
148
35.3k
        if (kid.hasKey("/Kids")) {
149
1.14k
            getAllPagesInternal(kid, visited, seen, media_box, resources);
150
34.2k
        } else {
151
34.2k
            if (!media_box && !kid.getKey("/MediaBox").isRectangle()) {
152
8.64k
                QTC::TC("qpdf", "QPDF missing mediabox");
153
8.64k
                kid.warnIfPossible(
154
8.64k
                    "kid " + std::to_string(i) +
155
8.64k
                    " (from 0) MediaBox is undefined; setting to letter / ANSI A");
156
8.64k
                kid.replaceKey(
157
8.64k
                    "/MediaBox",
158
8.64k
                    QPDFObjectHandle::newArray(QPDFObjectHandle::Rectangle(0, 0, 612, 792)));
159
8.64k
                ++errors;
160
8.64k
            }
161
34.2k
            if (!resources && !kid.getKey("/Resources").isDictionary()) {
162
                // Consider adding an information message
163
22.1k
                ++errors;
164
22.1k
            }
165
34.2k
            if (!seen.add(kid)) {
166
                // Make a copy of the page. This does the same as shallowCopyPage in
167
                // QPDFPageObjectHelper.
168
14.4k
                QTC::TC("qpdf", "QPDF resolve duplicated page object");
169
14.4k
                if (!m->in_xref_reconstruction) {
170
12.7k
                    cur_node.warnIfPossible(
171
12.7k
                        "kid " + std::to_string(i) +
172
12.7k
                        " (from 0) appears more than once in the pages tree;"
173
12.7k
                        " creating a new page object as a copy");
174
                    // This needs to be fixed. shallowCopy does not necessarily produce a valid
175
                    // page.
176
12.7k
                    kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
177
12.7k
                    seen.add(kid);
178
12.7k
                } else {
179
1.78k
                    cur_node.warnIfPossible(
180
1.78k
                        "kid " + std::to_string(i) +
181
1.78k
                        " (from 0) appears more than once in the pages tree; ignoring duplicate");
182
1.78k
                    m->invalid_page_found = true;
183
1.78k
                    kid = QPDFObjectHandle::newNull();
184
1.78k
                    continue;
185
1.78k
                }
186
12.7k
                if (!kid.getKey("/Parent").isSameObjectAs(cur_node)) {
187
                    // Consider fixing and adding an information message.
188
12.3k
                    ++errors;
189
12.3k
                }
190
12.7k
            }
191
32.4k
            if (!kid.isDictionaryOfType("/Page")) {
192
7.35k
                kid.warnIfPossible("/Type key should be /Page but is not; overriding");
193
7.35k
                kid.replaceKey("/Type", "/Page"_qpdf);
194
7.35k
                ++errors;
195
7.35k
            }
196
32.4k
            if (m->in_xref_reconstruction && errors > 2) {
197
1.48k
                cur_node.warnIfPossible(
198
1.48k
                    "kid " + std::to_string(i) + " (from 0) has too many errors; ignoring page");
199
1.48k
                m->invalid_page_found = true;
200
1.48k
                kid = QPDFObjectHandle::newNull();
201
1.48k
                continue;
202
1.48k
            }
203
30.9k
            m->all_pages.emplace_back(kid);
204
30.9k
        }
205
35.3k
    }
206
10.9k
}
207
208
void
209
QPDF::updateAllPagesCache()
210
0
{
211
    // Force regeneration of the pages cache.  We force immediate recalculation of all_pages since
212
    // users may have references to it that they got from calls to getAllPages().  We can defer
213
    // recalculation of pageobj_to_pages_pos until needed.
214
0
    QTC::TC("qpdf", "QPDF updateAllPagesCache");
215
0
    m->all_pages.clear();
216
0
    m->pageobj_to_pages_pos.clear();
217
0
    m->pushed_inherited_attributes_to_pages = false;
218
0
    getAllPages();
219
0
}
220
221
void
222
QPDF::flattenPagesTree()
223
4.67k
{
224
    // If not already done, flatten the /Pages structure and initialize pageobj_to_pages_pos.
225
226
4.67k
    if (!m->pageobj_to_pages_pos.empty()) {
227
0
        return;
228
0
    }
229
230
    // Push inherited objects down to the /Page level.  As a side effect m->all_pages will also be
231
    // generated.
232
4.67k
    pushInheritedAttributesToPage(true, true);
233
234
4.67k
    QPDFObjectHandle pages = getRoot().getKey("/Pages");
235
236
4.67k
    size_t const len = m->all_pages.size();
237
27.8k
    for (size_t pos = 0; pos < len; ++pos) {
238
        // Populate pageobj_to_pages_pos and fix parent pointer. There should be no duplicates at
239
        // this point because pushInheritedAttributesToPage calls getAllPages which resolves
240
        // duplicates.
241
23.2k
        insertPageobjToPage(m->all_pages.at(pos), toI(pos), true);
242
23.2k
        m->all_pages.at(pos).replaceKey("/Parent", pages);
243
23.2k
    }
244
245
4.67k
    pages.replaceKey("/Kids", QPDFObjectHandle::newArray(m->all_pages));
246
    // /Count has not changed
247
4.67k
    if (pages.getKey("/Count").getUIntValue() != len) {
248
3.96k
        if (m->invalid_page_found && pages.getKey("/Count").getUIntValue() > len) {
249
3.94k
            pages.replaceKey("/Count", QPDFObjectHandle::newInteger(toI(len)));
250
3.94k
        } else {
251
26
            throw std::runtime_error("/Count is wrong after flattening pages tree");
252
26
        }
253
3.96k
    }
254
4.67k
}
255
256
void
257
QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, bool check_duplicate)
258
23.2k
{
259
23.2k
    QPDFObjGen og(obj.getObjGen());
260
23.2k
    if (check_duplicate) {
261
23.2k
        if (!m->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second) {
262
            // The library never calls insertPageobjToPage in a way that causes this to happen.
263
0
            setLastObjectDescription("page " + std::to_string(pos) + " (numbered from zero)", og);
264
0
            throw QPDFExc(
265
0
                qpdf_e_pages,
266
0
                m->file->getName(),
267
0
                m->last_object_description,
268
0
                0,
269
0
                "duplicate page reference found; this would cause loss of data");
270
0
        }
271
23.2k
    } else {
272
0
        m->pageobj_to_pages_pos[og] = pos;
273
0
    }
274
23.2k
}
275
276
void
277
QPDF::insertPage(QPDFObjectHandle newpage, int pos)
278
0
{
279
    // pos is numbered from 0, so pos = 0 inserts at the beginning and pos = npages adds to the end.
280
281
0
    flattenPagesTree();
282
283
0
    if (!newpage.isIndirect()) {
284
0
        QTC::TC("qpdf", "QPDF insert non-indirect page");
285
0
        newpage = makeIndirectObject(newpage);
286
0
    } else if (newpage.getOwningQPDF() != this) {
287
0
        QTC::TC("qpdf", "QPDF insert foreign page");
288
0
        newpage.getQPDF().pushInheritedAttributesToPage();
289
0
        newpage = copyForeignObject(newpage);
290
0
    } else {
291
0
        QTC::TC("qpdf", "QPDF insert indirect page");
292
0
    }
293
294
0
    if ((pos < 0) || (toS(pos) > m->all_pages.size())) {
295
0
        throw std::runtime_error("QPDF::insertPage called with pos out of range");
296
0
    }
297
298
0
    QTC::TC(
299
0
        "qpdf",
300
0
        "QPDF insert page",
301
0
        (pos == 0) ? 0 :                            // insert at beginning
302
0
            (pos == toI(m->all_pages.size())) ? 1   // at end
303
0
                                              : 2); // insert in middle
304
305
0
    auto og = newpage.getObjGen();
306
0
    if (m->pageobj_to_pages_pos.contains(og)) {
307
0
        QTC::TC("qpdf", "QPDF resolve duplicated page in insert");
308
0
        newpage = makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy());
309
0
    }
310
311
0
    QPDFObjectHandle pages = getRoot().getKey("/Pages");
312
0
    QPDFObjectHandle kids = pages.getKey("/Kids");
313
314
0
    newpage.replaceKey("/Parent", pages);
315
0
    kids.insertItem(pos, newpage);
316
0
    int npages = kids.getArrayNItems();
317
0
    pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
318
0
    m->all_pages.insert(m->all_pages.begin() + pos, newpage);
319
0
    for (int i = pos + 1; i < npages; ++i) {
320
0
        insertPageobjToPage(m->all_pages.at(toS(i)), i, false);
321
0
    }
322
0
    insertPageobjToPage(newpage, pos, true);
323
0
}
324
325
void
326
QPDF::removePage(QPDFObjectHandle page)
327
0
{
328
0
    int pos = findPage(page); // also ensures flat /Pages
329
0
    QTC::TC(
330
0
        "qpdf",
331
0
        "QPDF remove page",
332
0
        (pos == 0) ? 0 :                                // remove at beginning
333
0
            (pos == toI(m->all_pages.size() - 1)) ? 1   // end
334
0
                                                  : 2); // remove in middle
335
336
0
    QPDFObjectHandle pages = getRoot().getKey("/Pages");
337
0
    QPDFObjectHandle kids = pages.getKey("/Kids");
338
339
0
    kids.eraseItem(pos);
340
0
    int npages = kids.getArrayNItems();
341
0
    pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
342
0
    m->all_pages.erase(m->all_pages.begin() + pos);
343
0
    m->pageobj_to_pages_pos.erase(page.getObjGen());
344
0
    for (int i = pos; i < npages; ++i) {
345
0
        insertPageobjToPage(m->all_pages.at(toS(i)), i, false);
346
0
    }
347
0
}
348
349
void
350
QPDF::addPageAt(QPDFObjectHandle newpage, bool before, QPDFObjectHandle refpage)
351
0
{
352
0
    int refpos = findPage(refpage);
353
0
    if (!before) {
354
0
        ++refpos;
355
0
    }
356
0
    insertPage(newpage, refpos);
357
0
}
358
359
void
360
QPDF::addPage(QPDFObjectHandle newpage, bool first)
361
0
{
362
0
    if (first) {
363
0
        insertPage(newpage, 0);
364
0
    } else {
365
0
        insertPage(newpage, getRoot().getKey("/Pages").getKey("/Count").getIntValueAsInt());
366
0
    }
367
0
}
368
369
int
370
QPDF::findPage(QPDFObjectHandle& page)
371
0
{
372
0
    return findPage(page.getObjGen());
373
0
}
374
375
int
376
QPDF::findPage(QPDFObjGen og)
377
0
{
378
0
    flattenPagesTree();
379
0
    auto it = m->pageobj_to_pages_pos.find(og);
380
0
    if (it == m->pageobj_to_pages_pos.end()) {
381
0
        QTC::TC("qpdf", "QPDF_pages findPage not found");
382
0
        setLastObjectDescription("page object", og);
383
0
        throw QPDFExc(
384
0
            qpdf_e_pages,
385
0
            m->file->getName(),
386
0
            m->last_object_description,
387
0
            0,
388
0
            "page object not referenced in /Pages tree");
389
0
    }
390
0
    return (*it).second;
391
0
}