/src/qpdf/libqpdf/QPDF_pages.cc
Line | Count | Source |
1 | | #include <qpdf/QPDF_private.hh> |
2 | | |
3 | | #include <qpdf/QPDFExc.hh> |
4 | | #include <qpdf/QPDFObjectHandle_private.hh> |
5 | | #include <qpdf/QTC.hh> |
6 | | #include <qpdf/QUtil.hh> |
7 | | |
8 | | // In support of page manipulation APIs, these methods internally maintain state about pages in a |
9 | | // pair of data structures: all_pages, which is a vector of page objects, and pageobj_to_pages_pos, |
10 | | // which maps a page object to its position in the all_pages array. Unfortunately, the getAllPages() |
11 | | // method returns a const reference to all_pages and has been in the public API long before the |
12 | | // introduction of mutation APIs, so we're pretty much stuck with it. Anyway, there are lots of |
13 | | // calls to it in the library, so the efficiency of having it cached is probably worth keeping it. |
14 | | // At one point, I had partially implemented a helper class specifically for the pages tree, but |
15 | | // once you work in all the logic that handles repairing the /Type keys of page tree nodes (both |
16 | | // /Pages and /Page) and deal with duplicate pages, it's just as complex and less efficient than |
17 | | // what's here. So, in spite of the fact that a const reference is returned, the current code is |
18 | | // fine and does not need to be replaced. A partial implementation of QPDFPagesTree is in github in |
19 | | // attic in case there is ever a reason to resurrect it. There are additional notes in |
20 | | // README-maintainer, which also refers to this comment. |
21 | | |
22 | | // The goal of this code is to ensure that the all_pages vector, which users may have a reference |
23 | | // to, and the pageobj_to_pages_pos map, which users will not have access to, remain consistent |
24 | | // outside of any call to the library. As long as users only touch the /Pages structure through |
25 | | // page-specific API calls, they never have to worry about anything, and this will also stay |
26 | | // consistent. If a user touches anything about the /Pages structure outside of these calls (such |
27 | | // as by directly looking up and manipulating the underlying objects), they can call |
28 | | // updateAllPagesCache() to bring things back in sync. |
29 | | |
30 | | // If the user doesn't ever use the page manipulation APIs, then qpdf leaves the /Pages structure |
31 | | // alone. If the user does use the APIs, then we push all inheritable objects down and flatten the |
32 | | // /Pages tree. This makes it easier for us to keep /Pages, all_pages, and pageobj_to_pages_pos |
33 | | // internally consistent at all times. |
34 | | |
35 | | // Responsibility for keeping all_pages, pageobj_to_pages_pos, and the Pages structure consistent |
36 | | // should remain in as few places as possible. As of initial writing, only flattenPagesTree, |
37 | | // insertPage, and removePage, along with methods they call, are concerned with it. Everything else |
38 | | // goes through one of those methods. |
39 | | |
40 | | using Pages = QPDF::Doc::Pages; |
41 | | |
42 | | std::vector<QPDFObjectHandle> const& |
43 | | QPDF::getAllPages() |
44 | 26.6k | { |
45 | | // Note that pushInheritedAttributesToPage may also be used to initialize m->all_pages. |
46 | 26.6k | if (m->all_pages.empty() && !m->invalid_page_found) { |
47 | 15.7k | m->ever_called_get_all_pages = true; |
48 | 15.7k | auto root = getRoot(); |
49 | 15.7k | QPDFObjGen::set visited; |
50 | 15.7k | QPDFObjGen::set seen; |
51 | 15.7k | QPDFObjectHandle pages = root.getKey("/Pages"); |
52 | 15.7k | bool warned = false; |
53 | 15.7k | bool changed_pages = false; |
54 | 15.9k | while (pages.isDictionary() && pages.hasKey("/Parent")) { |
55 | 302 | if (!seen.add(pages)) { |
56 | | // loop -- will be detected again and reported later |
57 | 66 | break; |
58 | 66 | } |
59 | | // Files have been found in the wild where /Pages in the catalog points to the first |
60 | | // page. Try to work around this and similar cases with this heuristic. |
61 | 236 | if (!warned) { |
62 | 173 | root.warn( |
63 | 173 | "document page tree root (root -> /Pages) doesn't point" |
64 | 173 | " to the root of the page tree; attempting to correct"); |
65 | 173 | warned = true; |
66 | 173 | } |
67 | 236 | changed_pages = true; |
68 | 236 | pages = pages.getKey("/Parent"); |
69 | 236 | } |
70 | 15.7k | if (changed_pages) { |
71 | 172 | root.replaceKey("/Pages", pages); |
72 | 172 | } |
73 | 15.7k | seen.clear(); |
74 | 15.7k | if (!pages.hasKey("/Kids")) { |
75 | | // Ensure we actually found a /Pages object. |
76 | 142 | throw QPDFExc( |
77 | 142 | qpdf_e_pages, m->file->getName(), "", 0, "root of pages tree has no /Kids array"); |
78 | 142 | } |
79 | 15.6k | try { |
80 | 15.6k | m->pages.getAllPagesInternal(pages, visited, seen, false, false); |
81 | 15.6k | } catch (...) { |
82 | 102 | m->all_pages.clear(); |
83 | 102 | m->invalid_page_found = false; |
84 | 102 | throw; |
85 | 102 | } |
86 | 10.9k | if (m->invalid_page_found) { |
87 | 2.49k | m->pages.flattenPagesTree(); |
88 | 2.49k | m->invalid_page_found = false; |
89 | 2.49k | } |
90 | 10.9k | } |
91 | 21.8k | return m->all_pages; |
92 | 26.6k | } |
93 | | |
94 | | void |
95 | | Pages::getAllPagesInternal( |
96 | | QPDFObjectHandle cur_node, |
97 | | QPDFObjGen::set& visited, |
98 | | QPDFObjGen::set& seen, |
99 | | bool media_box, |
100 | | bool resources) |
101 | 12.4k | { |
102 | 12.4k | if (!visited.add(cur_node)) { |
103 | 49 | throw QPDFExc( |
104 | 49 | qpdf_e_pages, |
105 | 49 | m->file->getName(), |
106 | 49 | "object " + cur_node.getObjGen().unparse(' '), |
107 | 49 | 0, |
108 | 49 | "Loop detected in /Pages structure (getAllPages)"); |
109 | 49 | } |
110 | 12.3k | if (!cur_node.isDictionaryOfType("/Pages")) { |
111 | | // During fuzzing files were encountered where the root object appeared in the pages tree. |
112 | | // Unconditionally setting the /Type to /Pages could cause problems, but trying to |
113 | | // accommodate the possibility may be excessive. |
114 | 9.92k | cur_node.warn("/Type key should be /Pages but is not; overriding"); |
115 | 9.92k | cur_node.replaceKey("/Type", "/Pages"_qpdf); |
116 | 9.92k | } |
117 | 12.3k | if (!media_box) { |
118 | 12.1k | media_box = cur_node.getKey("/MediaBox").isRectangle(); |
119 | 12.1k | QTC::TC("qpdf", "QPDF inherit mediabox", media_box ? 0 : 1); |
120 | 12.1k | } |
121 | 12.3k | if (!resources) { |
122 | 12.1k | resources = cur_node.getKey("/Resources").isDictionary(); |
123 | 12.1k | } |
124 | 12.3k | auto kids = cur_node.getKey("/Kids"); |
125 | 12.3k | if (!visited.add(kids)) { |
126 | 8 | throw QPDFExc( |
127 | 8 | qpdf_e_pages, |
128 | 8 | m->file->getName(), |
129 | 8 | "object " + cur_node.getObjGen().unparse(' '), |
130 | 8 | 0, |
131 | 8 | "Loop detected in /Pages structure (getAllPages)"); |
132 | 8 | } |
133 | 12.3k | int i = -1; |
134 | 49.0k | for (auto& kid: kids.as_array()) { |
135 | 49.0k | ++i; |
136 | 49.0k | int errors = 0; |
137 | | |
138 | 49.0k | if (!kid.isDictionary()) { |
139 | 32.1k | kid.warn("Pages tree includes non-dictionary object; ignoring"); |
140 | 32.1k | m->invalid_page_found = true; |
141 | 32.1k | continue; |
142 | 32.1k | } |
143 | 16.8k | if (!kid.isIndirect()) { |
144 | 1.60k | cur_node.warn( |
145 | 1.60k | "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect"); |
146 | 1.60k | kid = qpdf.makeIndirectObject(kid); |
147 | 1.60k | ++errors; |
148 | 1.60k | } |
149 | 16.8k | if (kid.hasKey("/Kids")) { |
150 | 1.37k | getAllPagesInternal(kid, visited, seen, media_box, resources); |
151 | 15.4k | } else { |
152 | 15.4k | if (!media_box && !kid.getKey("/MediaBox").isRectangle()) { |
153 | 6.80k | kid.warn( |
154 | 6.80k | "kid " + std::to_string(i) + |
155 | 6.80k | " (from 0) MediaBox is undefined; setting to letter / ANSI A"); |
156 | 6.80k | kid.replaceKey( |
157 | 6.80k | "/MediaBox", |
158 | 6.80k | QPDFObjectHandle::newArray(QPDFObjectHandle::Rectangle(0, 0, 612, 792))); |
159 | 6.80k | ++errors; |
160 | 6.80k | } |
161 | 15.4k | if (!resources) { |
162 | 13.3k | auto res = kid.getKey("/Resources"); |
163 | | |
164 | 13.3k | if (!res.isDictionary()) { |
165 | 8.54k | ++errors; |
166 | 8.54k | kid.warn( |
167 | 8.54k | "kid " + std::to_string(i) + |
168 | 8.54k | " (from 0) Resources is missing or invalid; repairing"); |
169 | 8.54k | kid.replaceKey("/Resources", QPDFObjectHandle::newDictionary()); |
170 | 8.54k | } |
171 | 13.3k | } |
172 | 15.4k | auto annots = kid.getKey("/Annots"); |
173 | 15.4k | if (!annots.null()) { |
174 | 1.44k | if (!annots.isArray()) { |
175 | 44 | kid.warn( |
176 | 44 | "kid " + std::to_string(i) + " (from 0) Annots is not an array; removing"); |
177 | 44 | kid.removeKey("/Annots"); |
178 | 44 | ++errors; |
179 | 1.39k | } else { |
180 | 1.39k | QPDFObjGen::set seen_annots; |
181 | 42.2k | for (auto& annot: annots.as_array()) { |
182 | 42.2k | if (!seen_annots.add(annot)) { |
183 | 853 | kid.warn( |
184 | 853 | "kid " + std::to_string(i) + |
185 | 853 | " (from 0) Annots has duplicate entry for annotation " + |
186 | 853 | annot.id_gen().unparse(' ')); |
187 | 853 | ++errors; |
188 | 853 | } |
189 | 42.2k | } |
190 | 1.39k | } |
191 | 1.44k | } |
192 | | |
193 | 15.4k | if (!seen.add(kid)) { |
194 | | // Make a copy of the page. This does the same as shallowCopyPage in |
195 | | // QPDFPageObjectHelper. |
196 | 2.10k | if (!m->reconstructed_xref) { |
197 | 37 | cur_node.warn( |
198 | 37 | "kid " + std::to_string(i) + |
199 | 37 | " (from 0) appears more than once in the pages tree;" |
200 | 37 | " creating a new page object as a copy"); |
201 | | // This needs to be fixed. shallowCopy does not necessarily produce a valid |
202 | | // page. |
203 | 37 | kid = qpdf.makeIndirectObject(QPDFObjectHandle(kid).shallowCopy()); |
204 | 37 | seen.add(kid); |
205 | 2.07k | } else { |
206 | 2.07k | cur_node.warn( |
207 | 2.07k | "kid " + std::to_string(i) + |
208 | 2.07k | " (from 0) appears more than once in the pages tree; ignoring duplicate"); |
209 | 2.07k | m->invalid_page_found = true; |
210 | 2.07k | kid = QPDFObjectHandle::newNull(); |
211 | 2.07k | continue; |
212 | 2.07k | } |
213 | 37 | if (!kid.getKey("/Parent").isSameObjectAs(cur_node)) { |
214 | | // Consider fixing and adding an information message. |
215 | 17 | ++errors; |
216 | 17 | } |
217 | 37 | } |
218 | 13.3k | if (!kid.isDictionaryOfType("/Page")) { |
219 | 6.41k | kid.warn("/Type key should be /Page but is not; overriding"); |
220 | 6.41k | kid.replaceKey("/Type", "/Page"_qpdf); |
221 | 6.41k | ++errors; |
222 | 6.41k | } |
223 | 13.3k | if (m->reconstructed_xref && errors > 2) { |
224 | 777 | cur_node.warn( |
225 | 777 | "kid " + std::to_string(i) + " (from 0) has too many errors; ignoring page"); |
226 | 777 | m->invalid_page_found = true; |
227 | 777 | kid = QPDFObjectHandle::newNull(); |
228 | 777 | continue; |
229 | 777 | } |
230 | 12.6k | m->all_pages.emplace_back(kid); |
231 | 12.6k | } |
232 | 16.8k | } |
233 | 12.3k | } |
234 | | |
235 | | void |
236 | | QPDF::updateAllPagesCache() |
237 | 0 | { |
238 | | // Force regeneration of the pages cache. We force immediate recalculation of all_pages since |
239 | | // users may have references to it that they got from calls to getAllPages(). We can defer |
240 | | // recalculation of pageobj_to_pages_pos until needed. |
241 | 0 | m->all_pages.clear(); |
242 | 0 | m->pageobj_to_pages_pos.clear(); |
243 | 0 | m->pushed_inherited_attributes_to_pages = false; |
244 | 0 | getAllPages(); |
245 | 0 | } |
246 | | |
247 | | void |
248 | | Pages::flattenPagesTree() |
249 | 2.49k | { |
250 | | // If not already done, flatten the /Pages structure and initialize pageobj_to_pages_pos. |
251 | | |
252 | 2.49k | if (!m->pageobj_to_pages_pos.empty()) { |
253 | 0 | return; |
254 | 0 | } |
255 | | |
256 | | // Push inherited objects down to the /Page level. As a side effect m->all_pages will also be |
257 | | // generated. |
258 | 2.49k | pushInheritedAttributesToPage(true, true); |
259 | | |
260 | 2.49k | QPDFObjectHandle pages = qpdf.getRoot().getKey("/Pages"); |
261 | | |
262 | 2.49k | size_t const len = m->all_pages.size(); |
263 | 5.42k | for (size_t pos = 0; pos < len; ++pos) { |
264 | | // Populate pageobj_to_pages_pos and fix parent pointer. There should be no duplicates at |
265 | | // this point because pushInheritedAttributesToPage calls getAllPages which resolves |
266 | | // duplicates. |
267 | 2.93k | insertPageobjToPage(m->all_pages.at(pos), toI(pos), true); |
268 | 2.93k | m->all_pages.at(pos).replaceKey("/Parent", pages); |
269 | 2.93k | } |
270 | | |
271 | 2.49k | pages.replaceKey("/Kids", QPDFObjectHandle::newArray(m->all_pages)); |
272 | | // /Count has not changed |
273 | 2.49k | if (pages.getKey("/Count").getUIntValue() != len) { |
274 | 1.17k | if (m->invalid_page_found && pages.getKey("/Count").getUIntValue() > len) { |
275 | 1.12k | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(toI(len))); |
276 | 1.12k | } else { |
277 | 49 | throw std::runtime_error("/Count is wrong after flattening pages tree"); |
278 | 49 | } |
279 | 1.17k | } |
280 | 2.49k | } |
281 | | |
282 | | void |
283 | | Pages::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, bool check_duplicate) |
284 | 2.93k | { |
285 | 2.93k | QPDFObjGen og(obj.getObjGen()); |
286 | 2.93k | if (check_duplicate) { |
287 | 2.93k | if (!m->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second) { |
288 | | // The library never calls insertPageobjToPage in a way that causes this to happen. |
289 | 0 | throw QPDFExc( |
290 | 0 | qpdf_e_pages, |
291 | 0 | m->file->getName(), |
292 | 0 | "page " + std::to_string(pos) + " (numbered from zero): object " + og.unparse(' '), |
293 | 0 | 0, |
294 | 0 | "duplicate page reference found; this would cause loss of data"); |
295 | 0 | } |
296 | 2.93k | } else { |
297 | 0 | m->pageobj_to_pages_pos[og] = pos; |
298 | 0 | } |
299 | 2.93k | } |
300 | | |
301 | | void |
302 | | Pages::insertPage(QPDFObjectHandle newpage, int pos) |
303 | 0 | { |
304 | | // pos is numbered from 0, so pos = 0 inserts at the beginning and pos = npages adds to the end. |
305 | |
|
306 | 0 | flattenPagesTree(); |
307 | |
|
308 | 0 | if (!newpage.isIndirect()) { |
309 | 0 | newpage = qpdf.makeIndirectObject(newpage); |
310 | 0 | } else if (newpage.getOwningQPDF() != &qpdf) { |
311 | 0 | newpage.getQPDF().pushInheritedAttributesToPage(); |
312 | 0 | newpage = qpdf.copyForeignObject(newpage); |
313 | 0 | } else { |
314 | 0 | QTC::TC("qpdf", "QPDF insert indirect page"); |
315 | 0 | } |
316 | |
|
317 | 0 | if (pos < 0 || toS(pos) > m->all_pages.size()) { |
318 | 0 | throw std::runtime_error("QPDF::insertPage called with pos out of range"); |
319 | 0 | } |
320 | | |
321 | 0 | QTC::TC( |
322 | 0 | "qpdf", |
323 | 0 | "QPDF insert page", |
324 | 0 | (pos == 0) ? 0 : // insert at beginning |
325 | 0 | (pos == toI(m->all_pages.size())) ? 1 // at end |
326 | 0 | : 2); // insert in middle |
327 | |
|
328 | 0 | auto og = newpage.getObjGen(); |
329 | 0 | if (m->pageobj_to_pages_pos.contains(og)) { |
330 | 0 | newpage = qpdf.makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy()); |
331 | 0 | } |
332 | |
|
333 | 0 | QPDFObjectHandle pages = qpdf.getRoot().getKey("/Pages"); |
334 | 0 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
335 | |
|
336 | 0 | newpage.replaceKey("/Parent", pages); |
337 | 0 | kids.insertItem(pos, newpage); |
338 | 0 | int npages = static_cast<int>(kids.size()); |
339 | 0 | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); |
340 | 0 | m->all_pages.insert(m->all_pages.begin() + pos, newpage); |
341 | 0 | for (int i = pos + 1; i < npages; ++i) { |
342 | 0 | insertPageobjToPage(m->all_pages.at(toS(i)), i, false); |
343 | 0 | } |
344 | 0 | insertPageobjToPage(newpage, pos, true); |
345 | 0 | } |
346 | | |
347 | | void |
348 | | QPDF::removePage(QPDFObjectHandle page) |
349 | 0 | { |
350 | 0 | int pos = findPage(page); // also ensures flat /Pages |
351 | 0 | QTC::TC( |
352 | 0 | "qpdf", |
353 | 0 | "QPDF remove page", |
354 | 0 | (pos == 0) ? 0 : // remove at beginning |
355 | 0 | (pos == toI(m->all_pages.size() - 1)) ? 1 // end |
356 | 0 | : 2); // remove in middle |
357 | |
|
358 | 0 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
359 | 0 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
360 | |
|
361 | 0 | kids.eraseItem(pos); |
362 | 0 | int npages = static_cast<int>(kids.size()); |
363 | 0 | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); |
364 | 0 | m->all_pages.erase(m->all_pages.begin() + pos); |
365 | 0 | m->pageobj_to_pages_pos.erase(page.getObjGen()); |
366 | 0 | for (int i = pos; i < npages; ++i) { |
367 | 0 | m->pages.insertPageobjToPage(m->all_pages.at(toS(i)), i, false); |
368 | 0 | } |
369 | 0 | } |
370 | | |
371 | | void |
372 | | QPDF::addPageAt(QPDFObjectHandle newpage, bool before, QPDFObjectHandle refpage) |
373 | 0 | { |
374 | 0 | int refpos = findPage(refpage); |
375 | 0 | if (!before) { |
376 | 0 | ++refpos; |
377 | 0 | } |
378 | 0 | m->pages.insertPage(newpage, refpos); |
379 | 0 | } |
380 | | |
381 | | void |
382 | | QPDF::addPage(QPDFObjectHandle newpage, bool first) |
383 | 0 | { |
384 | 0 | if (first) { |
385 | 0 | m->pages.insertPage(newpage, 0); |
386 | 0 | } else { |
387 | 0 | m->pages.insertPage( |
388 | 0 | newpage, getRoot().getKey("/Pages").getKey("/Count").getIntValueAsInt()); |
389 | 0 | } |
390 | 0 | } |
391 | | |
392 | | int |
393 | | QPDF::findPage(QPDFObjectHandle& page) |
394 | 0 | { |
395 | 0 | return findPage(page.getObjGen()); |
396 | 0 | } |
397 | | |
398 | | int |
399 | | QPDF::findPage(QPDFObjGen og) |
400 | 0 | { |
401 | 0 | m->pages.flattenPagesTree(); |
402 | 0 | auto it = m->pageobj_to_pages_pos.find(og); |
403 | 0 | if (it == m->pageobj_to_pages_pos.end()) { |
404 | 0 | throw QPDFExc( |
405 | 0 | qpdf_e_pages, |
406 | 0 | m->file->getName(), |
407 | 0 | "page object: object " + og.unparse(' '), |
408 | 0 | 0, |
409 | 0 | "page object not referenced in /Pages tree"); |
410 | 0 | } |
411 | 0 | return (*it).second; |
412 | 0 | } |