/src/qpdf/libqpdf/QPDF_pages.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include <qpdf/QPDF_private.hh> |
2 | | |
3 | | #include <qpdf/QPDFExc.hh> |
4 | | #include <qpdf/QPDFObjectHandle_private.hh> |
5 | | #include <qpdf/QTC.hh> |
6 | | #include <qpdf/QUtil.hh> |
7 | | |
8 | | // In support of page manipulation APIs, these methods internally maintain state about pages in a |
9 | | // pair of data structures: all_pages, which is a vector of page objects, and pageobj_to_pages_pos, |
10 | | // which maps a page object to its position in the all_pages array. Unfortunately, the getAllPages() |
11 | | // method returns a const reference to all_pages and has been in the public API long before the |
12 | | // introduction of mutation APIs, so we're pretty much stuck with it. Anyway, there are lots of |
13 | | // calls to it in the library, so the efficiency of having it cached is probably worth keeping it. |
14 | | // At one point, I had partially implemented a helper class specifically for the pages tree, but |
15 | | // once you work in all the logic that handles repairing the /Type keys of page tree nodes (both |
16 | | // /Pages and /Page) and deal with duplicate pages, it's just as complex and less efficient than |
17 | | // what's here. So, in spite of the fact that a const reference is returned, the current code is |
18 | | // fine and does not need to be replaced. A partial implementation of QPDFPagesTree is in github in |
19 | | // attic in case there is ever a reason to resurrect it. There are additional notes in |
20 | | // README-maintainer, which also refers to this comment. |
21 | | |
22 | | // The goal of this code is to ensure that the all_pages vector, which users may have a reference |
23 | | // to, and the pageobj_to_pages_pos map, which users will not have access to, remain consistent |
24 | | // outside of any call to the library. As long as users only touch the /Pages structure through |
25 | | // page-specific API calls, they never have to worry about anything, and this will also stay |
26 | | // consistent. If a user touches anything about the /Pages structure outside of these calls (such |
27 | | // as by directly looking up and manipulating the underlying objects), they can call |
28 | | // updateAllPagesCache() to bring things back in sync. |
29 | | |
30 | | // If the user doesn't ever use the page manipulation APIs, then qpdf leaves the /Pages structure |
31 | | // alone. If the user does use the APIs, then we push all inheritable objects down and flatten the |
32 | | // /Pages tree. This makes it easier for us to keep /Pages, all_pages, and pageobj_to_pages_pos |
33 | | // internally consistent at all times. |
34 | | |
35 | | // Responsibility for keeping all_pages, pageobj_to_pages_pos, and the Pages structure consistent |
36 | | // should remain in as few places as possible. As of initial writing, only flattenPagesTree, |
37 | | // insertPage, and removePage, along with methods they call, are concerned with it. Everything else |
38 | | // goes through one of those methods. |
39 | | |
40 | | std::vector<QPDFObjectHandle> const& |
41 | | QPDF::getAllPages() |
42 | 178k | { |
43 | | // Note that pushInheritedAttributesToPage may also be used to initialize m->all_pages. |
44 | 178k | if (m->all_pages.empty() && !m->invalid_page_found) { |
45 | 53.8k | m->ever_called_get_all_pages = true; |
46 | 53.8k | auto root = getRoot(); |
47 | 53.8k | QPDFObjGen::set visited; |
48 | 53.8k | QPDFObjGen::set seen; |
49 | 53.8k | QPDFObjectHandle pages = root.getKey("/Pages"); |
50 | 53.8k | bool warned = false; |
51 | 53.8k | bool changed_pages = false; |
52 | 55.5k | while (pages.isDictionary() && pages.hasKey("/Parent")) { |
53 | 2.14k | if (!seen.add(pages)) { |
54 | | // loop -- will be detected again and reported later |
55 | 448 | break; |
56 | 448 | } |
57 | | // Files have been found in the wild where /Pages in the catalog points to the first |
58 | | // page. Try to work around this and similar cases with this heuristic. |
59 | 1.69k | if (!warned) { |
60 | 1.47k | root.warnIfPossible( |
61 | 1.47k | "document page tree root (root -> /Pages) doesn't point" |
62 | 1.47k | " to the root of the page tree; attempting to correct"); |
63 | 1.47k | warned = true; |
64 | 1.47k | } |
65 | 1.69k | changed_pages = true; |
66 | 1.69k | pages = pages.getKey("/Parent"); |
67 | 1.69k | } |
68 | 53.8k | if (changed_pages) { |
69 | 1.47k | root.replaceKey("/Pages", pages); |
70 | 1.47k | } |
71 | 53.8k | seen.clear(); |
72 | 53.8k | if (!pages.hasKey("/Kids")) { |
73 | | // Ensure we actually found a /Pages object. |
74 | 793 | throw QPDFExc( |
75 | 793 | qpdf_e_pages, m->file->getName(), "", 0, "root of pages tree has no /Kids array"); |
76 | 793 | } |
77 | 53.0k | try { |
78 | 53.0k | getAllPagesInternal(pages, visited, seen, false, false); |
79 | 53.0k | } catch (...) { |
80 | 285 | m->all_pages.clear(); |
81 | 285 | m->invalid_page_found = false; |
82 | 285 | throw; |
83 | 285 | } |
84 | 35.9k | if (m->invalid_page_found) { |
85 | 14.6k | flattenPagesTree(); |
86 | 14.6k | m->invalid_page_found = false; |
87 | 14.6k | } |
88 | 35.9k | } |
89 | 160k | return m->all_pages; |
90 | 178k | } |
91 | | |
92 | | void |
93 | | QPDF::getAllPagesInternal( |
94 | | QPDFObjectHandle cur_node, |
95 | | QPDFObjGen::set& visited, |
96 | | QPDFObjGen::set& seen, |
97 | | bool media_box, |
98 | | bool resources) |
99 | 38.9k | { |
100 | 38.9k | if (!visited.add(cur_node)) { |
101 | 142 | throw QPDFExc( |
102 | 142 | qpdf_e_pages, |
103 | 142 | m->file->getName(), |
104 | 142 | "object " + cur_node.getObjGen().unparse(' '), |
105 | 142 | 0, |
106 | 142 | "Loop detected in /Pages structure (getAllPages)"); |
107 | 142 | } |
108 | 38.8k | if (!cur_node.isDictionaryOfType("/Pages")) { |
109 | | // During fuzzing files were encountered where the root object appeared in the pages tree. |
110 | | // Unconditionally setting the /Type to /Pages could cause problems, but trying to |
111 | | // accommodate the possibility may be excessive. |
112 | 22.8k | cur_node.warnIfPossible("/Type key should be /Pages but is not; overriding"); |
113 | 22.8k | cur_node.replaceKey("/Type", "/Pages"_qpdf); |
114 | 22.8k | } |
115 | 38.8k | if (!media_box) { |
116 | 38.5k | media_box = cur_node.getKey("/MediaBox").isRectangle(); |
117 | 38.5k | QTC::TC("qpdf", "QPDF inherit mediabox", media_box ? 0 : 1); |
118 | 38.5k | } |
119 | 38.8k | if (!resources) { |
120 | 38.6k | resources = cur_node.getKey("/Resources").isDictionary(); |
121 | 38.6k | } |
122 | 38.8k | auto kids = cur_node.getKey("/Kids"); |
123 | 38.8k | if (!visited.add(kids)) { |
124 | 23 | throw QPDFExc( |
125 | 23 | qpdf_e_pages, |
126 | 23 | m->file->getName(), |
127 | 23 | "object " + cur_node.getObjGen().unparse(' '), |
128 | 23 | 0, |
129 | 23 | "Loop detected in /Pages structure (getAllPages)"); |
130 | 23 | } |
131 | 38.8k | int i = -1; |
132 | 268k | for (auto& kid: kids.as_array()) { |
133 | 268k | ++i; |
134 | 268k | int errors = 0; |
135 | | |
136 | 268k | if (!kid.isDictionary()) { |
137 | 193k | kid.warnIfPossible("Pages tree includes non-dictionary object; ignoring"); |
138 | 193k | m->invalid_page_found = true; |
139 | 193k | continue; |
140 | 193k | } |
141 | 75.6k | if (!kid.isIndirect()) { |
142 | 2.87k | QTC::TC("qpdf", "QPDF handle direct page object"); |
143 | 2.87k | cur_node.warnIfPossible( |
144 | 2.87k | "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect"); |
145 | 2.87k | kid = makeIndirectObject(kid); |
146 | 2.87k | ++errors; |
147 | 2.87k | } |
148 | 75.6k | if (kid.hasKey("/Kids")) { |
149 | 2.76k | getAllPagesInternal(kid, visited, seen, media_box, resources); |
150 | 72.9k | } else { |
151 | 72.9k | if (!media_box && !kid.getKey("/MediaBox").isRectangle()) { |
152 | 25.3k | QTC::TC("qpdf", "QPDF missing mediabox"); |
153 | 25.3k | kid.warnIfPossible( |
154 | 25.3k | "kid " + std::to_string(i) + |
155 | 25.3k | " (from 0) MediaBox is undefined; setting to letter / ANSI A"); |
156 | 25.3k | kid.replaceKey( |
157 | 25.3k | "/MediaBox", |
158 | 25.3k | QPDFObjectHandle::newArray(QPDFObjectHandle::Rectangle(0, 0, 612, 792))); |
159 | 25.3k | ++errors; |
160 | 25.3k | } |
161 | 72.9k | if (!resources && !kid.getKey("/Resources").isDictionary()) { |
162 | | // Consider adding an information message |
163 | 34.1k | ++errors; |
164 | 34.1k | } |
165 | 72.9k | if (!seen.add(kid)) { |
166 | | // Make a copy of the page. This does the same as shallowCopyPage in |
167 | | // QPDFPageObjectHelper. |
168 | 12.0k | QTC::TC("qpdf", "QPDF resolve duplicated page object"); |
169 | 12.0k | if (!m->reconstructed_xref) { |
170 | 2 | cur_node.warnIfPossible( |
171 | 2 | "kid " + std::to_string(i) + |
172 | 2 | " (from 0) appears more than once in the pages tree;" |
173 | 2 | " creating a new page object as a copy"); |
174 | | // This needs to be fixed. shallowCopy does not necessarily produce a valid |
175 | | // page. |
176 | 2 | kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy()); |
177 | 2 | seen.add(kid); |
178 | 12.0k | } else { |
179 | 12.0k | cur_node.warnIfPossible( |
180 | 12.0k | "kid " + std::to_string(i) + |
181 | 12.0k | " (from 0) appears more than once in the pages tree; ignoring duplicate"); |
182 | 12.0k | m->invalid_page_found = true; |
183 | 12.0k | kid = QPDFObjectHandle::newNull(); |
184 | 12.0k | continue; |
185 | 12.0k | } |
186 | 2 | if (!kid.getKey("/Parent").isSameObjectAs(cur_node)) { |
187 | | // Consider fixing and adding an information message. |
188 | 0 | ++errors; |
189 | 0 | } |
190 | 2 | } |
191 | 60.8k | if (!kid.isDictionaryOfType("/Page")) { |
192 | 20.5k | kid.warnIfPossible("/Type key should be /Page but is not; overriding"); |
193 | 20.5k | kid.replaceKey("/Type", "/Page"_qpdf); |
194 | 20.5k | ++errors; |
195 | 20.5k | } |
196 | 60.8k | if (m->reconstructed_xref && errors > 2) { |
197 | 6.52k | cur_node.warnIfPossible( |
198 | 6.52k | "kid " + std::to_string(i) + " (from 0) has too many errors; ignoring page"); |
199 | 6.52k | m->invalid_page_found = true; |
200 | 6.52k | kid = QPDFObjectHandle::newNull(); |
201 | 6.52k | continue; |
202 | 6.52k | } |
203 | 54.3k | m->all_pages.emplace_back(kid); |
204 | 54.3k | } |
205 | 75.6k | } |
206 | 38.8k | } |
207 | | |
208 | | void |
209 | | QPDF::updateAllPagesCache() |
210 | 0 | { |
211 | | // Force regeneration of the pages cache. We force immediate recalculation of all_pages since |
212 | | // users may have references to it that they got from calls to getAllPages(). We can defer |
213 | | // recalculation of pageobj_to_pages_pos until needed. |
214 | 0 | QTC::TC("qpdf", "QPDF updateAllPagesCache"); |
215 | 0 | m->all_pages.clear(); |
216 | 0 | m->pageobj_to_pages_pos.clear(); |
217 | 0 | m->pushed_inherited_attributes_to_pages = false; |
218 | 0 | getAllPages(); |
219 | 0 | } |
220 | | |
221 | | void |
222 | | QPDF::flattenPagesTree() |
223 | 14.6k | { |
224 | | // If not already done, flatten the /Pages structure and initialize pageobj_to_pages_pos. |
225 | | |
226 | 14.6k | if (!m->pageobj_to_pages_pos.empty()) { |
227 | 0 | return; |
228 | 0 | } |
229 | | |
230 | | // Push inherited objects down to the /Page level. As a side effect m->all_pages will also be |
231 | | // generated. |
232 | 14.6k | pushInheritedAttributesToPage(true, true); |
233 | | |
234 | 14.6k | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
235 | | |
236 | 14.6k | size_t const len = m->all_pages.size(); |
237 | 42.5k | for (size_t pos = 0; pos < len; ++pos) { |
238 | | // Populate pageobj_to_pages_pos and fix parent pointer. There should be no duplicates at |
239 | | // this point because pushInheritedAttributesToPage calls getAllPages which resolves |
240 | | // duplicates. |
241 | 27.9k | insertPageobjToPage(m->all_pages.at(pos), toI(pos), true); |
242 | 27.9k | m->all_pages.at(pos).replaceKey("/Parent", pages); |
243 | 27.9k | } |
244 | | |
245 | 14.6k | pages.replaceKey("/Kids", QPDFObjectHandle::newArray(m->all_pages)); |
246 | | // /Count has not changed |
247 | 14.6k | if (pages.getKey("/Count").getUIntValue() != len) { |
248 | 12.0k | if (m->invalid_page_found && pages.getKey("/Count").getUIntValue() > len) { |
249 | 11.9k | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(toI(len))); |
250 | 11.9k | } else { |
251 | 63 | throw std::runtime_error("/Count is wrong after flattening pages tree"); |
252 | 63 | } |
253 | 12.0k | } |
254 | 14.6k | } |
255 | | |
256 | | void |
257 | | QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, bool check_duplicate) |
258 | 27.9k | { |
259 | 27.9k | QPDFObjGen og(obj.getObjGen()); |
260 | 27.9k | if (check_duplicate) { |
261 | 27.9k | if (!m->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second) { |
262 | | // The library never calls insertPageobjToPage in a way that causes this to happen. |
263 | 0 | setLastObjectDescription("page " + std::to_string(pos) + " (numbered from zero)", og); |
264 | 0 | throw QPDFExc( |
265 | 0 | qpdf_e_pages, |
266 | 0 | m->file->getName(), |
267 | 0 | m->last_object_description, |
268 | 0 | 0, |
269 | 0 | "duplicate page reference found; this would cause loss of data"); |
270 | 0 | } |
271 | 27.9k | } else { |
272 | 0 | m->pageobj_to_pages_pos[og] = pos; |
273 | 0 | } |
274 | 27.9k | } |
275 | | |
276 | | void |
277 | | QPDF::insertPage(QPDFObjectHandle newpage, int pos) |
278 | 0 | { |
279 | | // pos is numbered from 0, so pos = 0 inserts at the beginning and pos = npages adds to the end. |
280 | |
|
281 | 0 | flattenPagesTree(); |
282 | |
|
283 | 0 | if (!newpage.isIndirect()) { |
284 | 0 | QTC::TC("qpdf", "QPDF insert non-indirect page"); |
285 | 0 | newpage = makeIndirectObject(newpage); |
286 | 0 | } else if (newpage.getOwningQPDF() != this) { |
287 | 0 | QTC::TC("qpdf", "QPDF insert foreign page"); |
288 | 0 | newpage.getQPDF().pushInheritedAttributesToPage(); |
289 | 0 | newpage = copyForeignObject(newpage); |
290 | 0 | } else { |
291 | 0 | QTC::TC("qpdf", "QPDF insert indirect page"); |
292 | 0 | } |
293 | |
|
294 | 0 | if ((pos < 0) || (toS(pos) > m->all_pages.size())) { |
295 | 0 | throw std::runtime_error("QPDF::insertPage called with pos out of range"); |
296 | 0 | } |
297 | | |
298 | 0 | QTC::TC( |
299 | 0 | "qpdf", |
300 | 0 | "QPDF insert page", |
301 | 0 | (pos == 0) ? 0 : // insert at beginning |
302 | 0 | (pos == toI(m->all_pages.size())) ? 1 // at end |
303 | 0 | : 2); // insert in middle |
304 | |
|
305 | 0 | auto og = newpage.getObjGen(); |
306 | 0 | if (m->pageobj_to_pages_pos.contains(og)) { |
307 | 0 | QTC::TC("qpdf", "QPDF resolve duplicated page in insert"); |
308 | 0 | newpage = makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy()); |
309 | 0 | } |
310 | |
|
311 | 0 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
312 | 0 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
313 | |
|
314 | 0 | newpage.replaceKey("/Parent", pages); |
315 | 0 | kids.insertItem(pos, newpage); |
316 | 0 | int npages = kids.getArrayNItems(); |
317 | 0 | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); |
318 | 0 | m->all_pages.insert(m->all_pages.begin() + pos, newpage); |
319 | 0 | for (int i = pos + 1; i < npages; ++i) { |
320 | 0 | insertPageobjToPage(m->all_pages.at(toS(i)), i, false); |
321 | 0 | } |
322 | 0 | insertPageobjToPage(newpage, pos, true); |
323 | 0 | } |
324 | | |
325 | | void |
326 | | QPDF::removePage(QPDFObjectHandle page) |
327 | 0 | { |
328 | 0 | int pos = findPage(page); // also ensures flat /Pages |
329 | 0 | QTC::TC( |
330 | 0 | "qpdf", |
331 | 0 | "QPDF remove page", |
332 | 0 | (pos == 0) ? 0 : // remove at beginning |
333 | 0 | (pos == toI(m->all_pages.size() - 1)) ? 1 // end |
334 | 0 | : 2); // remove in middle |
335 | |
|
336 | 0 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
337 | 0 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
338 | |
|
339 | 0 | kids.eraseItem(pos); |
340 | 0 | int npages = kids.getArrayNItems(); |
341 | 0 | pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages)); |
342 | 0 | m->all_pages.erase(m->all_pages.begin() + pos); |
343 | 0 | m->pageobj_to_pages_pos.erase(page.getObjGen()); |
344 | 0 | for (int i = pos; i < npages; ++i) { |
345 | 0 | insertPageobjToPage(m->all_pages.at(toS(i)), i, false); |
346 | 0 | } |
347 | 0 | } |
348 | | |
349 | | void |
350 | | QPDF::addPageAt(QPDFObjectHandle newpage, bool before, QPDFObjectHandle refpage) |
351 | 0 | { |
352 | 0 | int refpos = findPage(refpage); |
353 | 0 | if (!before) { |
354 | 0 | ++refpos; |
355 | 0 | } |
356 | 0 | insertPage(newpage, refpos); |
357 | 0 | } |
358 | | |
359 | | void |
360 | | QPDF::addPage(QPDFObjectHandle newpage, bool first) |
361 | 0 | { |
362 | 0 | if (first) { |
363 | 0 | insertPage(newpage, 0); |
364 | 0 | } else { |
365 | 0 | insertPage(newpage, getRoot().getKey("/Pages").getKey("/Count").getIntValueAsInt()); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | int |
370 | | QPDF::findPage(QPDFObjectHandle& page) |
371 | 0 | { |
372 | 0 | return findPage(page.getObjGen()); |
373 | 0 | } |
374 | | |
375 | | int |
376 | | QPDF::findPage(QPDFObjGen og) |
377 | 0 | { |
378 | 0 | flattenPagesTree(); |
379 | 0 | auto it = m->pageobj_to_pages_pos.find(og); |
380 | 0 | if (it == m->pageobj_to_pages_pos.end()) { |
381 | 0 | QTC::TC("qpdf", "QPDF_pages findPage not found"); |
382 | 0 | setLastObjectDescription("page object", og); |
383 | 0 | throw QPDFExc( |
384 | 0 | qpdf_e_pages, |
385 | 0 | m->file->getName(), |
386 | 0 | m->last_object_description, |
387 | 0 | 0, |
388 | 0 | "page object not referenced in /Pages tree"); |
389 | 0 | } |
390 | 0 | return (*it).second; |
391 | 0 | } |