/src/qpdf/include/qpdf/QPDFPageObjectHelper.hh
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2005-2021 Jay Berkenbilt |
2 | | // Copyright (c) 2022-2025 Jay Berkenbilt and Manfred Holger |
3 | | // |
4 | | // This file is part of qpdf. |
5 | | // |
6 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
7 | | // in compliance with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
12 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
13 | | // or implied. See the License for the specific language governing permissions and limitations under |
14 | | // the License. |
15 | | // |
16 | | // Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic |
17 | | // License. At your option, you may continue to consider qpdf to be licensed under those terms. |
18 | | // Please see the manual for additional information. |
19 | | |
20 | | #ifndef QPDFPAGEOBJECTHELPER_HH |
21 | | #define QPDFPAGEOBJECTHELPER_HH |
22 | | |
23 | | #include <qpdf/QPDFAnnotationObjectHelper.hh> |
24 | | #include <qpdf/QPDFMatrix.hh> |
25 | | #include <qpdf/QPDFObjectHelper.hh> |
26 | | |
27 | | #include <qpdf/DLL.h> |
28 | | |
29 | | #include <qpdf/QPDFObjectHandle.hh> |
30 | | #include <functional> |
31 | | |
32 | | class QPDFAcroFormDocumentHelper; |
33 | | |
34 | | // This is a helper class for page objects, but as of qpdf 10.1, many of the methods also work |
35 | | // for form XObjects. When this is the case, it is noted in the comment. |
36 | | class QPDFPageObjectHelper: public QPDFObjectHelper |
37 | | { |
38 | | public: |
39 | | QPDF_DLL |
40 | | QPDFPageObjectHelper(QPDFObjectHandle); |
41 | | |
42 | 0 | ~QPDFPageObjectHelper() override = default; |
43 | | |
44 | | // PAGE ATTRIBUTES |
45 | | |
46 | | // The getAttribute method works with pages and form XObjects. It returns the value of the |
47 | | // requested attribute from the page/form XObject's dictionary, taking inheritance from the |
48 | | // pages tree into consideration. For pages, the attributes /MediaBox, /CropBox, /Resources, and |
49 | | // /Rotate are inheritable, meaning that if they are not present directly on the page node, they |
50 | | // may be inherited from ancestor nodes in the pages tree. |
51 | | // |
52 | | // There are two ways that an attribute can be "shared": |
53 | | // |
54 | | // * For inheritable attributes on pages, it may appear in a higher level node of the pages tree |
55 | | // |
56 | | // * For any attribute, the attribute may be an indirect object which may be referenced by more |
57 | | // than one page/form XObject. |
58 | | // |
59 | | // If copy_if_shared is true, then this method will replace the attribute with a shallow copy if |
60 | | // it is indirect or inherited and return the copy. You should do this if you are going to |
61 | | // modify the returned object and want the modifications to apply to the current page/form |
62 | | // XObject only. |
63 | | QPDF_DLL |
64 | | QPDFObjectHandle getAttribute(std::string const& name, bool copy_if_shared); |
65 | | |
66 | | // PAGE BOXES |
67 | | // |
68 | | // Pages have various types of boundary boxes. These are described in detail in the PDF |
69 | | // specification (section 14.11.2 Page boundaries). They are, by key in the page dictionary: |
70 | | // |
71 | | // * /MediaBox -- boundaries of physical page |
72 | | // * /CropBox -- clipping region of what is displayed |
73 | | // * /BleedBox -- clipping region for production environments |
74 | | // * /TrimBox -- dimensions of final printed page after trimming |
75 | | // * /ArtBox -- extent of meaningful content including margins |
76 | | // |
77 | | // Of these, only /MediaBox is required. If any are absent, the |
78 | | // fallback value for /CropBox is /MediaBox, and the fallback |
79 | | // values for the other boxes are /CropBox. |
80 | | // |
81 | | // As noted above (PAGE ATTRIBUTES), /MediaBox and /CropBox can be inherited from parent nodes |
82 | | // in the pages tree. The other boxes can't be inherited. |
83 | | // |
84 | | // When the comments below refer to the "effective value" of a box, this takes into |
85 | | // consideration both inheritance through the pages tree (in the case of /MediaBox and /CropBox) |
86 | | // and fallback values for missing attributes (for all except /MediaBox). |
87 | | // |
88 | | // For the methods below, copy_if_shared is passed to getAttribute and therefore refers only to |
89 | | // indirect objects and values that are inherited through the pages tree. |
90 | | // |
91 | | // If copy_if_fallback is true, a copy is made if the object's value was obtained by falling |
92 | | // back to a different box. |
93 | | // |
94 | | // The copy_if_shared and copy_if_fallback parameters carry across multiple layers. This is |
95 | | // explained below. |
96 | | // |
97 | | // You should set copy_if_shared to true if you want to modify a bounding box for the current |
98 | | // page without affecting other pages but you don't want to change the fallback behavior. For |
99 | | // example, if you want to modify the /TrimBox for the current page only but have it continue to |
100 | | // fall back to the value of /CropBox or /MediaBox if they are not defined, you could set |
101 | | // copy_if_shared to true. |
102 | | // |
103 | | // You should set copy_if_fallback to true if you want to modify a specific box as distinct from |
104 | | // any other box. For example, if you want to make /TrimBox differ from /CropBox, then you |
105 | | // should set copy_if_fallback to true. |
106 | | // |
107 | | // The copy_if_fallback flags were added in qpdf 11. |
108 | | // |
109 | | // For example, suppose that neither /CropBox nor /TrimBox is present on a page but /CropBox is |
110 | | // present in the page's parent node in the page tree. |
111 | | // |
112 | | // * getTrimBox(false, false) would return the /CropBox from the parent node. |
113 | | // |
114 | | // * getTrimBox(true, false) would make a shallow copy of the /CropBox from the parent node into |
115 | | // the current node and return it. |
116 | | // |
117 | | // * getTrimBox(false, true) would make a shallow copy of the /CropBox from the parent node into |
118 | | // /TrimBox of the current node and return it. |
119 | | // |
120 | | // * getTrimBox(true, true) would make a shallow copy of the /CropBox from the parent node into |
121 | | // the current node, then make a shallow copy of the resulting copy to /TrimBox of the current |
122 | | // node, and then return that. |
123 | | // |
124 | | // To illustrate how these parameters carry across multiple layers, suppose that neither |
125 | | // /MediaBox, /CropBox, nor /TrimBox is present on a page but /MediaBox is present on the |
126 | | // parent. In this case: |
127 | | // |
128 | | // * getTrimBox(false, false) would return the value of /MediaBox from the parent node. |
129 | | // |
130 | | // * getTrimBox(true, false) would copy /MediaBox to the current node and return it. |
131 | | // |
132 | | // * getTrimBox(false, true) would first copy /MediaBox from the parent to /CropBox, then copy |
133 | | // /CropBox to /TrimBox, and then return the result. |
134 | | // |
135 | | // * getTrimBox(true, true) would first copy /MediaBox from the parent to the current page, then |
136 | | // copy it to /CropBox, then copy /CropBox to /TrimBox, and then return the result. |
137 | | // |
138 | | // If you need different behavior, call getAttribute directly and take care of your own copying. |
139 | | |
140 | | // Return the effective MediaBox |
141 | | QPDF_DLL |
142 | | QPDFObjectHandle getMediaBox(bool copy_if_shared = false); |
143 | | |
144 | | // Return the effective CropBox. If not defined, fall back to MediaBox |
145 | | QPDF_DLL |
146 | | QPDFObjectHandle getCropBox(bool copy_if_shared = false, bool copy_if_fallback = false); |
147 | | |
148 | | // Return the effective BleedBox. If not defined, fall back to CropBox. |
149 | | QPDF_DLL |
150 | | QPDFObjectHandle getBleedBox(bool copy_if_shared = false, bool copy_if_fallback = false); |
151 | | |
152 | | // Return the effective TrimBox. If not defined, fall back to CropBox. |
153 | | QPDF_DLL |
154 | | QPDFObjectHandle getTrimBox(bool copy_if_shared = false, bool copy_if_fallback = false); |
155 | | |
156 | | // Return the effective ArtBox. If not defined, fall back to CropBox. |
157 | | QPDF_DLL |
158 | | QPDFObjectHandle getArtBox(bool copy_if_shared = false, bool copy_if_fallback = false); |
159 | | |
160 | | // Iterate through XObjects, possibly recursing into form XObjects. This works with pages or |
161 | | // form XObjects. Call action on each XObject for which selector, if specified, returns true. |
162 | | // With no selector, calls action for every object. In addition to the object being passed to |
163 | | // action, the containing XObject dictionary and key are passed in. Remember that the XObject |
164 | | // dictionary may be shared, and the object may appear in multiple XObject dictionaries. |
165 | | QPDF_DLL |
166 | | void forEachXObject( |
167 | | bool recursive, |
168 | | std::function<void( |
169 | | QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action, |
170 | | std::function<bool(QPDFObjectHandle)> selector = nullptr); |
171 | | // Only call action for images |
172 | | QPDF_DLL |
173 | | void forEachImage( |
174 | | bool recursive, |
175 | | std::function<void( |
176 | | QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action); |
177 | | // Only call action for form XObjects |
178 | | QPDF_DLL |
179 | | void forEachFormXObject( |
180 | | bool recursive, |
181 | | std::function<void( |
182 | | QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action); |
183 | | |
184 | | // Returns an empty map if there are no images or no resources. Prior to qpdf 8.4.0, this |
185 | | // function did not support inherited resources, but it does now. Return value is a map from |
186 | | // XObject name to the image object, which is always a stream. Works with form XObjects as well |
187 | | // as pages. This method does not recurse into nested form XObjects. For that, use forEachImage. |
188 | | QPDF_DLL |
189 | | std::map<std::string, QPDFObjectHandle> getImages(); |
190 | | |
191 | | // Old name -- calls getImages() |
192 | | QPDF_DLL |
193 | | std::map<std::string, QPDFObjectHandle> getPageImages(); |
194 | | |
195 | | // Returns an empty map if there are no form XObjects or no resources. Otherwise, returns a map |
196 | | // of keys to form XObjects directly referenced from this page or form XObject. This does not |
197 | | // recurse into nested form XObjects. For that, use forEachFormXObject. |
198 | | QPDF_DLL |
199 | | std::map<std::string, QPDFObjectHandle> getFormXObjects(); |
200 | | |
201 | | // Converts each inline image to an external (normal) image if the size is at least the |
202 | | // specified number of bytes. This method works with pages or form XObjects. By default, it |
203 | | // recursively processes nested form XObjects. Pass true as shallow to avoid this behavior. |
204 | | // Prior to qpdf 10.1, form XObjects were ignored, but this was considered a bug. |
205 | | QPDF_DLL |
206 | | void externalizeInlineImages(size_t min_size = 0, bool shallow = false); |
207 | | |
208 | | // Return the annotations in the page's "/Annots" list, if any. If only_subtype is non-empty, |
209 | | // only include annotations of the given subtype. |
210 | | QPDF_DLL |
211 | | std::vector<QPDFAnnotationObjectHelper> getAnnotations(std::string const& only_subtype = ""); |
212 | | |
213 | | // Returns a vector of stream objects representing the content streams for the given page. This |
214 | | // routine allows the caller to not care whether there are one or more than one content streams |
215 | | // for a page. |
216 | | QPDF_DLL |
217 | | std::vector<QPDFObjectHandle> getPageContents(); |
218 | | |
219 | | // Add the given object as a new content stream for this page. If parameter 'first' is true, add |
220 | | // to the beginning. Otherwise, add to the end. This routine automatically converts the page |
221 | | // contents to an array if it is a scalar, allowing the caller not to care what the initial |
222 | | // structure is. You can call coalesceContentStreams() afterwards if you want to force it to be |
223 | | // a single stream. |
224 | | QPDF_DLL |
225 | | void addPageContents(QPDFObjectHandle contents, bool first); |
226 | | |
227 | | // Rotate a page. If relative is false, set the rotation of the page to angle. Otherwise, add |
228 | | // angle to the rotation of the page. Angle must be a multiple of 90. Adding 90 to the rotation |
229 | | // rotates clockwise by 90 degrees. |
230 | | QPDF_DLL |
231 | | void rotatePage(int angle, bool relative); |
232 | | |
233 | | // Coalesce a page's content streams. A page's content may be a stream or an array of streams. |
234 | | // If this page's content is an array, concatenate the streams into a single stream. This can be |
235 | | // useful when working with files that split content streams in arbitrary spots, such as in the |
236 | | // middle of a token, as that can confuse some software. You could also call this after calling |
237 | | // addPageContents. |
238 | | QPDF_DLL |
239 | | void coalesceContentStreams(); |
240 | | |
241 | | // |
242 | | // Content stream handling |
243 | | // |
244 | | |
245 | | // Parse a page's contents through QPDFObjectHandle::ParserCallbacks (see QPDFObjectHandle.hh). This method works whether |
246 | | // the contents are a single stream or an array of streams. Call on a page object. Also works |
247 | | // for form XObjects. |
248 | | QPDF_DLL |
249 | | void parseContents(QPDFObjectHandle::ParserCallbacks* callbacks); |
250 | | // Old name |
251 | | QPDF_DLL |
252 | | void parsePageContents(QPDFObjectHandle::ParserCallbacks* callbacks); |
253 | | |
254 | | // Pass a page's or form XObject's contents through the given TokenFilter. If a pipeline is also |
255 | | // provided, it will be the target of the write methods from the token filter. If a pipeline is |
256 | | // not specified, any output generated by the token filter will be discarded. Use this interface |
257 | | // if you need to pass a page's contents through a filter for work purposes without having that |
258 | | // filter automatically applied to the page's contents, as happens with addContentTokenFilter. |
259 | | // See examples/pdf-count-strings.cc for an example. |
260 | | QPDF_DLL |
261 | | void filterContents(QPDFObjectHandle::TokenFilter* filter, Pipeline* next = nullptr); |
262 | | |
263 | | // Old name -- calls filterContents() |
264 | | QPDF_DLL |
265 | | void filterPageContents(QPDFObjectHandle::TokenFilter* filter, Pipeline* next = nullptr); |
266 | | |
267 | | // Pipe a page's contents through the given pipeline. This method works whether the contents are |
268 | | // a single stream or an array of streams. Also works on form XObjects. |
269 | | QPDF_DLL |
270 | | void pipeContents(Pipeline* p); |
271 | | // Old name |
272 | | QPDF_DLL |
273 | | void pipePageContents(Pipeline* p); |
274 | | |
275 | | // Attach a token filter to a page's contents. If the page's contents is an array of streams, it |
276 | | // is automatically coalesced. The token filter is applied to the page's contents as a single |
277 | | // stream. Also works on form XObjects. |
278 | | QPDF_DLL |
279 | | void addContentTokenFilter(std::shared_ptr<QPDFObjectHandle::TokenFilter> token_filter); |
280 | | |
281 | | // A page's resources dictionary maps names to objects elsewhere in the file. This method walks |
282 | | // through a page's contents and keeps track of which resources are referenced somewhere in the |
283 | | // contents. Then it removes from the resources dictionary any object that is not referenced in |
284 | | // the contents. This operation is most useful after calling |
285 | | // QPDFPageDocumentHelper::pushInheritedAttributesToPage(). This method is used by page |
286 | | // splitting code to avoid copying unused objects in files that used shared resource |
287 | | // dictionaries across multiple pages. This method recurses into form XObjects and can be called |
288 | | // with a form XObject as well as a page. |
289 | | QPDF_DLL |
290 | | void removeUnreferencedResources(); |
291 | | |
292 | | // Return a new QPDFPageObjectHelper that is a duplicate of the page. The returned object is an |
293 | | // indirect object that is ready to be inserted into the same or a different QPDF object using |
294 | | // any of the addPage methods in QPDFPageDocumentHelper or QPDF. Without calling one of those |
295 | | // methods, the page will not be added anywhere. The new page object shares all content streams |
296 | | // and indirect object resources with the original page, so if you are going to modify the |
297 | | // contents or other aspects of the page, you will need to handle copying of the component |
298 | | // parts separately. |
299 | | QPDF_DLL |
300 | | QPDFPageObjectHelper shallowCopyPage(); |
301 | | |
302 | | // Return a transformation matrix whose effect is the same as the page's /Rotate and /UserUnit |
303 | | // parameters. If invert is true, return a matrix whose effect is the opposite. The regular |
304 | | // matrix is suitable for taking something from this page to put elsewhere, and the second one |
305 | | // is suitable for putting something else onto this page. The page's TrimBox is used as the |
306 | | // bounding box for purposes of computing the matrix. |
307 | | QPDF_DLL |
308 | | QPDFObjectHandle::Matrix getMatrixForTransformations(bool invert = false); |
309 | | |
310 | | // Return a form XObject that draws this page. This is useful for n-up operations, underlay, |
311 | | // overlay, thumbnail generation, or any other case in which it is useful to replicate the |
312 | | // contents of a page in some other context. The dictionaries are shallow copies of the original |
313 | | // page dictionary, and the contents are coalesced from the page's contents. The resulting |
314 | | // object handle is not referenced anywhere. If handle_transformations is true, the resulting |
315 | | // form XObject's /Matrix will be set to replicate rotation (/Rotate) and scaling (/UserUnit) in |
316 | | // the page's dictionary. In this way, the page's transformations will be preserved when placing |
317 | | // this object on another page. |
318 | | QPDF_DLL |
319 | | QPDFObjectHandle getFormXObjectForPage(bool handle_transformations = true); |
320 | | |
321 | | // Return content stream text that will place the given form XObject (fo) using the resource |
322 | | // name "name" on this page centered within the given rectangle. If invert_transformations is |
323 | | // true, the effect of any rotation (/Rotate) and scaling (/UserUnit) applied to the current |
324 | | // page will be inverted in the form XObject placement. This will cause the form XObject's |
325 | | // absolute orientation to be preserved. You could overlay one page on another by calling |
326 | | // getFormXObjectForPage on the original page, QPDFObjectHandle::getUniqueResourceName on the |
327 | | // destination page's Resources dictionary to generate a name for the resulting object, and |
328 | | // calling placeFormXObject on the destination page. Then insert the new fo (or, if it comes |
329 | | // from a different file, the result of calling copyForeignObject on it) into the resources |
330 | | // dictionary using name, and append or prepend the content to the page's content streams. See |
331 | | // the overlay/underlay code in qpdf.cc or examples/pdf-overlay-page.cc for an example. From |
332 | | // qpdf 10.0.0, the allow_shrink and allow_expand parameters control whether the form XObject is |
333 | | // allowed to be shrunk or expanded to stay within or maximally fill the destination rectangle. |
334 | | // The default values are for backward compatibility with the pre-10.0.0 behavior. |
335 | | QPDF_DLL |
336 | | std::string placeFormXObject( |
337 | | QPDFObjectHandle fo, |
338 | | std::string const& name, |
339 | | QPDFObjectHandle::Rectangle rect, |
340 | | bool invert_transformations = true, |
341 | | bool allow_shrink = true, |
342 | | bool allow_expand = false); |
343 | | |
344 | | // Alternative version that also fills in the transformation matrix that was used. |
345 | | QPDF_DLL |
346 | | std::string placeFormXObject( |
347 | | QPDFObjectHandle fo, |
348 | | std::string const& name, |
349 | | QPDFObjectHandle::Rectangle rect, |
350 | | QPDFMatrix& cm, |
351 | | bool invert_transformations = true, |
352 | | bool allow_shrink = true, |
353 | | bool allow_expand = false); |
354 | | |
355 | | // Return the transformation matrix that translates from the given form XObject's coordinate |
356 | | // system into the given rectangular region on the page. The parameters have the same meaning as |
357 | | // for placeFormXObject. |
358 | | QPDF_DLL |
359 | | QPDFMatrix getMatrixForFormXObjectPlacement( |
360 | | QPDFObjectHandle fo, |
361 | | QPDFObjectHandle::Rectangle rect, |
362 | | bool invert_transformations = true, |
363 | | bool allow_shrink = true, |
364 | | bool allow_expand = false); |
365 | | |
366 | | // If a page is rotated using /Rotate in the page's dictionary, instead rotate the page by the |
367 | | // same amount by altering the contents and removing the /Rotate key. This method adjusts the |
368 | | // various page bounding boxes (/MediaBox, etc.) so that the page will have the same semantics. |
369 | | // This can be useful to work around problems with PDF applications that can't properly handle |
370 | | // rotated pages. If a QPDFAcroFormDocumentHelper is provided, it will be used for resolving any |
371 | | // form fields that have to be rotated. If not, one will be created inside the function, which |
372 | | // is less efficient. |
373 | | QPDF_DLL |
374 | | void flattenRotation(QPDFAcroFormDocumentHelper* afdh = nullptr); |
375 | | |
376 | | // Copy annotations from another page into this page. The other page may be from the same QPDF |
377 | | // or from a different QPDF. Each annotation's rectangle is transformed by the given matrix. If |
378 | | // the annotation is a widget annotation that is associated with a form field, the form field is |
379 | | // copied into this document's AcroForm dictionary as well. You can use this to copy annotations |
380 | | // from a page that was converted to a form XObject and added to another page. For an example of |
381 | | // this, see examples/pdf-overlay-page.cc. This method calls |
382 | | // QPDFAcroFormDocumentHelper::transformAnnotations, which will copy annotations and form fields |
383 | | // so that you can copy annotations from a source page to any number of other pages, even with |
384 | | // different matrices, and maintain independence from the original annotations. See also |
385 | | // QPDFAcroFormDocumentHelper::fixCopiedAnnotations, which can be used if you copy a page and |
386 | | // want to repair the annotations on the destination page to make them independent from the |
387 | | // original page's annotations. |
388 | | // |
389 | | // If you pass in a QPDFAcroFormDocumentHelper*, the method will use that instead of creating |
390 | | // one in the function. Creating QPDFAcroFormDocumentHelper objects is expensive, so if you're |
391 | | // doing a lot of copying, it can be more efficient to create these outside and pass them in. |
392 | | QPDF_DLL |
393 | | void copyAnnotations( |
394 | | QPDFPageObjectHelper from_page, |
395 | | QPDFMatrix const& cm = QPDFMatrix(), |
396 | | QPDFAcroFormDocumentHelper* afdh = nullptr, |
397 | | QPDFAcroFormDocumentHelper* from_afdh = nullptr); |
398 | | |
399 | | private: |
400 | | QPDFObjectHandle getAttribute( |
401 | | std::string const& name, |
402 | | bool copy_if_shared, |
403 | | std::function<QPDFObjectHandle()> get_fallback, |
404 | | bool copy_if_fallback); |
405 | | static bool |
406 | | removeUnreferencedResourcesHelper(QPDFPageObjectHelper ph, std::set<std::string>& unresolved); |
407 | | |
408 | | class Members |
409 | | { |
410 | | friend class QPDFPageObjectHelper; |
411 | | |
412 | | public: |
413 | | ~Members() = default; |
414 | | |
415 | | private: |
416 | | Members() = default; |
417 | | Members(Members const&) = delete; |
418 | | }; |
419 | | |
420 | | std::shared_ptr<Members> m; |
421 | | }; |
422 | | |
423 | | #endif // QPDFPAGEOBJECTHELPER_HH |