Coverage Report

Created: 2025-07-18 07:00

/src/qpdf/include/qpdf/QPDFPageObjectHelper.hh
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2005-2021 Jay Berkenbilt
2
// Copyright (c) 2022-2025 Jay Berkenbilt and Manfred Holger
3
//
4
// This file is part of qpdf.
5
//
6
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
7
// in compliance with the License. You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software distributed under the License
12
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13
// or implied. See the License for the specific language governing permissions and limitations under
14
// the License.
15
//
16
// Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic
17
// License. At your option, you may continue to consider qpdf to be licensed under those terms.
18
// Please see the manual for additional information.
19
20
#ifndef QPDFPAGEOBJECTHELPER_HH
21
#define QPDFPAGEOBJECTHELPER_HH
22
23
#include <qpdf/QPDFAnnotationObjectHelper.hh>
24
#include <qpdf/QPDFMatrix.hh>
25
#include <qpdf/QPDFObjectHelper.hh>
26
27
#include <qpdf/DLL.h>
28
29
#include <qpdf/QPDFObjectHandle.hh>
30
#include <functional>
31
32
class QPDFAcroFormDocumentHelper;
33
34
// This is a helper class for page objects, but as of qpdf 10.1, many of the methods also work
35
// for form XObjects. When this is the case, it is noted in the comment.
36
class QPDFPageObjectHelper: public QPDFObjectHelper
37
{
38
  public:
39
    QPDF_DLL
40
    QPDFPageObjectHelper(QPDFObjectHandle);
41
42
0
    ~QPDFPageObjectHelper() override = default;
43
44
    // PAGE ATTRIBUTES
45
46
    // The getAttribute method works with pages and form XObjects. It returns the value of the
47
    // requested attribute from the page/form XObject's dictionary, taking inheritance from the
48
    // pages tree into consideration. For pages, the attributes /MediaBox, /CropBox, /Resources, and
49
    // /Rotate are inheritable, meaning that if they are not present directly on the page node, they
50
    // may be inherited from ancestor nodes in the pages tree.
51
    //
52
    // There are two ways that an attribute can be "shared":
53
    //
54
    // * For inheritable attributes on pages, it may appear in a higher level node of the pages tree
55
    //
56
    // * For any attribute, the attribute may be an indirect object which may be referenced by more
57
    // than one page/form XObject.
58
    //
59
    // If copy_if_shared is true, then this method will replace the attribute with a shallow copy if
60
    // it is indirect or inherited and return the copy. You should do this if you are going to
61
    // modify the returned object and want the modifications to apply to the current page/form
62
    // XObject only.
63
    QPDF_DLL
64
    QPDFObjectHandle getAttribute(std::string const& name, bool copy_if_shared);
65
66
    // PAGE BOXES
67
    //
68
    // Pages have various types of boundary boxes. These are described in detail in the PDF
69
    // specification (section 14.11.2 Page boundaries). They are, by key in the page dictionary:
70
    //
71
    // * /MediaBox -- boundaries of physical page
72
    // * /CropBox -- clipping region of what is displayed
73
    // * /BleedBox -- clipping region for production environments
74
    // * /TrimBox -- dimensions of final printed page after trimming
75
    // * /ArtBox -- extent of meaningful content including margins
76
    //
77
    // Of these, only /MediaBox is required. If any are absent, the
78
    // fallback value for /CropBox is /MediaBox, and the fallback
79
    // values for the other boxes are /CropBox.
80
    //
81
    // As noted above (PAGE ATTRIBUTES), /MediaBox and /CropBox can be inherited from parent nodes
82
    // in the pages tree. The other boxes can't be inherited.
83
    //
84
    // When the comments below refer to the "effective value" of a box, this takes into
85
    // consideration both inheritance through the pages tree (in the case of /MediaBox and /CropBox)
86
    // and fallback values for missing attributes (for all except /MediaBox).
87
    //
88
    // For the methods below, copy_if_shared is passed to getAttribute and therefore refers only to
89
    // indirect objects and values that are inherited through the pages tree.
90
    //
91
    // If copy_if_fallback is true, a copy is made if the object's value was obtained by falling
92
    // back to a different box.
93
    //
94
    // The copy_if_shared and copy_if_fallback parameters carry across multiple layers. This is
95
    // explained below.
96
    //
97
    // You should set copy_if_shared to true if you want to modify a bounding box for the current
98
    // page without affecting other pages but you don't want to change the fallback behavior. For
99
    // example, if you want to modify the /TrimBox for the current page only but have it continue to
100
    // fall back to the value of /CropBox or /MediaBox if they are not defined, you could set
101
    // copy_if_shared to true.
102
    //
103
    // You should set copy_if_fallback to true if you want to modify a specific box as distinct from
104
    // any other box. For example, if you want to make /TrimBox differ from /CropBox, then you
105
    // should set copy_if_fallback to true.
106
    //
107
    // The copy_if_fallback flags were added in qpdf 11.
108
    //
109
    // For example, suppose that neither /CropBox nor /TrimBox is present on a page but /CropBox is
110
    // present in the page's parent node in the page tree.
111
    //
112
    // * getTrimBox(false, false) would return the /CropBox from the parent node.
113
    //
114
    // * getTrimBox(true, false) would make a shallow copy of the /CropBox from the parent node into
115
    //   the current node and return it.
116
    //
117
    // * getTrimBox(false, true) would make a shallow copy of the /CropBox from the parent node into
118
    //   /TrimBox of the current node and return it.
119
    //
120
    // * getTrimBox(true, true) would make a shallow copy of the /CropBox from the parent node into
121
    //   the current node, then make a shallow copy of the resulting copy to /TrimBox of the current
122
    //   node, and then return that.
123
    //
124
    // To illustrate how these parameters carry across multiple layers, suppose that neither
125
    // /MediaBox, /CropBox, nor /TrimBox is present on a page but /MediaBox is present on the
126
    // parent. In this case:
127
    //
128
    // * getTrimBox(false, false) would return the value of /MediaBox from the parent node.
129
    //
130
    // * getTrimBox(true, false) would copy /MediaBox to the current node and return it.
131
    //
132
    // * getTrimBox(false, true) would first copy /MediaBox from the parent to /CropBox, then copy
133
    //   /CropBox to /TrimBox, and then return the result.
134
    //
135
    // * getTrimBox(true, true) would first copy /MediaBox from the parent to the current page, then
136
    //   copy it to /CropBox, then copy /CropBox to /TrimBox, and then return the result.
137
    //
138
    // If you need different behavior, call getAttribute directly and take care of your own copying.
139
140
    // Return the effective MediaBox
141
    QPDF_DLL
142
    QPDFObjectHandle getMediaBox(bool copy_if_shared = false);
143
144
    // Return the effective CropBox. If not defined, fall back to MediaBox.
145
    QPDF_DLL
146
    QPDFObjectHandle getCropBox(bool copy_if_shared = false, bool copy_if_fallback = false);
147
148
    // Return the effective BleedBox. If not defined, fall back to CropBox.
149
    QPDF_DLL
150
    QPDFObjectHandle getBleedBox(bool copy_if_shared = false, bool copy_if_fallback = false);
151
152
    // Return the effective TrimBox. If not defined, fall back to CropBox.
153
    QPDF_DLL
154
    QPDFObjectHandle getTrimBox(bool copy_if_shared = false, bool copy_if_fallback = false);
155
156
    // Return the effective ArtBox. If not defined, fall back to CropBox.
157
    QPDF_DLL
158
    QPDFObjectHandle getArtBox(bool copy_if_shared = false, bool copy_if_fallback = false);
159
160
    // Iterate through XObjects, possibly recursing into form XObjects. This works with pages or
161
    // form XObjects. Call action on each XObject for which selector, if specified, returns true.
162
    // With no selector, calls action for every object. In addition to the object being passed to
163
    // action, the containing XObject dictionary and key are passed in. Remember that the XObject
164
    // dictionary may be shared, and the object may appear in multiple XObject dictionaries.
165
    QPDF_DLL
166
    void forEachXObject(
167
        bool recursive,
168
        std::function<void(
169
            QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action,
170
        std::function<bool(QPDFObjectHandle)> selector = nullptr);
171
    // Only call action for images
172
    QPDF_DLL
173
    void forEachImage(
174
        bool recursive,
175
        std::function<void(
176
            QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action);
177
    // Only call action for form XObjects
178
    QPDF_DLL
179
    void forEachFormXObject(
180
        bool recursive,
181
        std::function<void(
182
            QPDFObjectHandle& obj, QPDFObjectHandle& xobj_dict, std::string const& key)> action);
183
184
    // Returns an empty map if there are no images or no resources. Prior to qpdf 8.4.0, this
185
    // function did not support inherited resources, but it does now. Return value is a map from
186
    // XObject name to the image object, which is always a stream. Works with form XObjects as well
187
    // as pages. This method does not recurse into nested form XObjects. For that, use forEachImage.
188
    QPDF_DLL
189
    std::map<std::string, QPDFObjectHandle> getImages();
190
191
    // Old name -- calls getImages()
192
    QPDF_DLL
193
    std::map<std::string, QPDFObjectHandle> getPageImages();
194
195
    // Returns an empty map if there are no form XObjects or no resources. Otherwise, returns a map
196
    // of keys to form XObjects directly referenced from this page or form XObject. This does not
197
    // recurse into nested form XObjects. For that, use forEachFormXObject.
198
    QPDF_DLL
199
    std::map<std::string, QPDFObjectHandle> getFormXObjects();
200
201
    // Converts each inline image to an external (normal) image if the size is at least the
202
    // specified number of bytes. This method works with pages or form XObjects. By default, it
203
    // recursively processes nested form XObjects. Pass true as shallow to avoid this behavior.
204
    // Prior to qpdf 10.1, form XObjects were ignored, but this was considered a bug.
205
    QPDF_DLL
206
    void externalizeInlineImages(size_t min_size = 0, bool shallow = false);
207
208
    // Return the annotations in the page's "/Annots" list, if any. If only_subtype is non-empty,
209
    // only include annotations of the given subtype.
210
    QPDF_DLL
211
    std::vector<QPDFAnnotationObjectHelper> getAnnotations(std::string const& only_subtype = "");
212
213
    // Returns a vector of stream objects representing the content streams for the given page.  This
214
    // routine allows the caller to not care whether there are one or more than one content streams
215
    // for a page.
216
    QPDF_DLL
217
    std::vector<QPDFObjectHandle> getPageContents();
218
219
    // Add the given object as a new content stream for this page. If parameter 'first' is true, add
220
    // to the beginning. Otherwise, add to the end. This routine automatically converts the page
221
    // contents to an array if it is a scalar, allowing the caller not to care what the initial
222
    // structure is. You can call coalesceContentStreams() afterwards if you want to force it to be
223
    // a single stream.
224
    QPDF_DLL
225
    void addPageContents(QPDFObjectHandle contents, bool first);
226
227
    // Rotate a page. If relative is false, set the rotation of the page to angle. Otherwise, add
228
    // angle to the rotation of the page. Angle must be a multiple of 90. Adding 90 to the rotation
229
    // rotates clockwise by 90 degrees.
230
    QPDF_DLL
231
    void rotatePage(int angle, bool relative);
232
233
    // Coalesce a page's content streams. A page's content may be a stream or an array of streams.
234
    // If this page's content is an array, concatenate the streams into a single stream. This can be
235
    // useful when working with files that split content streams in arbitrary spots, such as in the
236
    // middle of a token, as that can confuse some software. You could also call this after calling
237
    // addPageContents.
238
    QPDF_DLL
239
    void coalesceContentStreams();
240
241
    //
242
    // Content stream handling
243
    //
244
245
    // Parse a page's contents through QPDFObjectHandle::ParserCallbacks. This method works whether
246
    // the contents are a single stream or an array of streams. Call on a page object. Also works
247
    // for form XObjects.
248
    QPDF_DLL
249
    void parseContents(QPDFObjectHandle::ParserCallbacks* callbacks);
250
    // Old name
251
    QPDF_DLL
252
    void parsePageContents(QPDFObjectHandle::ParserCallbacks* callbacks);
253
254
    // Pass a page's or form XObject's contents through the given TokenFilter. If a pipeline is also
255
    // provided, it will be the target of the write methods from the token filter. If a pipeline is
256
    // not specified, any output generated by the token filter will be discarded. Use this interface
257
    // if you need to pass a page's contents through a filter for work purposes without having that
258
    // filter automatically applied to the page's contents, as happens with addContentTokenFilter.
259
    // See examples/pdf-count-strings.cc for an example.
260
    QPDF_DLL
261
    void filterContents(QPDFObjectHandle::TokenFilter* filter, Pipeline* next = nullptr);
262
263
    // Old name -- calls filterContents()
264
    QPDF_DLL
265
    void filterPageContents(QPDFObjectHandle::TokenFilter* filter, Pipeline* next = nullptr);
266
267
    // Pipe a page's contents through the given pipeline. This method works whether the contents are
268
    // a single stream or an array of streams. Also works on form XObjects.
269
    QPDF_DLL
270
    void pipeContents(Pipeline* p);
271
    // Old name
272
    QPDF_DLL
273
    void pipePageContents(Pipeline* p);
274
275
    // Attach a token filter to a page's contents. If the page's contents is an array of streams, it
276
    // is automatically coalesced. The token filter is applied to the page's contents as a single
277
    // stream. Also works on form XObjects.
278
    QPDF_DLL
279
    void addContentTokenFilter(std::shared_ptr<QPDFObjectHandle::TokenFilter> token_filter);
280
281
    // A page's resources dictionary maps names to objects elsewhere in the file. This method walks
282
    // through a page's contents and keeps tracks of which resources are referenced somewhere in the
283
    // contents. Then it removes from the resources dictionary any object that is not referenced in
284
    // the contents. This operation is most useful after calling
285
    // QPDFPageDocumentHelper::pushInheritedAttributesToPage(). This method is used by page
286
    // splitting code to avoid copying unused objects in files that used shared resource
287
    // dictionaries across multiple pages. This method recurses into form XObjects and can be called
288
    // with a form XObject as well as a page.
289
    QPDF_DLL
290
    void removeUnreferencedResources();
291
292
    // Return a new QPDFPageObjectHelper that is a duplicate of the page. The returned object is an
293
    // indirect object that is ready to be inserted into the same or a different QPDF object using
294
    // any of the addPage methods in QPDFPageDocumentHelper or QPDF. Without calling one of those
295
    // methods, the page will not be added anywhere. The new page object shares all content streams
296
    // and indirect object resources with the original page, so if you are going to modify the
297
    // contents or other aspects of the page, you will need to handle copying of the component
298
    // parts separately.
299
    QPDF_DLL
300
    QPDFPageObjectHelper shallowCopyPage();
301
302
    // Return a transformation matrix whose effect is the same as the page's /Rotate and /UserUnit
303
    // parameters. If invert is true, return a matrix whose effect is the opposite. The regular
304
    // matrix is suitable for taking something from this page to put elsewhere, and the second one
305
    // is suitable for putting something else onto this page. The page's TrimBox is used as the
306
    // bounding box for purposes of computing the matrix.
307
    QPDF_DLL
308
    QPDFObjectHandle::Matrix getMatrixForTransformations(bool invert = false);
309
310
    // Return a form XObject that draws this page. This is useful for n-up operations, underlay,
311
    // overlay, thumbnail generation, or any other case in which it is useful to replicate the
312
    // contents of a page in some other context. The dictionaries are shallow copies of the original
313
    // page dictionary, and the contents are coalesced from the page's contents. The resulting
314
    // object handle is not referenced anywhere. If handle_transformations is true, the resulting
315
    // form XObject's /Matrix will be set to replicate rotation (/Rotate) and scaling (/UserUnit) in
316
    // the page's dictionary. In this way, the page's transformations will be preserved when placing
317
    // this object on another page.
318
    QPDF_DLL
319
    QPDFObjectHandle getFormXObjectForPage(bool handle_transformations = true);
320
321
    // Return content stream text that will place the given form XObject (fo) using the resource
322
    // name "name" on this page centered within the given rectangle. If invert_transformations is
323
    // true, the effect of any rotation (/Rotate) and scaling (/UserUnit) applied to the current
324
    // page will be inverted in the form XObject placement. This will cause the form XObject's
325
    // absolute orientation to be preserved. You could overlay one page on another by calling
326
    // getFormXObjectForPage on the original page, QPDFObjectHandle::getUniqueResourceName on the
327
    // destination page's Resources dictionary to generate a name for the resulting object, and
328
    // calling placeFormXObject on the destination page. Then insert the new fo (or, if it comes
329
    // from a different file, the result of calling copyForeignObject on it) into the resources
330
    // dictionary using name, and append or prepend the content to the page's content streams. See
331
    // the overlay/underlay code in qpdf.cc or examples/pdf-overlay-page.cc for an example. From
332
    // qpdf 10.0.0, the allow_shrink and allow_expand parameters control whether the form XObject is
333
    // allowed to be shrunk or expanded to stay within or maximally fill the destination rectangle.
334
    // The default values are for backward compatibility with the pre-10.0.0 behavior.
335
    QPDF_DLL
336
    std::string placeFormXObject(
337
        QPDFObjectHandle fo,
338
        std::string const& name,
339
        QPDFObjectHandle::Rectangle rect,
340
        bool invert_transformations = true,
341
        bool allow_shrink = true,
342
        bool allow_expand = false);
343
344
    // Alternative version that also fills in the transformation matrix that was used.
345
    QPDF_DLL
346
    std::string placeFormXObject(
347
        QPDFObjectHandle fo,
348
        std::string const& name,
349
        QPDFObjectHandle::Rectangle rect,
350
        QPDFMatrix& cm,
351
        bool invert_transformations = true,
352
        bool allow_shrink = true,
353
        bool allow_expand = false);
354
355
    // Return the transformation matrix that translates from the given form XObject's coordinate
356
    // system into the given rectangular region on the page. The parameters have the same meaning as
357
    // for placeFormXObject.
358
    QPDF_DLL
359
    QPDFMatrix getMatrixForFormXObjectPlacement(
360
        QPDFObjectHandle fo,
361
        QPDFObjectHandle::Rectangle rect,
362
        bool invert_transformations = true,
363
        bool allow_shrink = true,
364
        bool allow_expand = false);
365
366
    // If a page is rotated using /Rotate in the page's dictionary, instead rotate the page by the
367
    // same amount by altering the contents and removing the /Rotate key. This method adjusts the
368
    // various page bounding boxes (/MediaBox, etc.) so that the page will have the same semantics.
369
    // This can be useful to work around problems with PDF applications that can't properly handle
370
    // rotated pages. If a QPDFAcroFormDocumentHelper is provided, it will be used for resolving any
371
    // form fields that have to be rotated. If not, one will be created inside the function, which
372
    // is less efficient.
373
    QPDF_DLL
374
    void flattenRotation(QPDFAcroFormDocumentHelper* afdh = nullptr);
375
376
    // Copy annotations from another page into this page. The other page may be from the same QPDF
377
    // or from a different QPDF. Each annotation's rectangle is transformed by the given matrix. If
378
    // the annotation is a widget annotation that is associated with a form field, the form field is
379
    // copied into this document's AcroForm dictionary as well. You can use this to copy annotations
380
    // from a page that was converted to a form XObject and added to another page. For an example of
381
    // this, see examples/pdf-overlay-page.cc. This method calls
382
    // QPDFAcroFormDocumentHelper::transformAnnotations, which will copy annotations and form fields
383
    // so that you can copy annotations from a source page to any number of other pages, even with
384
    // different matrices, and maintain independence from the original annotations. See also
385
    // QPDFAcroFormDocumentHelper::fixCopiedAnnotations, which can be used if you copy a page and
386
    // want to repair the annotations on the destination page to make them independent from the
387
    // original page's annotations.
388
    //
389
    // If you pass in a QPDFAcroFormDocumentHelper*, the method will use that instead of creating
390
    // one in the function. Creating QPDFAcroFormDocumentHelper objects is expensive, so if you're
391
    // doing a lot of copying, it can be more efficient to create these outside and pass them in.
392
    QPDF_DLL
393
    void copyAnnotations(
394
        QPDFPageObjectHelper from_page,
395
        QPDFMatrix const& cm = QPDFMatrix(),
396
        QPDFAcroFormDocumentHelper* afdh = nullptr,
397
        QPDFAcroFormDocumentHelper* from_afdh = nullptr);
398
399
  private:
400
    QPDFObjectHandle getAttribute(
401
        std::string const& name,
402
        bool copy_if_shared,
403
        std::function<QPDFObjectHandle()> get_fallback,
404
        bool copy_if_fallback);
405
    static bool
406
    removeUnreferencedResourcesHelper(QPDFPageObjectHelper ph, std::set<std::string>& unresolved);
407
408
    class Members
409
    {
410
        friend class QPDFPageObjectHelper;
411
412
      public:
413
        ~Members() = default;
414
415
      private:
416
        Members() = default;
417
        Members(Members const&) = delete;
418
    };
419
420
    std::shared_ptr<Members> m;
421
};
422
423
#endif // QPDFPAGEOBJECTHELPER_HH