Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import decimal
31import enum
32import hashlib
33import re
34import struct
35import sys
36import uuid
37from collections.abc import Iterable, Mapping
38from io import BytesIO, FileIO, IOBase
39from itertools import compress
40from pathlib import Path
41from re import Pattern
42from types import TracebackType
43from typing import (
44 IO,
45 Any,
46 Callable,
47 Optional,
48 Union,
49 cast,
50)
52if sys.version_info >= (3, 11):
53 from typing import Self
54else:
55 from typing_extensions import Self
57from ._doc_common import DocumentInformation, PdfDocCommon
58from ._encryption import EncryptAlgorithm, Encryption
59from ._page import PageObject, Transformation
60from ._page_labels import nums_clear_range, nums_insert, nums_next
61from ._reader import PdfReader
62from ._utils import (
63 StrByteType,
64 StreamType,
65 _get_max_pdf_version_header,
66 deprecate_with_replacement,
67 deprecation_no_replacement,
68 logger_warning,
69)
70from .constants import AnnotationDictionaryAttributes as AA
71from .constants import CatalogAttributes as CA
72from .constants import (
73 CatalogDictionary,
74 GoToActionArguments,
75 ImageType,
76 InteractiveFormDictEntries,
77 OutlineFontFlag,
78 PageLabelStyle,
79 PagesAttributes,
80 TypFitArguments,
81 UserAccessPermissions,
82)
83from .constants import Core as CO
84from .constants import FieldDictionaryAttributes as FA
85from .constants import PageAttributes as PG
86from .constants import TrailerKeys as TK
87from .errors import LimitReachedError, PdfReadError, PyPdfError
88from .generic import (
89 PAGE_FIT,
90 ArrayObject,
91 BooleanObject,
92 ByteStringObject,
93 ContentStream,
94 Destination,
95 DictionaryObject,
96 EmbeddedFile,
97 Fit,
98 FloatObject,
99 IndirectObject,
100 NameObject,
101 NullObject,
102 NumberObject,
103 PdfObject,
104 RectangleObject,
105 ReferenceLink,
106 StreamObject,
107 TextStringObject,
108 TreeObject,
109 ViewerPreferences,
110 create_string_object,
111 extract_links,
112 hex_to_rgb,
113 is_null_or_none,
114)
115from .generic._appearance_stream import TextStreamAppearance
116from .pagerange import PageRange, PageRangeSpec
117from .types import (
118 AnnotationSubtype,
119 BorderArrayType,
120 LayoutType,
121 OutlineItemType,
122 OutlineType,
123 PagemodeType,
124)
125from .xmp import XmpInformation
127ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
class ObjectDeletionFlag(enum.IntFlag):
    """Combinable bit flags, one per object category (text, links, images, ...)."""

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Convenience mask combining the three image flags above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Return the hexadecimal MD5 digest of the data remaining in ``stream``.

    The stream is consumed from its current position in chunks of
    ``blocksize`` bytes, so the whole content is never held in memory twice.
    The digest is created with ``usedforsecurity=False``.
    """
    digest = hashlib.md5(usedforsecurity=False)
    while (chunk := stream.read(blocksize)) != b"":
        digest.update(chunk)
    return digest.hexdigest()
150class PdfWriter(PdfDocCommon):
151 """
152 Write a PDF file out, given pages produced by another class or through
153 cloning a PDF file during initialization.
155 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
157 Args:
158 clone_from: identical to fileobj (for compatibility)
160 incremental: If true, loads the document and sets the PdfWriter in incremental mode.
162 When writing incrementally, the original document is written first and new/modified
163 content is appended. To be used for signed document/forms to keep signature valid.
165 full: If true, loads all the objects (always full if incremental = True).
166 This parameter may allow loading large PDFs.
168 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.
169 If false, pypdf will try to be forgiving and do something reasonable, but it will log
170 a warning message. It is a best-effort approach.
172 """
def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
    *,
    incremental_clone_object_count_limit: Optional[int] = 500_000,
    incremental_clone_object_id_limit: Optional[int] = 1_000_000,
) -> None:
    """
    Create an empty writer, or one cloned from an existing document.

    Args:
        fileobj: Document to clone when it is a reader, a non-empty stream
            or the path of an existing non-empty file. The value is also
            remembered so that ``with PdfWriter(...)`` can write to it.
        clone_from: Document to clone; used when ``fileobj`` is ``""``,
            ``None``, or a path/stream given together with ``clone_from``.
        incremental: Load ``fileobj`` through a reader and keep the writer
            in incremental mode.
        full: Load ``fileobj`` like incremental mode does, but leave
            incremental mode switched off afterwards.
        strict: Stored as ``self.strict``.
        incremental_clone_object_count_limit: Security limit; any non-int
            value (e.g. ``None``) is replaced by ``sys.maxsize``.
        incremental_clone_object_id_limit: Security limit; any non-int
            value (e.g. ``None``) is replaced by ``sys.maxsize``.

    Raises:
        PyPdfError: ``incremental``/``full`` was requested but ``fileobj``
            is not a path, ``BytesIO`` or ``PdfReader``.
    """
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    self._info_obj: Optional[PdfObject]
    """The PDF file's document information dictionary,
    defined by Info in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    # Security parameters.
    self._incremental_clone_object_count_limit = (
        incremental_clone_object_count_limit
        if isinstance(incremental_clone_object_count_limit, int)
        else sys.maxsize
    )
    self._incremental_clone_object_id_limit = (
        incremental_clone_object_id_limit if isinstance(incremental_clone_object_id_limit, int) else sys.maxsize
    )

    if self.incremental:
        # Normalise path -> BytesIO -> PdfReader so that the previous
        # revision stays available through the reader's stream.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        """Pick the document to clone from ``fileobj`` / ``clone_from``."""
        if fileobj is None:
            # Nothing usable in ``fileobj``: keep an explicitly passed
            # ``clone_from`` instead of overwriting it with None.
            return clone_from
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        if isinstance(fileobj, (str, Path)):
            # A missing or empty file is an output target, not a source.
            fileobj_path = Path(fileobj)
            if not fileobj_path.exists() or fileobj_path.stat().st_size == 0:
                cloning = False
        elif isinstance(fileobj, (IOBase, BytesIO)):
            # Same for an empty stream; restore the position afterwards.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    if full and not incremental:
        # ``full`` only borrowed the incremental loading path above.
        self.incremental = False
    if isinstance(self._ID, list):
        # Store both identifier parts as byte strings.
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())
# for commonality
@property
def is_encrypted(self) -> bool:
    """
    Read-only flag mirroring the reader's ``is_encrypted`` property.

    On the writer it is always ``False``; it only exists so that reader
    and writer expose the same attribute.
    """
    return False
@property
def root_object(self) -> DictionaryObject:
    """
    Expose the dictionary stored in ``self._root_object``.

    Note:
        Intended for read access; the very same object is returned,
        not a copy.

    """
    return self._root_object
@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Resolve ``self._info_obj`` to the "/Info" dictionary (same contract as PdfReader).

    Returns:
        The /Info dictionary, or None when no info object is set.

    """
    info_ref = self._info_obj
    if info_ref is None:
        return None
    return cast(DictionaryObject, info_ref.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    if value is None:
        # Blank the slot the info object occupied, if it can be located;
        # a missing object/reference is silently tolerated.
        try:
            slot = self._info_obj.indirect_reference.idnum - 1  # type: ignore[union-attr]
            self._objects[slot] = None
        except (KeyError, AttributeError):
            pass
        self._info_obj = None
        return

    # Reuse the existing info object (creating one on demand) and replace
    # its content with the entries of ``value``.
    if self._info_obj is None:
        self._info_obj = self._add_object(DictionaryObject())
    target = cast(DictionaryObject, self._info_obj.get_object())
    target.clear()
    target.update(cast(DictionaryObject, value.get_object()))
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, as exposed by the root object."""
    return cast(XmpInformation, self.root_object.xmp_metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """Set, replace or (with ``None``) remove the "/Metadata" stream."""
    root = self.root_object
    if value is None:
        if "/Metadata" in root:
            del root["/Metadata"]
        return

    current = root.get("/Metadata", None)
    if isinstance(current, IndirectObject):
        # Reuse the stream the catalog already points to.
        metadata_stream = cast(StreamObject, current.get_object())
    else:
        # A direct (non-referenced) entry is discarded and replaced by a
        # fresh stream registered with this writer.
        if current is not None:
            del root["/Metadata"]
        metadata_stream = StreamObject()
        stream_reference = self._add_object(metadata_stream)
        root[NameObject("/Metadata")] = stream_reference

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    metadata_stream.set_data(payload)
@property
def with_as_usage(self) -> bool:
    """Deprecated attribute; reading it only reports the deprecation."""
    deprecation_no_replacement("with_as_usage", "5.0")

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated attribute; assigning it only reports the deprecation."""
    deprecation_no_replacement("with_as_usage", "5.0")
def __enter__(self) -> Self:
    """
    Prepare the writer for use as a context manager.

    The writer is re-initialised with default arguments; the clone flag is
    carried over and the file object remembered at construction time
    becomes the output target used on exit.
    """
    was_cloned: bool = self._cloned
    remembered_target = self.temp_fileobj
    self.__init__()  # type: ignore[misc]
    self._cloned = was_cloned
    self._with_as_usage = True
    self.fileobj = remembered_target  # type: ignore[assignment]
    return self
def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """Write to ``self.fileobj`` when one is set and the writer was not cloned."""
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)
@property
def pdf_header(self) -> str:
    """
    PDF header that will be written, e.g. ``'%PDF-1.5'`` (read/write).

    Prefer the lowest version that still supports every feature used in
    the document.

    Note: reading yields a ``str``; assignment accepts ``str`` or ``bytes``.
    """
    return self._header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    # The header is kept as bytes internally.
    self._header = new_header.encode() if isinstance(new_header, str) else new_header
def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    An object that already carries a reference into this writer is not
    added again; its existing reference is returned. Otherwise the object
    is appended and gets a reference whose id is its 1-based position and
    whose generation is 0.
    """
    known_ref = getattr(obj, "indirect_reference", None)
    if known_ref is not None and known_ref.pdf == self:
        return known_ref  # type: ignore[return-value]

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        contents = obj.get(PG.CONTENTS, None)
        if isinstance(contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    new_ref = IndirectObject(len(self._objects), 0, self)
    obj.indirect_reference = new_ref
    return obj.indirect_reference
def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored for an object number or indirect reference.

    Raises:
        ValueError: the reference belongs to another document.
        PdfReadError: the slot exists but holds no object.
    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum

    # Object numbers are 1-based, the storage list is 0-based.
    found = self._objects[idnum - 1]
    if found is None:
        raise PdfReadError(f"Object {indirect_reference!r} not found!")
    return found
def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot identified by ``indirect_reference``.

    The generation number of the object previously stored there is kept.
    An object referenced from another document is cloned into this writer
    first. Returns the object that was finally stored.

    Raises:
        ValueError: the reference belongs to another document.
    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum

    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore[union-attr]

    foreign_ref = getattr(obj, "indirect_reference", None)
    if foreign_ref is not None and foreign_ref.pdf != self:
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj
def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link it into the page tree at ``index``.

    Args:
        page: Page to add; must be a ``PageObject`` whose type entry marks it as a page.
        index: Position in the flattened page list.
        excluded_keys: Page keys that are not cloned; the parent entry
            and "/StructParents" are always excluded as well.

    Returns:
        The clone that now lives in this writer.

    Raises:
        ValueError: ``page`` is not a valid page object.
        PyPdfError: more than 1000 ancestors were visited while updating counts.
    """
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"

    source_page = page
    skipped_keys = [*excluded_keys, PagesAttributes.PARENT, "/StructParents"]

    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        source_ref = source_page.indirect_reference
        del self._id_translated[id(source_ref.pdf)][source_ref.idnum]  # type: ignore[union-attr]
    except Exception:
        pass

    page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )
    if source_page.pdf is not None:
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, source_header)

    parent_node, position_in_node = self._get_page_in_node(index)
    page[NameObject(PagesAttributes.PARENT)] = parent_node.indirect_reference

    kids = cast(ArrayObject, parent_node[PagesAttributes.KIDS])
    if position_in_node >= 0:
        kids.insert(position_in_node, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        # A negative position means: append at the end.
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)

    # Bump the page count of the node and of every ancestor above it.
    ancestor: Optional[PdfObject] = parent_node
    visited = 0
    while not is_null_or_none(ancestor):
        assert ancestor is not None  # for mypy; guarded by is_null_or_none
        ancestor_dict = cast(DictionaryObject, ancestor.get_object())
        previous_count = cast(int, ancestor_dict[PagesAttributes.COUNT])
        ancestor_dict[NameObject(PagesAttributes.COUNT)] = NumberObject(previous_count + 1)
        ancestor = ancestor_dict.get(PagesAttributes.PARENT, None)
        visited += 1
        if visited > 1000:
            raise PyPdfError("Too many recursive calls!")

    if source_page.pdf is not None:
        # the page may contain links to other pages, and those other
        # pages may or may not already be added. we store the
        # information we need, so that we can resolve the references
        # later.
        self._unresolved_links.extend(extract_links(page, source_page))
        self._merged_in_pages[source_page.indirect_reference] = page.indirect_reference

    return page
def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" entry of the document's AcroForm dictionary.

    The AcroForm dictionary is created in the catalog when it is missing.
    Any exception raised while doing so is logged as a warning instead of
    being propagated.

    Args:
        state: The value to store for the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(DictionaryObject())

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            "set_need_appearances_writer(%(state)s) catch : %(exc)s",
            source=__name__,
            state=state,
            exc=exc,
        )
def create_viewer_preferences(self) -> ViewerPreferences:
    """Create a fresh ``ViewerPreferences``, register it in the catalog and return it."""
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences
def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Meant for advanced usage, where suitable ``excluded_keys`` are known.

    The page usually comes from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys of the page that are not copied.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page at a given position of this PDF file. The page usually
    comes from a :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative
            values count from the end; a position at or past the end
            appends the page.
        excluded_keys: Keys of the page that are not copied.

    Returns:
        The added PageObject.

    Raises:
        ValueError: a negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    position = index + page_count if index < 0 else index
    if position < 0:
        raise ValueError("Invalid index value")
    if position >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, position, excluded_keys)
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Look up the page number of the page a reference points to.

    Args:
        indirect_reference: Object number (taken with generation 0) or
            reference; null/None is accepted.

    Returns:
        The page number, or None when the reference is null/None or
        does not resolve to a page.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    target = indirect_reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None
def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: Page width in default user space units.
        height: Page height in default user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))
def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    A dimension that is missing or not positive is taken from the page
    currently at ``index`` (from the last page when ``index`` equals the
    number of pages).

    Args:
        width: The width of the new page in default user space units.
        height: The height of the new page in default user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
        IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]
    """
    num_pages = self.get_num_pages()
    if abs(index) > num_pages:
        raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

    if num_pages > 0:
        # Use the chosen index, but do not exceed the available pages
        fixed_index = min(index, num_pages - 1)
        mediabox = self.pages[fixed_index].mediabox
        if width is None or width <= 0:
            width = mediabox.width
        if height is None or height <= 0:
            height = mediabox.height
    # With an empty document there is no page to borrow a size from:
    # indexing self.pages would fail, so the given dimensions are passed
    # through unchanged and create_blank_page decides whether they suffice.

    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Destination shown when the document is opened (delegates to the base class)."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    # None removes the "/OpenAction" entry; a missing entry is not an error.
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        open_action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        open_action = dest.dest_array
    elif isinstance(dest, PageObject):
        # A page is wrapped into a destination; a page without a
        # reference is represented by a null object.
        page_ref = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        open_action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        # Any other type leaves the catalog untouched.
        return
    self._root_object[NameObject("/OpenAction")] = open_action
def add_js(self, javascript: str) -> None:
    """
    Register a JavaScript action in the document's JavaScript name tree.

    Args:
        javascript: Your JavaScript.

    Example:
        This will launch the print window when the PDF is opened.

        >>> from pypdf import PdfWriter
        >>> output = PdfWriter()
        >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_tree = cast(DictionaryObject, names["/JavaScript"])
    script_entries = cast(ArrayObject, javascript_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_entries.append(create_string_object(str(uuid.uuid4())))

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_entries.append(self._add_object(action))
def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The content of the file.

    Returns:
        The EmbeddedFile instance created for the new attachment.

    """
    embedded_file = EmbeddedFile._create_new(self, filename, data)
    return embedded_file
def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been added;
            its single argument is the page as it now exists in the
            writer.

    """
    notify = callable(after_page_append)
    # Copy pages from reader to writer
    for page_position in range(len(reader.pages)):
        appended_page = self.add_page(reader.pages[page_position])
        # Trigger callback, pass writer page as parameter
        if notify:
            after_page_append(appended_page)
def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add ``new_content_data`` after the content the page already has.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    contents_key = NameObject("/Contents")

    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        fresh_stream = StreamObject()
        fresh_stream.set_data(new_content_data)
        page[NameObject("/Contents")] = self._add_object(fresh_stream)
        return

    # First resolve the existing page content. This always is an IndirectObject:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    existing_content = page[contents_key].get_object()

    if isinstance(existing_content, ArrayObject):
        # An array of streams simply gets one more stream appended.
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        existing_content.append(self._add_object(extra_stream))
        page[NameObject("/Contents")] = self._add_object(existing_content)
    elif isinstance(existing_content, StreamObject):
        # A single stream is replaced by a new one holding old + new data.
        combined = existing_content.get_data() + b"\n" + new_content_data
        replacement = StreamObject()
        replacement.set_data(combined)
        page[NameObject("/Contents")] = self._add_object(replacement)
def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
) -> None:
    """
    Draw an appearance stream on a page by registering it as an XObject
    and appending the matching drawing commands to the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: Name used to build the "/Fm_..." resource name.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])
    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    stream_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])

    if xobject_name in page_xobjects:
        # The existing entry is kept; the drawing commands below refer to it.
        logger_warning(
            "XObject %(xobject_name)r already added to page resources. This might be an issue.",
            source=__name__,
            xobject_name=xobject_name,
        )
    else:
        page_xobjects[xobject_name] = stream_ref

    placement = Transformation().translate(x_offset, y_offset)
    drawing_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, drawing_commands)
# Zero-valued flag set used as the default for ``flags`` below.
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.
            Only written to the annotation when truthy.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document root has no /AcroForm dictionary, or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate=None: the flag was already handled above.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", source=__name__)
        return

    for annotation in page[PG.ANNOTS]:  # type: ignore[attr-defined]
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Start every matched field without an appearance stream, so a
            # field type that produces none (e.g. /Sig) never flattens the
            # stream left over from a previously processed field.
            appearance_stream_obj: Optional[StreamObject] = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        self, page, flatten, acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        self, page, flatten, acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
                    cast(DictionaryObject, annotation[NameObject(AA.AP)])[
                        NameObject("/N")
                    ] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Reuse the existing object number so references to the
                    # old normal appearance now resolve to the new stream.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore[index]
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", source=__name__)

            if appearance_stream_obj and flatten:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])
def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Register orphan widget fields of a page in the AcroForm ``/Fields`` array.

    A widget annotation that has an ``/FT`` entry but whose indirect
    reference is not yet listed in ``/Fields`` is appended there. Missing
    ``/AcroForm`` and ``/Fields`` entries are created on the way.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    # Make sure the catalog owns an /AcroForm dictionary with a /Fields array.
    if CatalogDictionary.ACRO_FORM in self._root_object:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    else:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    if InteractiveFormDictEntries.Fields in acro_form:
        form_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    else:
        form_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = form_fields

    if "/Annots" not in page:
        return reattached
    page_annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(page_annotations):
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in form_fields
        )
        if already_listed:
            continue
        if not isinstance(entry, IndirectObject):
            # Inline annotation: register it with the writer and store the
            # resulting reference in the page's annotation array.
            page_annotations[position] = self._add_object(widget)
        form_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached
1134 def _collect_incremental_clone_object_ids(self, reader: PdfReader) -> list[int]:
1135 object_ids: set[int] = set()
1136 for xref_entry in reader.xref.values():
1137 object_ids.update(filter(None, xref_entry))
1138 object_ids.update(filter(None, reader.xref_objStm))
1140 object_count = len(object_ids)
1141 if object_count > self._incremental_clone_object_count_limit:
1142 raise LimitReachedError(
1143 f"Incremental clone object count {object_count} exceeds "
1144 f"maximum allowed count {self._incremental_clone_object_count_limit}."
1145 )
1147 max_object_id = max(object_ids, default=0)
1148 if max_object_id > self._incremental_clone_object_id_limit:
1149 raise LimitReachedError(
1150 f"Incremental clone object ID {max_object_id} exceeds "
1151 f"maximum allowed ID {self._incremental_clone_object_id_limit}."
1152 )
1154 return sorted(object_ids)
def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    In incremental mode the reader's objects are replicated at their
    original object numbers and a hash of each one is recorded; otherwise
    the writer's object list is emptied before the root is cloned and the
    page tree is rebuilt from the flattened pages.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: In strict mode when more objects exist than the
            trailer ``/Size`` declares, or when flattening the page tree
            raises an ``IndexError``.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        source_ids = self._collect_incremental_clone_object_ids(reader)
        slot_count = source_ids[-1] if source_ids else 0
        self._objects = [None] * slot_count
        for source_id in source_ids:
            source_object = reader.get_object(source_id)
            if source_object is None:
                continue
            self._objects[source_id - 1] = source_object.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    declared_size = cast(int, reader.trailer["/Size"])
    if len(self._objects) > declared_size:
        if self.strict:
            raise PdfReadError(
                f"Object count {len(self._objects)} exceeds defined trailer size {declared_size}"
            )
        logger_warning(
            "Object count %(object_count)d exceeds defined trailer size %(trailer_size)d",
            source=__name__,
            object_count=len(self._objects),
            trailer_size=declared_size,
        )

    # The hashes must be taken here, before the page objects are rewritten.
    if self.incremental:
        self._original_hash = [
            0 if stored is None else stored.hash_bin() for stored in self._objects
        ]

    try:
        self._flatten()
    except IndexError:
        raise PdfReadError("Got index error while flattening.")

    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_id = cast(IndirectObject, flat_page.indirect_reference).idnum
        self._replace_object(page_id, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )
def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF reader, copying '/Root', '/Info' and '/ID'.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Optional callback, invoked once per entry of the cloned page
            tree's ``/Kids`` array after cloning. Its single parameter is
            the resolved page object.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            info_hash = self._info.hash_bin()
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = info_hash
        else:
            info_copy = DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            self._info_obj = self._add_object(info_copy)
    # Without source info, _info_obj stays None as set by clone_reader_document_root().

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        # Raised when reader._ID has no ``clone`` attribute (e.g. it is None).
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())
def _compute_document_identifier(self) -> ByteStringObject:
    """
    Derive an identifier from the serialized document body.

    The PDF structure is written to an in-memory buffer, which is rewound
    and passed to ``_rolling_checksum``; the UTF-8 encoded result is
    returned as a byte string object.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))
def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".

    An existing first identifier is kept and only the second one is
    recomputed; without an existing ``/ID`` both are set to the same
    freshly computed value.
    """
    if self._ID:
        first, second = self._ID[0], self._compute_document_identifier()
    else:
        first = second = self._compute_document_identifier()
    self._ID = ArrayObject((first, second))
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        NotImplementedError: If the writer is in incremental mode.
        ValueError: If ``algorithm`` does not name a member of
            ``EncryptAlgorithm``.

    """
    if self.incremental:
        raise NotImplementedError("Encrypting incremental PDF files is currently not supported.")

    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        # No explicit algorithm: fall back to RC4, key size chosen by use_128bit.
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        member_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, member_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    entry = self._encryption.write_entry(user_password, owner_password, strict=self.strict)

    previous_entry = self._encrypt_entry
    if previous_entry:
        # encrypt() was called before: the new entry takes over the object
        # slot of the old one.
        assert previous_entry.indirect_reference is not None
        entry.indirect_reference = previous_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry
1349 def _resolve_links(self) -> None:
1350 """Patch up links that were added to the document earlier, to
1351 make sure they still point to the same pages.
1352 """
1353 for (new_link, old_link) in self._unresolved_links:
1354 old_page = old_link.find_referenced_page()
1355 if not old_page:
1356 continue
1357 new_page = self._merged_in_pages.get(old_page)
1358 if new_page is None:
1359 continue
1360 new_link.patch_reference(self, new_page)
def write_stream(self, stream: StreamType) -> None:
    """
    Write the document to an already opened stream.

    A warning is logged when the stream exposes a ``mode`` attribute that
    does not contain ``"b"``. In incremental mode the reader's original
    bytes are copied first and an increment is appended only if objects
    were added or modified; otherwise the full structure, cross-reference
    table and trailer are written.

    Args:
        stream: Target stream.

    """
    if hasattr(stream, "mode"):
        if "b" not in stream.mode:
            logger_warning(
                "File <%(stream_name)s> to write to is not in binary mode. "
                "It may not be written to correctly.",
                source=__name__,
                stream_name=stream.name,
            )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    self._reader.stream.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref
def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the stream was opened (and closed again)
        by this method, and the stream that was written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    my_file = isinstance(stream, (str, Path))
    if my_file:
        stream = FileIO(stream, "wb")

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must not leak its handle when writing fails.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream
def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Provides the list of new or modified objects that will be written
    in the increment.
    Deleted objects will not be freed but will become orphans.

    An object counts as new when its slot lies beyond the recorded
    original hashes, and as modified when its current hash differs from
    the recorded one.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline_count = len(self._original_hash)
    changed = []
    for slot, stored in enumerate(self._objects):
        if stored is None:
            continue
        if slot >= baseline_count or stored.hash_bin() != self._original_hash[slot]:
            changed.append(cast(IndirectObject, stored).indirect_reference)
    return changed
def _write_increment(self, stream: StreamType) -> None:
    """
    Append the new or modified objects and a cross-reference stream.

    Every object that is new (beyond the recorded original hashes) or whose
    hash changed is written, followed by an ``/XRef`` stream object whose
    ``/Index`` lists the runs of consecutive object numbers written, and
    finally ``startxref`` and ``%%EOF``.

    Args:
        stream: Stream the increment is appended to.

    """
    offsets = {}
    index_runs = []
    run_start = -1
    run_end = -2
    baseline_count = len(self._original_hash)
    for slot, stored in enumerate(self._objects):
        if stored is None:
            continue
        is_changed = (
            slot >= baseline_count
            or stored.hash_bin() != self._original_hash[slot]
        )
        if not is_changed:
            continue
        idnum = slot + 1
        assert isinstance(stored, PdfObject), "mypy"
        # first write new/modified object
        offsets[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # Encryption is not operational for increments: objects are written as-is.
        stored.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: close the current run when the numbering jumps
        if idnum != run_end:
            if run_start > 0:
                index_runs.append([run_start, run_end - run_start])
            run_start = idnum
        run_end = idnum + 1
    assert run_start > 0, "for pytest only"
    index_runs.append([run_start, run_end - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(number) for run in index_runs for number in run]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore[union-attr]
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    xr.set_data(
        b"".join(struct.pack(b">BIB", 1, offset, 0) for offset in offsets.values())
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
1511 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:
1512 object_positions = []
1513 free_objects = []
1514 stream.write(self.pdf_header.encode() + b"\n")
1515 stream.write(b"%\xE2\xE3\xCF\xD3\n")
1517 for idnum, obj in enumerate(self._objects, start=1):
1518 if obj is not None:
1519 object_positions.append(stream.tell())
1520 stream.write(f"{idnum} 0 obj\n".encode())
1521 if self._encryption and obj != self._encrypt_entry:
1522 obj = self._encryption.encrypt_object(obj, idnum, 0)
1523 obj.write_to_stream(stream)
1524 stream.write(b"\nendobj\n")
1525 else:
1526 object_positions.append(-1)
1527 free_objects.append(idnum)
1528 free_objects.append(0) # add 0 to loop in accordance with specification
1529 return object_positions, free_objects
1531 def _write_xref_table(
1532 self, stream: StreamType, object_positions: list[int], free_objects: list[int]
1533 ) -> int:
1534 xref_location = stream.tell()
1535 stream.write(b"xref\n")
1536 stream.write(f"0 {len(self._objects) + 1}\n".encode())
1537 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
1538 free_idx = 1
1539 for offset in object_positions:
1540 if offset > 0:
1541 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
1542 else:
1543 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())
1544 free_idx += 1
1545 return xref_location
def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    ``/Size`` and ``/Root`` are always written; ``/Info``, ``/ID`` and
    ``/Encrypt`` only when the writer holds them.

    Args:
        stream: Stream to write to.
        xref_location: Offset of the cross-reference table, written after
            ``startxref``.

    """
    stream.write(b"trailer\n")
    size_entry = NumberObject(len(self._objects) + 1)
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): size_entry,
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    encrypt_entry = self._encrypt_entry
    if encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    When setting, a dict replaces the existing entries and ``None`` removes
    the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Drop the current entries so that only the new ones remain.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)
def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is resolved (if it is a PDF object), converted with
    ``str`` and stored as a string object under its key; the /Info
    dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    converted = {}
    for key, raw_value in list(infos.items()):
        resolved = raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        converted[NameObject(key)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)
# Sentinel telling "argument not passed" apart from any real value.
_UNSET = object()

def compress_identical_objects(
    self,
    remove_identicals: Any = _UNSET,
    remove_orphans: Any = _UNSET,
    *,
    remove_duplicates: bool = True,
    remove_unreferenced: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Deprecated, use ``remove_duplicates``.
        remove_orphans: Deprecated, use ``remove_unreferenced``.
        remove_duplicates: Remove duplicate objects.
        remove_unreferenced: Remove unreferenced objects.

    """
    if remove_identicals != self._UNSET:
        deprecate_with_replacement("remove_identicals", "remove_duplicates", "7.0.0")
        assert isinstance(remove_identicals, bool)
        remove_duplicates = remove_identicals
    if remove_orphans != self._UNSET:
        deprecate_with_replacement("remove_orphans", "remove_unreferenced", "7.0.0")
        assert isinstance(remove_orphans, bool)
        remove_unreferenced = remove_orphans

    def rewrite_references(
        container: PdfObject, redirect: dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk dictionaries and arrays recursively: mark every referenced
        # object as used and redirect references to merged duplicates.
        if isinstance(container, DictionaryObject):
            entries = container.items()
        elif isinstance(container, ArrayObject):
            entries = enumerate(container)  # type: ignore[assignment]
        else:
            return
        assert isinstance(container, (DictionaryObject, ArrayObject))
        for slot, child in entries:
            if not isinstance(child, IndirectObject):
                # Anything that is not a dictionary or array is filtered
                # out by the recursive call itself.
                rewrite_references(child, redirect)
                continue
            unreferenced[child.idnum - 1] = False
            if child in redirect:
                container[slot] = redirect[child]

    # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
    self._idnum_hash = {}
    unreferenced = [True] * len(self._objects)
    # look for similar objects
    for position, stored in enumerate(self._objects):
        if is_null_or_none(stored):
            continue
        assert stored is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(stored.indirect_reference, IndirectObject)
        digest = stored.hash_value()
        if remove_duplicates and digest in self._idnum_hash:
            self._idnum_hash[digest][1].append(stored.indirect_reference)
            self._objects[position] = None
        else:
            self._idnum_hash[digest] = (stored.indirect_reference, [])

    # map every removed duplicate to the first object with the same hash
    merged = {
        keeper: duplicates
        for keeper, duplicates in self._idnum_hash.values()
        if len(duplicates) > 0
    }
    redirect_map: dict[IndirectObject, IndirectObject] = {}
    for keeper, duplicates in merged.items():
        for duplicate in duplicates:
            redirect_map[duplicate] = keeper

    # replace reference to merged objects
    for stored in self._objects:
        if isinstance(stored, (DictionaryObject, ArrayObject)):
            rewrite_references(stored, redirect_map)

    if remove_unreferenced:
        # Root, info and ID are kept even though nothing in the body refers to them.
        unreferenced[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

        if not is_null_or_none(self._info):
            unreferenced[self._info.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

        try:
            unreferenced[self._ID.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]
        except AttributeError:
            pass

        for position, is_orphan in enumerate(unreferenced):
            if is_orphan:
                self._objects[position] = None
def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to an object stored in this writer.

    The object is looked up in the writer's object list, so ``list.index``
    raises ``ValueError`` when it is not present.

    Args:
        obj: Object to look up.

    Returns:
        A generation-0 reference whose number is the object's 1-based
        position in the object list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference
def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document, creating it if necessary.

    An existing catalog entry that is not a ``TreeObject`` is converted
    into one, which then replaces the stored object. When the catalog has
    no outline entry, an empty tree is added to the writer and registered
    in the catalog.

    Returns:
        The outline root as a ``TreeObject``.

    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline
def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    An empty array is stored in the catalog when it has no threads entry.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        created = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = created
        return created
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])
@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    The value is whatever :meth:`get_threads_root` returns.
    """
    thread_list = self.get_threads_root()
    return thread_list
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The outline item to insert. A page object is
            wrapped into a ``Destination`` titled ``page #<n>`` that fits
            the whole page.
        parent: Outline item the new item becomes a child of; defaults to
            the outline root.
        before: Sibling the new item is inserted before. Its
            ``indirect_reference`` is what gets passed to ``insert_child``.
        is_open: Stored on the item under ``/%is_open%``; when false the
            parent counters are not incremented on insertion.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open so they are honoured for pages too
        # instead of being silently replaced by the defaults.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries are copied into a new ``TreeObject`` that is then handed to
    :meth:`add_outline_item_destination` together with the remaining
    arguments.

    Args:
        outline_item: Entries of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference returned by
        :meth:`add_outline_item_destination`.

    """
    node = TreeObject()
    node.update(outline_item)
    # An "/A" action entry gets no special treatment here; the code that
    # copied it into a separate indirect object was disabled as unreachable.
    return self.add_outline_item_destination(node, parent, before, is_open)
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            ``None`` creates an item without a go-to action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination`.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional order: re-dispatch with the arguments shifted
        # into their current positions.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # No such page in this writer: keep the plain number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                "can not find reference of page %(page_number)s",
                source=__name__,
                page_number=page_number,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        go_to_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): dest.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(go_to_action)

    item_object = _create_outline_item(action_ref, title, color, italic, bold)
    outline_item = self._add_object(item_object)

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_outline(self) -> None:
    """
    Placeholder that is not implemented.

    Raises:
        NotImplementedError: Always; use :meth:`add_outline_item` instead.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destination root.

    The root is walked in steps of two (title, destination, title, ...).
    The new pair is placed in front of the first title that compares
    greater than ``title``; if there is none, it is appended.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    for position in range(0, len(names), 2):
        if title < names[position]:
            # Insert the destination first so that the title inserted at
            # the same index ends up right in front of it.
            names.insert(position, destination)
            names.insert(position, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])
def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register the destination array of ``page_destination`` under its title.

    Args:
        page_destination: Object providing ``dest_array`` and a "/Title" entry.

    Returns:
        The reference of the newly added destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore[attr-defined]
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore[index]
    self.add_named_destination_array(dest_title, dest_array_ref)
    return dest_array_ref
def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Add a named "/GoTo" destination pointing to a page.

    Args:
        title: Name of the destination.
        page_number: Index into the kids array of the page tree root.

    Returns:
        The reference of the added "/GoTo" dictionary.

    """
    target_page = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore[index]
    goto_action = DictionaryObject()
    view = ArrayObject(
        [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    goto_action.update(
        {
            NameObject(GoToActionArguments.D): view,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    goto_ref = self._add_object(goto_action)

    # The named destination list stores its keys as text strings.
    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, goto_ref)
    return goto_ref
def remove_links(self) -> None:
    """Strip every annotation (links included) from all pages of this output."""
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page, ObjectDeletionFlag.ALL_ANNOTATIONS
        )
def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove the annotations of the given subtype(s) from every page.

    Args:
        subtypes: Subtype or list of subtypes to be removed, for example
            "/Link", "/FileAttachment", "/Sound", "/Movie", "/Screen", ...
            Pass ``None`` to remove all annotations.

    """
    for current_page in self.pages:
        self._remove_annots_from_page(current_page, subtypes)
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of ``page`` whose "/Subtype" is in ``subtypes``.

    Args:
        page: Page (or a reference to it) to clean.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return
    annots = cast(ArrayObject, page[PG.ANNOTS])
    index = 0
    while index < len(annots):
        annot = annots[index]
        annot_dict = cast(DictionaryObject, annot.get_object())
        if subtypes is not None and cast(str, annot_dict["/Subtype"]) not in subtypes:
            # Not selected: keep it and look at the next entry.
            index += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        # The following entries shift down, so the index is not advanced.
        del annots[index]
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list/tuple of ``ObjectDeletionFlag``; each entry of a
            list/tuple is processed separately.
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters as well; otherwise a TEXT entry in the list
            # would delete all text instead of the selected fonts only.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-related flags are handled on the annotation array only;
    # the first matching flag wins and the content stream is not touched.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators which are dropped for the requested flags.
    jump_operators: list[bytes] = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = [
            b"w", b"J", b"j", b"M", b"d", b"i",
            b"W", b"W*",
            b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
            b"m", b"l", b"c", b"v", b"y", b"h", b"re",
            b"sh"
        ]
    if to_delete & ObjectDeletionFlag.TEXT:
        # Replaces (does not extend) the drawing operators selected above.
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        # Clean the XObjects (images and nested forms) first ...
        images, forms = self._remove_objects_from_page__clean_forms(
            elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters,
        )

        # ... then the content stream of the page itself.
        self._remove_objects_from_page__clean(
            content=content, images=images, forms=forms,
            jump_operators=jump_operators, to_delete=to_delete,
            text_filters=text_filters
        )
        page.replace_contents(content)
    # Kept as-is although the annotation says ``None``.
    return [], []  # type: ignore[return-value]
def _remove_objects_from_page__clean(
    self,
    content: ContentStream,
    images: list[str],
    forms: list[str],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> None:
    """
    Drop the selected operations from ``content`` in place.

    Args:
        content: Content stream whose ``operations`` list gets filtered.
        images: XObject names; a ``Do`` operation on one of them is a
            removal candidate when XOBJECT_IMAGES is requested.
        forms: Not used by this method.
        jump_operators: Operators that are removal candidates.
        to_delete: Flags describing what has to be removed.
        text_filters: Optional filter; when TEXT is requested and a non-empty
            filter is given, a candidate is only removed while the font
            selected by the last ``Tf`` is listed in ``font_ids``.

    """
    removing_text = to_delete & ObjectDeletionFlag.TEXT
    font_ids_to_delete = []
    if text_filters and removing_text:
        font_ids_to_delete = text_filters.get("font_ids", [])

    current_font = None
    position = 0
    while position < len(content.operations):
        operands, operator = content.operations[position]
        if operator == b"Tf":
            # Remember the font that applies to the following text operators.
            current_font = operands[0]
        is_candidate = (
            (
                operator == b"INLINE IMAGE"
                and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
            )
            or (operator in jump_operators)
            or (
                operator == b"Do"
                and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                and (operands[0] in images)
            )
        )
        if is_candidate and (
            not removing_text
            or not text_filters
            or current_font in font_ids_to_delete
        ):
            # The next operation moves to this index; do not advance.
            del content.operations[position]
        else:
            position += 1
    content.get_data()  # this ensures ._data is rebuilt from the .operations
def _remove_objects_from_page__clean_forms(
    self,
    elt: DictionaryObject,
    stack: list[DictionaryObject],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> tuple[list[str], list[str]]:
    """
    Clean the XObjects referenced by the resources of ``elt``, recursively.

    Image XObjects are replaced by a ``NullObject`` when XOBJECT_IMAGES is
    requested; form XObjects are converted to a ``ContentStream``, cleaned
    recursively and stored back. If ``elt`` itself is a stream, its own
    content is cleaned as well.

    Args:
        elt: Page or form XObject whose "/Resources" -> "/XObject" is scanned.
        stack: Elements already being processed; used to stop on loops.
        jump_operators: Operators to drop, forwarded to the content cleaning.
        to_delete: Flags describing what has to be removed.
        text_filters: Optional text filter, forwarded to the content cleaning.

    Returns:
        Two lists with the XObject names found in ``elt``: the images which
        have been removed, and the forms.

    """
    # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
    if (elt in stack) or (
        hasattr(elt, "indirect_reference") and any(
            elt.indirect_reference == getattr(x, "indirect_reference", -1)
            for x in stack
        )
    ):
        # to prevent infinite looping
        return [], []  # pragma: no cover
    try:
        d = cast(
            dict[Any, Any],
            cast(DictionaryObject, elt["/Resources"])["/XObject"],
        )
    except KeyError:
        # No resources or no XObject: nothing to scan.
        d = {}
    images = []
    forms = []
    for k, v in d.items():
        o = v.get_object()
        try:
            # ``content`` is the replacement for the XObject, if any.
            content: Any = None
            if (
                to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                and o["/Subtype"] == "/Image"
            ):
                content = NullObject()  # to delete the image keeping the entry
                images.append(k)
            if o["/Subtype"] == "/Form":
                forms.append(k)
                if isinstance(o, ContentStream):
                    content = o
                else:
                    # Wrap the stream and copy its entries, except those
                    # describing the encoding of the original stream data.
                    content = ContentStream(o, self)
                    content.update(
                        {
                            k1: v1
                            for k1, v1 in o.items()
                            if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                        }
                    )
                    try:
                        content.indirect_reference = o.indirect_reference
                    except AttributeError:  # pragma: no cover
                        pass
                stack.append(elt)

                # clean subforms
                self._remove_objects_from_page__clean_forms(
                    elt=content, stack=stack, jump_operators=jump_operators, to_delete=to_delete,
                    text_filters=text_filters,
                )
            if content is not None:
                if isinstance(v, IndirectObject):
                    # Replace the object behind the existing reference.
                    self._objects[v.idnum - 1] = content
                else:
                    # should only occur in a PDF not respecting PDF spec
                    # where streams must be indirected.
                    d[k] = self._add_object(content)  # pragma: no cover
        except (TypeError, KeyError):
            # XObject without a usable "/Subtype": left untouched.
            pass
    for im in images:
        del d[im]  # for clean-up
    if isinstance(elt, StreamObject):  # for /Form
        if not isinstance(elt, ContentStream):  # pragma: no cover
            e = ContentStream(elt, self)
            e.update(elt.items())
            elt = e
        # clean the content
        self._remove_objects_from_page__clean(
            content=elt, images=images, forms=forms, jump_operators=jump_operators,
            to_delete=to_delete, text_filters=text_filters
        )
    return images, forms
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A boolean is treated like
            ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the image types into the equally named deletion flags.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags |= ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)
def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.

    """

    def collect_fonts(
        obj: Any,
        found: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        """Walk ``obj`` recursively and gather the resource IDs per font name."""
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                if normalized not in found:
                    found[normalized] = {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    }
                if key not in found[normalized]["resource_ids"]:
                    found[normalized]["resource_ids"].append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    if not font_names:
        font_names = []

    for page in self.pages:
        text_filters: dict[str, Any] = {}
        if font_names:
            # Content streams reference fonts and other resources with names
            # like "/F1" or "/T1_0", so the requested font names have to be
            # converted to the resource names/IDs used on this page.
            fonts_on_page = collect_fonts(page.get("/Resources"))
            resource_ids: list[Any] = []
            for wanted in font_names:
                if wanted in fonts_on_page:
                    resource_ids.extend(fonts_on_page[wanted]["resource_ids"])
            text_filters["font_ids"] = resource_ids
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. The first three
            entries are used as numbers; an optional fourth entry is used
            as the dash pattern. If omitted, ``[2, 2, 2]`` is used.

    Raises:
        ValueError: ``rect`` is a string that does not hold exactly four
            numbers.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore[index]
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # A rectangle cannot be expressed as a single number: parse the
        # documented "[ xLL yLL xUR yUR ]" form into its four coordinates.
        coordinates = rect.replace("[", " ").replace("]", " ").split()
        if len(coordinates) != 4:
            raise ValueError(
                f"rect must hold exactly four numbers, got {rect!r}"
            )
        rect = RectangleObject(tuple(float(c) for c in coordinates))  # type: ignore[arg-type]
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    # The action triggered by the link ...
    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    # ... and the link annotation carrying it.
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
# Page layout names which ``_set_page_layout`` accepts without a warning.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)
def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the "/PageLayout" entry of the root object, or ``None`` if absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not a ``NameObject`` is converted to one; if it is not
    one of the known layouts, a warning is logged but the value is stored
    nevertheless.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            logger_warning(
                "Layout should be one of: %(layouts)s",
                source=__name__,
                # A readable, comma-separated list (same as for page modes)
                # instead of a set holding the concatenated names.
                layouts=", ".join(self._valid_layouts),
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout of the document.

    Thin public wrapper which hands ``layout`` to ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    return self._set_page_layout(layout)
@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout of the document, or ``None`` when no layout is set.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout
@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same code path as ``set_page_layout``.
    return self._set_page_layout(layout)
# Page mode names which the ``page_mode`` setter accepts without a warning.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)
def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the "/PageMode" entry of the root object, or ``None`` if absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode of the document, or ``None`` when no mode is set.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode
@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    mode_name: NameObject
    if not isinstance(mode, NameObject):
        # Unknown names only trigger a warning; the value is stored anyway.
        if mode not in self._valid_modes:
            logger_warning(
                "Mode should be one of: %(modes)s",
                source=__name__,
                modes=", ".join(self._valid_modes),
            )
        mode_name = NameObject(mode)
    else:
        mode_name = mode
    self._root_object.update({NameObject("/PageMode"): mode_name})
def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Attach one new annotation to a page.

    The annotation has to be a fresh one; an annotation which has been
    added before cannot be reused.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    page = page_number
    if isinstance(page, int):
        page = self.pages[page]
    elif not isinstance(page, PageObject):
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    # Make sure that the page owns an annotation array.
    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        target_page = raw_dest["target_page_index"]
        # I have no clue why this dict-hack is necessary
        target_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )
        link_dest = Destination(NameObject("/LinkName"), target_page, target_fit)
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    # A popup is registered on its parent annotation, too.
    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        parent_annotation = cast(DictionaryObject, to_add["/Parent"].get_object())
        parent_annotation[NameObject("/Popup")] = to_add.indirect_reference

    return to_add
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.

    Currently: named destinations given as ``NameObject`` (either in "/Dest"
    of an annotation or in "/D" of its "/A" action) are converted to
    ``TextStringObject`` (required for names/dests list).

    Args:
        page: Page, or reference to the page, to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader`` or an object
            implementing ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and, for a ``PdfReader`` with a truthy
        ``_encryption``, that encryption object (otherwise ``None``).

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    # A string or Path is taken as the location of the file to read.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    # A reader: copy its whole stream, leaving the position untouched.
    if isinstance(fileobj, PdfReader):
        encryption = fileobj._encryption or None
        previous_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        copied: IOBase = BytesIO(fileobj.stream.read())
        # reset the stream to its original location
        fileobj.stream.seek(previous_position)
        return copied, encryption

    # Any file-like object: read it from the start.
    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    When ``outline_item`` is a tuple, a list or a ``PageRange``, it is taken
    as the page selection and the following arguments are shifted by one.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    bookmark_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # No title was given: the second argument is the page selection,
        # so the remaining positional arguments move one slot to the left.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        bookmark_title = None

    self.merge(
        None,
        fileobj,
        bookmark_title,
        pages,
        import_outline,
        excluded_fields,
    )
def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Besides the pages, the named destinations, the outline, the
    annotations, the form fields and the articles of the source are
    processed for the merged pages.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. With ``None``, the pages
            are added via :meth:`add_page`.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        # Already a parsed document: use it directly.
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )
    # NOTE(review): a tuple with more than three items matches no branch
    # above and is iterated element by element below - confirm intended.

    # Maps the object number of each source page to its copy in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore[list-item]
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore[list-item]
            )
            # Keep the source order for the following pages.
            position += 1
        # Remember the source page; its annotations are read from it below.
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    for dest in reader._named_destinations.values():
        self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages)

    # Outline node below which the outline of the source gets inserted.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # New outline item pointing to the first merged page.
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and not is_null_or_none(_ro["/AcroForm"]):
        if "/AcroForm" not in self._root_object:
            # First form merged: clone the form dictionary without its
            # fields, which are collected separately below.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # Object numbers of the reader translated to the ones of this writer.
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore[index]
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)
def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Register one named destination of ``reader`` if it targets a merged page.

    Nothing is added when the title is already listed in this writer, when
    the destination has no page, or when the page is not part of ``srcpages``.

    Args:
        dest: Destination taken from the named destinations of ``reader``.
        reader: Document the destination comes from.
        srcpages: Merged pages, keyed by the object number of the source page.

    """
    arr: Any = dest.dest_array

    if "/Names" in self._root_object:
        title = dest["/Title"]
        names_dict = cast(DictionaryObject, self._root_object["/Names"])
        dests_dict = cast(
            DictionaryObject, names_dict.get("/Dests", DictionaryObject())
        )
        known_names = cast(list[Any], dests_dict.get("/Names", DictionaryObject()))
        if title in known_names:
            # already exists: should not duplicate it
            return

    target = dest["/Page"]
    if target is None or isinstance(target, NullObject):
        return

    if isinstance(target, int):
        # the page reference is a page number normally not a PDF Reference
        # page numbers as int are normally accepted only in external goto
        try:
            source_page = reader.pages[target]
        except IndexError:
            return
        assert source_page.indirect_reference is not None
        try:
            arr[NumberObject(0)] = NumberObject(
                srcpages[source_page.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], arr)
        except KeyError:
            # The page has not been merged: skip the destination.
            pass
        return

    source_id = target.indirect_reference.idnum
    if source_id in srcpages:
        # Point the destination to the copy of the page in this writer.
        arr[NumberObject(0)] = srcpages[source_id].indirect_reference
        self.add_named_destination_array(dest["/Title"], arr)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry and appended
    to ``self.threads``. The source articles are then walked through their
    ``/N`` links, starting at ``thread["/F"]``; for each article whose
    ``/P`` page has a clone in ``pages`` a new article is created, linked
    to the previous new article (``/N`` / ``/V``) and appended to the
    ``/B`` array of the cloned page. When the walk comes back to the first
    source article, the new articles are closed into a ring.

    If no article of the thread lies on a copied page, no article is
    created and the cloned thread is returned without an ``/F`` entry.

    Args:
        thread: Thread dictionary coming from the reader.
        pages: Copied pages, keyed by the ``idnum`` of the source page.
        reader: The source document the thread belongs to.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    # Both stay None until a first article is found on a copied page.
    new_first: Optional[DictionaryObject] = None
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source ring: close the new ring,
            # but only if at least one article was actually created.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern (or its source string) matched against thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every article listed in the ``/B`` array of the original of each
    copied page, the owning thread (``/T``) is added when it has not been
    translated for this reader yet and its ``/I`` -> ``/Title`` matches
    ``fltr``.

    Args:
        fltr: Compiled pattern or pattern string searched in the thread
            title; any other value is replaced by a pattern matching
            everything.
        pages: Copied pages; each must expose ``original_page``.
        reader: The source document the pages come from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")

    for copied_page in pages.values():
        source_page = copied_page.original_page
        for article_ref in source_page.get("/B", ()):
            article = article_ref.get_object()
            if is_null_or_none(article):
                continue
            thread = article.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            already_copied = (
                thread.indirect_reference.idnum in self._id_translated[id(reader)]
            )
            if already_copied:
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if fltr.search(title):
                self._add_articles_thread(thread, pages, reader)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the writer-side reference of the clone of a reader page.

    Args:
        page: The source page, either as a ``/Type /Page`` dictionary or
            as an indirect reference. Any other value yields ``None``.
        pages: Copied pages, keyed by the ``idnum`` of the source page.
        reader: Not used by this method.

    Returns:
        The ``indirect_reference`` of the matching entry of ``pages``, or
        ``None`` when the page cannot be identified or was not copied.

    """
    if isinstance(page, NullObject):
        return None
    source_ref = None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    if source_ref is None:
        # None, a non-page dictionary, or a page without a reference:
        # there is nothing to look up.
        return None
    try:
        return pages[source_ref.idnum].indirect_reference
    except (AttributeError, KeyError):
        # The page is not part of the copied set.
        return None
def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations that remain valid for the copied page set.

    Annotations without any destination are cloned as is. Annotations
    carrying a destination (``/Dest``, or ``/D`` of a ``/GoTo`` action of
    a ``/Link``) are kept only when the destination is a name known to
    ``get_named_dest_root()`` or an explicit destination whose first
    element resolves to a copied page; in the latter case the page is
    replaced by its clone.

    Args:
        annots: The annotation array, possibly behind an indirect
            reference, or ``None``.
        page: Not used by this method.
        pages: Copied pages, keyed by the ``idnum`` of the source page.
        reader: The source document, forwarded to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` with the references of the kept annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(
            "Expected list of annotations, got %(annots)s of type %(annots_type)s.",
            source=__name__,
            annots=annots,
            annots_type=annots.__class__.__name__,
        )
        return kept

    for annot_ref in annots:
        annot = cast("DictionaryObject", annot_ref.get_object())
        # A /Link whose only destination lives in its /GoTo action.
        via_goto_action = (
            annot["/Subtype"] == "/Link"  # type: ignore[comparison-overlap]
            and "/A" in annot
            and cast("DictionaryObject", annot["/A"])["/S"] == "/GoTo"  # type: ignore[comparison-overlap]
            and "/Dest" not in annot
        )

        if via_goto_action:
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if is_null_or_none(target):
                continue
        elif "/Dest" not in annot:
            kept.append(self._add_object(annot.clone(self)))
            continue
        else:
            target = annot["/Dest"]

        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue

        target = cast("ArrayObject", target)
        cloned_page = self._get_cloned_page(target[0], pages, reader)
        if cloned_page is None:
            continue

        if via_goto_action:
            copy = annot.clone(self, ignore_fields=("/D",))
            cast("DictionaryObject", copy["/A"])[
                NameObject("/D")
            ] = ArrayObject([cloned_page, *target[1:]])
        else:
            copy = annot.clone(self, ignore_fields=("/Dest",))
            copy[NameObject("/Dest")] = ArrayObject([cloned_page, *target[1:]])
        kept.append(self._add_object(copy))
    return kept
def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is kept when its page has a clone in ``pages`` or when at
    least one of its descendants is kept. The kept descendants are stored
    on the item as ``_filtered_children``.

    Args:
        node: Outline root or outline item to start from; ``None`` and
            null objects are treated as an empty dictionary.
        pages: Copied pages, keyed by the ``idnum`` of the source page.
        reader: The source document, used to build the outline items.

    Returns:
        A list of destination objects.

    """
    kept = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # Container without a title of its own: descend into its children.
        first_child = node.get("/First", None)
        if first_child is not None:
            kept += self._get_filtered_outline(
                first_child.get_object(), pages, reader
            )
        return kept

    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_page = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if cloned_page is None:
            cloned_page = NullObject()
        item[NameObject("/Page")] = cloned_page
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in this writer from a destination.

    The title is always copied. If the destination has a page, either the
    ``/A`` action of the source node is cloned or, without such an
    action, ``dest.dest_array`` is stored as ``/Dest``. When a source
    node is available, ``/F`` (default ``0``) and ``/C`` (default three
    zeros) are taken from it.

    Args:
        dest: The destination describing the outline item.

    Returns:
        The new outline item, already added to the writer.

    """
    cloned = TreeObject()
    self._add_object(cloned)
    cloned[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            cloned[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            cloned[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        flags = source_node.get("/F", 0)
        colour = source_node.get(
            "/C", [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        )
        cloned[NameObject("/F")] = NumberObject(flags)
        cloned[NameObject("/C")] = ArrayObject(colour)
    return cloned
def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, with their kept children, under a parent.

    Entries of type ``/Outlines`` or without a ``/Title`` are not inserted
    themselves; their children are attached directly to ``parent``.

    Args:
        outlines: Items to insert; each must carry ``_filtered_children``.
        parent: The outline item the new items are attached to.
        before: Sibling passed to ``insert_child`` for the items of this
            level; children are always inserted with ``None``.

    """
    for item in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = item.get("/Type", "") == "/Outlines" or "/Title" not in item
        if is_container:
            attach_to = parent
        else:
            attach_to = self._clone_outline(item)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(attach_to, before, self)
        self._insert_filtered_outline(item._filtered_children, attach_to, None)
def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None
def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position path.

    Siblings are walked through ``/Next`` and children are searched
    through ``/First``. A node matches when its indirect reference or its
    ``/Title`` equals ``outline_item``.

    Args:
        outline_item: The value compared against each node's indirect
            reference and title.
        root: Node to start from; defaults to the writer's outline root.

    Returns:
        The sibling index at each level leading to the match (levels
        without a ``/Title`` contribute no index), or ``None`` when
        nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    position = 0
    while node is not None:
        is_match = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if is_match:
            return [position]
        if "/First" in node:
            below = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if below:
                prefix = [position] if "/Title" in node else []
                return prefix + below
        if "/Next" not in node:
            return None
        position += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover
def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: If ``reader`` is neither ``None``, a ``PdfReader`` nor
            an ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # A reader that was never translated is silently ignored.
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        raise Exception(f"invalid parameter {reader}")
def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               The default ``0`` stores no explicit start value.

    Raises:
        ValueError: If neither style nor prefix is given, if the page
            range is invalid, or if ``start`` is negative.

    """
    problem: Optional[str] = None
    if style is None and prefix is None:
        problem = "At least one of style and prefix must be given"
    elif page_index_from < 0:
        problem = "page_index_from must be greater or equal than 0"
    elif page_index_to < page_index_from:
        problem = "page_index_to must be greater or equal than page_index_from"
    elif page_index_to >= len(self.pages):
        problem = "page_index_to exceeds number of pages"
    elif start is not None and start != 0 and start < 1:
        problem = "If given, start must be greater or equal than one"
    if problem is not None:
        raise ValueError(problem)

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No argument validation is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) and ``None`` store no ``/St`` entry.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None is allowed by the signature and must not reach NumberObject().
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # First label ever set: start from a tree that labels every page
        # with the default decimal style.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Unless a label already starts right after the range (or the range
    # ends on the last page), restore the default label behind it.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
3319 def _repr_mimebundle_(
3320 self,
3321 include: Union[None, Iterable[str]] = None,
3322 exclude: Union[None, Iterable[str]] = None,
3323 ) -> dict[str, Any]:
3324 """
3325 Integration into Jupyter Notebooks.
3327 This method returns a dictionary that maps a mime-type to its
3328 representation.
3330 .. seealso::
3332 https://ipython.readthedocs.io/en/stable/config/integrating.html
3333 """
3334 pdf_data = BytesIO()
3335 self.write(pdf_data)
3336 data = {
3337 "application/pdf": pdf_data,
3338 }
3340 if include is not None:
3341 # Filter representations based on include list
3342 data = {k: v for k, v in data.items() if k in include}
3344 if exclude is not None:
3345 # Remove representations based on exclude list
3346 data = {k: v for k, v in data.items() if k not in exclude}
3348 return data
def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the corresponding PDF object.

    Args:
        obj: A ``PdfObject`` (returned unchanged), a dictionary, a string
            (a leading ``/`` makes it a name), a number or a list.
            Dictionaries and lists are converted recursively.

    Returns:
        The PDF object representing ``obj``.

    Raises:
        NotImplementedError: If ``obj`` is of any other type.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )
def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: Reference stored as ``/A``; omitted when ``None``.
        title: Text stored as ``/Title``.
        color: Stored as ``/C`` when truthy; a string is first converted
            with ``hex_to_rgb``.
        italic: Adds ``OutlineFontFlag.italic`` to the ``/F`` entry.
        bold: Adds ``OutlineFontFlag.bold`` to the ``/F`` entry.

    Returns:
        The new outline item; ``/F`` is only present when ``italic`` or
        ``bold`` is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(channel) for channel in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        flags = 0
        if italic:
            flags += OutlineFontFlag.italic
        if bold:
            flags += OutlineFontFlag.bold
        item.update({NameObject("/F"): NumberObject(flags)})
    return item