Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import decimal
31import enum
32import hashlib
33import re
34import struct
35import sys
36import uuid
37from collections.abc import Iterable, Mapping
38from io import BytesIO, FileIO, IOBase
39from itertools import compress
40from pathlib import Path
41from re import Pattern
42from types import TracebackType
43from typing import (
44 IO,
45 Any,
46 Callable,
47 Optional,
48 Union,
49 cast,
50)
52if sys.version_info >= (3, 11):
53 from typing import Self
54else:
55 from typing_extensions import Self
57from ._doc_common import DocumentInformation, PdfDocCommon
58from ._encryption import EncryptAlgorithm, Encryption
59from ._page import PageObject, Transformation
60from ._page_labels import nums_clear_range, nums_insert, nums_next
61from ._reader import PdfReader
62from ._utils import (
63 StrByteType,
64 StreamType,
65 _get_max_pdf_version_header,
66 deprecate_with_replacement,
67 deprecation_no_replacement,
68 logger_warning,
69)
70from .constants import AnnotationDictionaryAttributes as AA
71from .constants import CatalogAttributes as CA
72from .constants import (
73 CatalogDictionary,
74 GoToActionArguments,
75 ImageType,
76 InteractiveFormDictEntries,
77 OutlineFontFlag,
78 PageLabelStyle,
79 PagesAttributes,
80 TypFitArguments,
81 UserAccessPermissions,
82)
83from .constants import Core as CO
84from .constants import FieldDictionaryAttributes as FA
85from .constants import PageAttributes as PG
86from .constants import TrailerKeys as TK
87from .errors import LimitReachedError, PdfReadError, PyPdfError
88from .generic import (
89 PAGE_FIT,
90 ArrayObject,
91 BooleanObject,
92 ByteStringObject,
93 ContentStream,
94 Destination,
95 DictionaryObject,
96 EmbeddedFile,
97 Fit,
98 FloatObject,
99 IndirectObject,
100 NameObject,
101 NullObject,
102 NumberObject,
103 PdfObject,
104 RectangleObject,
105 ReferenceLink,
106 StreamObject,
107 TextStringObject,
108 TreeObject,
109 ViewerPreferences,
110 create_string_object,
111 extract_links,
112 hex_to_rgb,
113 is_null_or_none,
114)
115from .generic._appearance_stream import TextStreamAppearance
116from .pagerange import PageRange, PageRangeSpec
117from .types import (
118 AnnotationSubtype,
119 BorderArrayType,
120 LayoutType,
121 OutlineItemType,
122 OutlineType,
123 PagemodeType,
124)
125from .xmp import XmpInformation
# Value returned by UserAccessPermissions.all(); presumably the mask with every
# user-access permission bit set -- confirm against pypdf.constants.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
class ObjectDeletionFlag(enum.IntFlag):
    """
    Combinable bit flags naming categories of PDF content.

    Every member except ``NONE`` and ``IMAGES`` occupies exactly one bit, so
    members can be OR-ed together. ``IMAGES`` is the union of the three
    image-related flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
143def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
144 hash = hashlib.md5(usedforsecurity=False)
145 for block in iter(lambda: stream.read(blocksize), b""):
146 hash.update(block)
147 return hash.hexdigest()
150class PdfWriter(PdfDocCommon):
151 """
152 Write a PDF file out, given pages produced by another class or through
153 cloning a PDF file during initialization.
155 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
157 Args:
158 clone_from: identical to fileobj (for compatibility)
160 incremental: If true, loads the document and set the PdfWriter in incremental mode.
162 When writing incrementally, the original document is written first and new/modified
163 content is appended. To be used for signed document/forms to keep signature valid.
165 full: If true, loads all the objects (always full if incremental = True).
166 This parameter may allow loading large PDFs.
168 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.
169 If false, pypdf will try to be forgiving and do something reasonable, but it will log
170 a warning message. It is a best-effort approach.
172 """
def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
    *,
    incremental_clone_object_count_limit: Optional[int] = 500_000,
    incremental_clone_object_id_limit: Optional[int] = 1_000_000,
) -> None:
    """
    Set up an empty writer, or one pre-filled from an existing document.

    Args:
        fileobj: A reader, path or stream. A reader, an existing non-empty
            file or a non-empty stream is cloned into the writer. The value
            is also kept in ``self.temp_fileobj``.
        clone_from: Document to clone when ``fileobj`` is a path or stream
            (including its default ``""``).
        incremental: Load ``fileobj`` and keep the writer in incremental mode.
        full: Load ``fileobj`` the same way as ``incremental`` does, but
            reset ``self.incremental`` to False once loading is done.
        strict: Stored as ``self.strict``.
        incremental_clone_object_count_limit: Stored as a security limit;
            anything that is not an int (e.g. None) becomes ``sys.maxsize``.
        incremental_clone_object_id_limit: Same treatment, stored as the
            object id limit.

    Raises:
        PyPdfError: ``incremental`` or ``full`` was requested but ``fileobj``
            is neither a path, a ``BytesIO`` nor a ``PdfReader``.
    """
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    # The indirect objects in the PDF. For the incremental case, it will be
    # filled with None in clone_reader_document_root.
    self._objects: list[Optional[PdfObject]] = []

    # Hashes taken after import; used to identify changes.
    self._original_hash: list[int] = []

    # Maps hash values of indirect objects to the list of IndirectObjects.
    # This is used for compression.
    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}

    # Already translated IDs: dict[id(pdf)][(idnum, generation)]
    self._id_translated: dict[int, dict[int, int]] = {}

    # The PDF file's document information dictionary, defined by Info in the
    # PDF file's trailer dictionary.
    self._info_obj: Optional[PdfObject]

    # The PDF file identifier, defined by the ID in the PDF file's trailer
    # dictionary.
    self._ID: Union[ArrayObject, None] = None

    # Links found in pages added to the writer, to be resolved later.
    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    # Pages added to the writer and the page each of them turned into.
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}

    # Security parameters: a non-int limit means "no limit".
    if isinstance(incremental_clone_object_count_limit, int):
        self._incremental_clone_object_count_limit = incremental_clone_object_count_limit
    else:
        self._incremental_clone_object_count_limit = sys.maxsize
    if isinstance(incremental_clone_object_id_limit, int):
        self._incremental_clone_object_id_limit = incremental_clone_object_id_limit
    else:
        self._incremental_clone_object_id_limit = sys.maxsize

    if not self.incremental:
        self._header = b"%PDF-1.3"
        producer_info = DictionaryObject(
            {NameObject("/Producer"): create_string_object("pypdf")}
        )
        self._info_obj = self._add_object(producer_info)
    else:
        # Normalise path -> BytesIO -> PdfReader, one step at a time.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as source_file:
                fileobj = BytesIO(source_file.read())
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        """Pick the document to clone: ``clone_from`` or a usable ``fileobj``."""
        prefers_clone_from = fileobj == "" or clone_from is not None
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and prefers_clone_from:
            return clone_from
        # NOTE(review): with fileobj=None nothing below disables cloning, so
        # None is returned and a supplied clone_from appears to be discarded
        # -- confirm whether that is intended.
        cloning = True
        if isinstance(fileobj, (str, Path)):
            candidate = Path(str(fileobj))
            # A missing or empty file cannot be cloned.
            if not candidate.exists() or candidate.stat().st_size == 0:
                cloning = False
        if isinstance(fileobj, (IOBase, BytesIO)):
            # Measure the stream by seeking to its end, then restore the
            # position it had.
            position = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(position, 0)
        return fileobj if cloning else clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is None:
        # Fresh document: register the page tree root and a catalog
        # pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    else:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True

    if full and not incremental:
        # "full" only borrowed the incremental loading path.
        self.incremental = False

    if isinstance(self._ID, list):
        # Both identifier entries are stored as byte strings.
        for position in (0, 1):
            entry = self._ID[position]
            if isinstance(entry, TextStringObject):
                self._ID[position] = ByteStringObject(entry.get_original_bytes())
# Kept so that the writer exposes the same attribute as the reader.
@property
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    This implementation unconditionally returns ``False``.
    """
    return False
@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the PDF structure held in ``self._root_object``.

    Note:
        Recommended for read access only.
    """
    catalog = self._root_object
    return catalog
347 @property
348 def _info(self) -> Optional[DictionaryObject]:
349 """
350 Provide access to "/Info". Standardized with PdfReader.
352 Returns:
353 /Info Dictionary; None if the entry does not exist
355 """
356 return (
357 None
358 if self._info_obj is None
359 else cast(DictionaryObject, self._info_obj.get_object())
360 )
362 @_info.setter
363 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
364 if value is None:
365 try:
366 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore
367 except (KeyError, AttributeError):
368 pass
369 self._info_obj = None
370 else:
371 if self._info_obj is None:
372 self._info_obj = self._add_object(DictionaryObject())
373 obj = cast(DictionaryObject, self._info_obj.get_object())
374 obj.clear()
375 obj.update(cast(DictionaryObject, value.get_object()))
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, as exposed by the root object."""
    return cast(XmpInformation, self.root_object.xmp_metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """
    Set or remove the XMP (Extensible Metadata Platform) data.

    ``None`` removes the "/Metadata" entry. Otherwise the bytes (or the
    stream data of an ``XmpInformation``) are written into the metadata
    stream, which is created and registered when the catalog does not
    already reference one indirectly.
    """
    catalog = self.root_object
    if value is None:
        if "/Metadata" in catalog:
            del catalog["/Metadata"]
        return

    current = catalog.get("/Metadata", None)
    if isinstance(current, IndirectObject):
        metadata_stream = cast(StreamObject, current.get_object())
    else:
        # A direct (non-indirect) entry is discarded and replaced by a
        # fresh stream owned by this writer.
        if current is not None:
            del catalog["/Metadata"]
        metadata_stream = StreamObject()
        catalog[NameObject("/Metadata")] = self._add_object(metadata_stream)

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    metadata_stream.set_data(payload)
@property
def with_as_usage(self) -> bool:
    """Deprecated accessor for ``self._with_as_usage``; reports the deprecation first."""
    deprecation_no_replacement("with_as_usage", "5.0")
    flag = self._with_as_usage
    return flag

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated mutator for ``self._with_as_usage``; reports the deprecation first."""
    deprecation_no_replacement("with_as_usage", "5.0")
    self._with_as_usage = value
416 def __enter__(self) -> Self:
417 """Store how writer is initialized by 'with'."""
418 c: bool = self._cloned
419 t = self.temp_fileobj
420 self.__init__() # type: ignore
421 self._cloned = c
422 self._with_as_usage = True
423 self.fileobj = t # type: ignore
424 return self
426 def __exit__(
427 self,
428 exc_type: Optional[type[BaseException]],
429 exc: Optional[BaseException],
430 traceback: Optional[TracebackType],
431 ) -> None:
432 """Write data to the fileobj."""
433 if self.fileobj and not self._cloned:
434 self.write(self.fileobj)
@property
def pdf_header(self) -> str:
    """
    Read/write property holding the PDF header that is written.

    The value should look like ``'%PDF-1.5'``. It is recommended to pick
    the lowest version that supports every feature used in the PDF file.

    Note: reading always yields a ``str``; assignment accepts ``str`` or
    ``bytes``.
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding it first when given as ``str``."""
    self._header = new_header.encode() if isinstance(new_header, str) else new_header
455 def _add_object(self, obj: PdfObject) -> IndirectObject:
456 if (
457 getattr(obj, "indirect_reference", None) is not None
458 and obj.indirect_reference.pdf == self # type: ignore
459 ):
460 return obj.indirect_reference # type: ignore
461 # check for /Contents in Pages (/Contents in annotations are strings)
462 if isinstance(obj, DictionaryObject) and isinstance(
463 obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)
464 ):
465 obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
466 self._objects.append(obj)
467 obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
468 return obj.indirect_reference
def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an indirect object of this writer.

    Args:
        indirect_reference: A 1-based object number, or a reference that
            belongs to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: The reference belongs to another document.
        PdfReadError: The slot exists but holds no object.
    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum

    found = self._objects[idnum - 1]
    if found is None:
        raise PdfReadError(f"Object {indirect_reference!r} not found!")
    return found
def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot identified by ``indirect_reference``.

    The generation number of the object previously stored there is kept.
    An object that belongs to another document is cloned into this writer
    before it is stored.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: The reference belongs to another document.
    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore

    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf != self:
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj
def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and hook it into the page tree.

    Args:
        page: The page to add; it must be a ``PageObject`` whose type
            entry marks it as a page.
        index: Position of the new page among the flattened pages.
        excluded_keys: Keys not copied when cloning. The parent entry and
            "/StructParents" are always excluded.

    Returns:
        The clone that now belongs to this writer.

    Raises:
        ValueError: ``page`` is not a valid page object.
        PyPdfError: More than 1000 ancestors were visited while updating
            the page counts.
    """
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"

    source_page = page
    skipped_keys = [*excluded_keys, PagesAttributes.PARENT, "/StructParents"]

    # Acrobat does not accept two indirect references pointing on the same
    # page; to add several copies of one page a new page dictionary is
    # created each time, while the objects below it (including content)
    # are not duplicated. Forgetting an earlier translation of this page
    # forces that new dictionary.
    try:
        source_ref = source_page.indirect_reference
        del self._id_translated[id(source_ref.pdf)][source_ref.idnum]  # type: ignore
    except Exception:
        pass

    new_page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )
    if source_page.pdf is not None:
        other = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

    node, idx = self._get_page_in_node(index)
    new_page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PagesAttributes.KIDS])
    if idx < 0:
        kids.append(new_page.indirect_reference)
        self.flattened_pages.append(new_page)
    else:
        kids.insert(idx, new_page.indirect_reference)
        self.flattened_pages.insert(index, new_page)

    # Walk up the tree and bump the page count of every ancestor; the
    # counter guards against a parent chain that never ends.
    depth = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
        node = node.get(PagesAttributes.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        depth += 1
        if depth > 1000:
            raise PyPdfError("Too many recursive calls!")

    if source_page.pdf is not None:
        # The page may link to other pages which may or may not have been
        # added yet; remember what is needed to resolve those references
        # later.
        self._unresolved_links.extend(extract_links(new_page, source_page))
        self._merged_in_pages[source_page.indirect_reference] = new_page.indirect_reference

    return new_page
def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag in the writer's AcroForm dictionary.

    The flag tells whether the appearance dictionary for form fields
    should be generated by the PDF viewer or whether the embedded
    appearance should be used. The AcroForm dictionary is created when the
    catalog has none. Any failure is logged as a warning instead of being
    raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None
    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # Make sure the AcroForm tree exists.
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[NameObject(InteractiveFormDictEntries.NeedAppearances)] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )
def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ``ViewerPreferences`` object and attach it to the catalog.

    The object is registered with the writer and referenced from the
    catalog's viewer-preferences entry, replacing any previous value.

    Returns:
        The newly created ``ViewerPreferences`` object.
    """
    preferences = ViewerPreferences()
    entry_name = NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    self._root_object[entry_name] = self._add_object(preferences)
    return preferences
def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Passed through unchanged to ``_add_page``.

    Returns:
        The added PageObject.
    """
    existing_pages = self.flattened_pages
    assert existing_pages is not None, "mypy"
    end_position = len(existing_pages)
    return self._add_page(page, end_position, excluded_keys)
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or past the end appends.
        excluded_keys: Passed through unchanged when the page is added.

    Returns:
        The added PageObject.

    Raises:
        ValueError: A negative ``index`` reaches before the first page.
    """
    existing_pages = self.flattened_pages
    assert existing_pages is not None, "mypy"
    page_count = len(existing_pages)

    position = index + page_count if index < 0 else index
    if position < 0:
        raise ValueError("Invalid index value")
    if position >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, position, excluded_keys)
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Find the page number of the page behind ``indirect_reference``.

    Offered so that the writer provides the same function as PdfReader.

    Args:
        indirect_reference: A reference, an object number (taken with
            generation 0 in this writer), or a null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a page.
    """
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"

    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None
def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    The page is built by ``PageObject.create_blank_page``; if no page size
    is specified, the size of the last page is used.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
    """
    blank_page = PageObject.create_blank_page(self, width, height)
    appended = self.add_page(blank_page)
    return appended
def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If no page size is specified for a dimension (None or a non-positive
    value), the size of the page currently at ``index`` is used; an index
    equal to the page count falls back to the last page.

    An existing page is only consulted when a dimension is actually
    missing, so inserting into an empty document works as long as both
    dimensions are given.

    Args:
        width: The width of the new page in default user space units.
        height: The height of the new page in default user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
        IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]
    """
    num_pages = self.get_num_pages()
    if abs(index) > num_pages:
        raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

    width_missing = width is None or width <= 0
    height_missing = height is None or height <= 0
    if width_missing or height_missing:
        if num_pages > 0:
            # Use the chosen index, but do not exceed the available pages
            reference_index = min(index, num_pages - 1)
            mediabox = self.pages[reference_index].mediabox
            if width_missing:
                width = mediabox.width
            if height_missing:
                height = mediabox.height
        else:
            # No page to copy a size from. Pass "undefined" on so that
            # create_blank_page can report the missing size (expected to be
            # PageSizeNotDefinedError -- confirm against its documentation)
            # instead of failing on an index into an empty page list.
            if width_missing:
                width = None
            if height_missing:
                height = None

    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Destination used when the document is opened, as provided by the parent class."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or clear the catalog's "/OpenAction" entry.

    ``None`` removes the entry. A string is stored as a text string, a
    ``Destination`` contributes its destination array, and a page is
    wrapped in an "Opening" destination using ``PAGE_FIT``. Values of any
    other type are ignored.
    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        action = dest.dest_array
    elif isinstance(dest, PageObject):
        page_ref = dest.indirect_reference
        action = Destination(
            "Opening",
            page_ref if page_ref is not None else NullObject(),
            PAGE_FIT,
        ).dest_array
    else:
        return
    self._root_object[NameObject("/OpenAction")] = action
def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is stored as a JavaScript action in the catalog's
    /Names -> /JavaScript name list, under a freshly generated UUID.

    Args:
        javascript: Your JavaScript.

    Example:
        This will launch the print window when the PDF is opened.

        >>> from pypdf import PdfWriter
        >>> output = PdfWriter()
        >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, javascript_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    js_list.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    js_list.append(self._add_object(action))
def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.
    """
    embedded_file = EmbeddedFile._create_new(self, filename, data)
    return embedded_file
def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` into this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader object whose pages are added to this
            writer through ``add_page``.
        after_page_append:
            Optional callback invoked after each page has been appended.
            Its single argument is the page as it was added to this
            writer. A value that is not callable is ignored.
    """
    notify = after_page_append if callable(after_page_append) else None
    for page_number in range(len(reader.pages)):
        appended = self.add_page(reader.pages[page_number])
        if notify is not None:
            notify(appended)
def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add ``new_content_data`` to the content of ``page``.

    * No /Contents entry: a new stream holding the data becomes /Contents.
    * /Contents is an array: a new stream is appended to the array.
    * /Contents is a stream: it is replaced by a new stream holding the
      old data, a newline and the new data.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """

    def make_stream(payload: bytes) -> StreamObject:
        """Build a fresh stream object that carries ``payload``."""
        stream = StreamObject()
        stream.set_data(payload)
        return stream

    contents_key = NameObject("/Contents")
    if contents_key not in page:
        # An empty page: the new data is its only content.
        page[contents_key] = self._add_object(make_stream(new_content_data))
        return

    # Resolve whatever /Contents currently refers to.
    existing_content = page[contents_key].get_object()

    if isinstance(existing_content, ArrayObject):
        appended_stream = make_stream(new_content_data)
        existing_content.append(self._add_object(appended_stream))
        page[contents_key] = self._add_object(existing_content)
    if isinstance(existing_content, StreamObject):
        merged_data = existing_content.get_data() + b"\n" + new_content_data
        page[contents_key] = self._add_object(make_stream(merged_data))
def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
) -> None:
    """
    Draw an appearance stream on the page as an XObject.

    The stream's fonts are merged into the page resources, the stream is
    registered under the name ``/Fm_<object_name>`` (sanitized), and
    commands that translate to the given offset and paint the XObject are
    added to the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
    """
    # Prepare the XObject resource dictionary on the page. Only font
    # resources are merged for now; other resource kinds could be handled
    # the same way.
    page_resources = cast(DictionaryObject, page[PG.RESOURCES])

    if "/Resources" in appearance_stream_obj:
        stream_resources = cast(DictionaryObject, appearance_stream_obj["/Resources"])
        stream_fonts = cast(DictionaryObject, stream_resources.get("/Font", DictionaryObject()))
        if "/Font" not in page_resources:
            page_resources[NameObject("/Font")] = self._add_object(DictionaryObject())
        page_fonts = cast(DictionaryObject, page_resources["/Font"].get_object())
        # Fonts already known to the page win over those of the stream.
        for font_name, font_res in stream_fonts.items():
            if font_name in page_fonts:
                continue
            page_fonts[font_name] = self._add_object(font_res)

    # Always register the stream with this writer so that the reference
    # used below is one managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in page_resources:
        page_resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, page_resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)
# ``FfBits`` value 0; used as the default for the ``flags`` parameter of
# ``update_page_form_field_values`` (a falsy value there leaves /Ff untouched).
FFBITS_NUL = FA.FfBits(0)
def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document has no /AcroForm dictionary, or if that
            dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate is passed as None: the flag was already handled above.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is its own field; otherwise the
        # field data is looked up on the widget's parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Start every matched field without an appearance stream, so that
            # flattening never re-uses the stream produced for a previous
            # annotation/field when this one yields none (e.g. /Sig).
            appearance_stream_obj: Optional[StreamObject] = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in (ap_dict := cast(DictionaryObject, annotation[AA.AP])):
                    ap_dict[NameObject("/N")] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Overwrite the existing normal appearance in place so that
                    # its object number stays the same.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
            if flatten and appearance_stream_obj is not None:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])
def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Parse annotations within the page looking for orphan fields and
    reattach them into the Fields Structure.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    if page is None:
        # No page given: process every page and concatenate the results.
        collected = []
        for single_page in self.pages:
            collected.extend(self.reattach_fields(single_page))
        return collected

    # Fetch /AcroForm and its /Fields array, creating whichever is missing.
    acro_form_key = CatalogDictionary.ACRO_FORM
    try:
        acro_form = cast(DictionaryObject, self._root_object[acro_form_key])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(acro_form_key)] = acro_form
    fields_key = InteractiveFormDictEntries.Fields
    try:
        known_fields = cast(ArrayObject, acro_form[fields_key])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(fields_key)] = known_fields

    reattached = []
    if "/Annots" not in page:
        return reattached

    annots = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annots):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        # Only widgets that carry their own field type are candidates.
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_registered = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_registered:
            continue
        if not was_indirect:
            # Direct annotation: register it as an object of this writer and
            # store the returned reference in the annotation array.
            annots[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached
1145 def _collect_incremental_clone_object_ids(self, reader: PdfReader) -> list[int]:
1146 object_ids: set[int] = set()
1147 for xref_entry in reader.xref.values():
1148 object_ids.update(filter(None, xref_entry))
1149 object_ids.update(filter(None, reader.xref_objStm))
1151 object_count = len(object_ids)
1152 if object_count > self._incremental_clone_object_count_limit:
1153 raise LimitReachedError(
1154 f"Incremental clone object count {object_count} exceeds "
1155 f"maximum allowed count {self._incremental_clone_object_count_limit}."
1156 )
1158 max_object_id = max(object_ids, default=0)
1159 if max_object_id > self._incremental_clone_object_id_limit:
1160 raise LimitReachedError(
1161 f"Incremental clone object ID {max_object_id} exceeds "
1162 f"maximum allowed ID {self._incremental_clone_object_id_limit}."
1163 )
1165 return sorted(object_ids)
def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: In strict mode, if the writer ends up with more objects
            than the reader's trailer ``/Size``; also if flattening the page
            tree fails with an ``IndexError``.

    """
    self._info_obj = None
    if self.incremental:
        # Keep the reader's object numbering: every listed object is
        # replicated into the slot matching its ID.
        object_ids = self._collect_incremental_clone_object_ids(reader)
        self._objects = [None] * (object_ids[-1] if object_ids else 0)
        for object_id in object_ids:
            reader_object = reader.get_object(object_id)
            if reader_object is not None:
                self._objects[object_id - 1] = reader_object.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    object_count = len(self._objects)
    if object_count > cast(int, reader.trailer["/Size"]):
        message = f"Object count {object_count} exceeds defined trailer size {reader.trailer['/Size']}"
        if self.strict:
            raise PdfReadError(message)
        logger_warning(message, __name__)

    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]

    try:
        self._flatten()
    except IndexError as exc:
        # Chain the original error so the failing lookup stays visible.
        raise PdfReadError("Got index error while flattening.") from exc

    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Non-incremental clone: the page tree becomes a single flat /Kids array.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])
def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Create a copy (clone) of a document from a PDF file reader cloning
    section '/Root' and '/Info' and '/ID' of the pdf.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function that is invoked once for every page listed in
            the cloned ``/Pages`` ``/Kids`` array. Its single parameter is
            the page object.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the cloned /Info hash as the unmodified baseline.
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)
    # Without a reader /Info, _info_obj stays None as set by
    # clone_reader_document_root().

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        # presumably the reader has no /ID (e.g. ``_ID`` is None); keep the current value
        pass

    if callable(after_page_append):
        pages_node = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_node["/Kids"]):
            after_page_append(kid.get_object())
def _compute_document_identifier(self) -> ByteStringObject:
    """Serialize the PDF body to memory and return its rolling checksum as a byte string object."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))
def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".
    """
    has_existing_id = bool(self._ID)
    # An existing first identifier is kept; only the second one is recomputed.
    first = self._ID[0] if has_existing_id else None
    second = self._compute_document_identifier()
    if not has_existing_id:
        first = second
    self._ID = ArrayObject((first, second))
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        try:
            # "AES-256-R5" -> EncryptAlgorithm.AES_256_R5
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError as exc:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported") from exc
    else:
        alg = EncryptAlgorithm.RC4_128
        if not use_128bit:
            alg = EncryptAlgorithm.RC4_40
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    # in case call `encrypt` again
    entry = self._encryption.write_entry(user_password, owner_password)
    if self._encrypt_entry:
        # replace old encrypt_entry, re-using its object number
        assert self._encrypt_entry.indirect_reference is not None
        entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry
1354 def _resolve_links(self) -> None:
1355 """Patch up links that were added to the document earlier, to
1356 make sure they still point to the same pages.
1357 """
1358 for (new_link, old_link) in self._unresolved_links:
1359 old_page = old_link.find_referenced_page()
1360 if not old_page:
1361 continue
1362 new_page = self._merged_in_pages.get(old_page)
1363 if new_page is None:
1364 continue
1365 new_link.patch_reference(self, new_page)
def write_stream(self, stream: StreamType) -> None:
    """
    Write the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when objects were added or modified; otherwise
    the body, cross-reference table and trailer are written.

    Args:
        stream: Writable stream; a warning is logged when it exposes a
            ``mode`` attribute that does not contain ``"b"``.
    """
    looks_like_text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if looks_like_text_mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    changed = self.list_objects_in_increment()
    if len(changed) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref
def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method, and
        the stream that was written to. A stream opened by this method is
        already closed when returned; a caller-provided stream is only flushed.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if not isinstance(stream, (str, Path)):
        # Caller-owned stream: write and flush, but never close it.
        self.write_stream(stream)
        stream.flush()
        return False, stream

    stream = FileIO(stream, "wb")
    try:
        self.write_stream(stream)
    finally:
        # The file was opened here, so close it even when writing fails;
        # otherwise the handle would leak on error.
        stream.close()
    return True, stream
def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Provides the list of new or modified objects that will be written
    in the increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline_size = len(self._original_hash)
    changed = []
    for slot, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        # New objects lie beyond the recorded hashes; modified ones differ from them.
        if (
            slot >= baseline_size
            or candidate.hash_bin() != self._original_hash[slot]
        ):
            changed.append(cast(IndirectObject, candidate).indirect_reference)
    return changed
def _write_increment(self, stream: StreamType) -> None:
    """
    Append every new or modified object to ``stream``, followed by a
    cross-reference stream object, ``startxref`` and ``%%EOF``.

    An object counts as new/modified when its slot lies beyond
    ``_original_hash`` or its ``hash_bin()`` differs from the recorded one.
    """
    baseline_size = len(self._original_hash)
    offsets = {}  # object number -> offset of its "N 0 obj" header
    index_runs = []  # [first object number, count] pairs for /Index
    run_first = -1
    run_end = -2
    for slot, item in enumerate(self._objects):
        if item is None:
            continue
        is_changed = (
            slot >= baseline_size
            or item.hash_bin() != self._original_hash[slot]
        )
        if not is_changed:
            continue
        idnum = slot + 1
        assert isinstance(item, PdfObject), "mypy"
        # first write new/modified object
        offsets[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # Encryption is not operational here, so the object is written as-is.
        item.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: extend the current run of consecutive object
        # numbers, or close it and start a new one
        if idnum != run_end:
            if run_first > 0:
                index_runs.append([run_first, run_end - run_first])
            run_first = idnum
        run_end = idnum + 1
    assert run_first > 0, "for pytest only"
    index_runs.append([run_first, run_end - run_first])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(number) for run in index_runs for number in run]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is only listed when it is new or differs from its recorded hash.
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One record per written object, packed as 1-byte value 1, 4-byte offset,
    # 1-byte value 0 — the widths declared in /W.
    records = [struct.pack(b">BIB", 1, offset, 0) for offset in offsets.values()]
    xr.set_data(b"".join(records))
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
1515 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:
1516 object_positions = []
1517 free_objects = []
1518 stream.write(self.pdf_header.encode() + b"\n")
1519 stream.write(b"%\xE2\xE3\xCF\xD3\n")
1521 for idnum, obj in enumerate(self._objects, start=1):
1522 if obj is not None:
1523 object_positions.append(stream.tell())
1524 stream.write(f"{idnum} 0 obj\n".encode())
1525 if self._encryption and obj != self._encrypt_entry:
1526 obj = self._encryption.encrypt_object(obj, idnum, 0)
1527 obj.write_to_stream(stream)
1528 stream.write(b"\nendobj\n")
1529 else:
1530 object_positions.append(-1)
1531 free_objects.append(idnum)
1532 free_objects.append(0) # add 0 to loop in accordance with specification
1533 return object_positions, free_objects
1535 def _write_xref_table(
1536 self, stream: StreamType, object_positions: list[int], free_objects: list[int]
1537 ) -> int:
1538 xref_location = stream.tell()
1539 stream.write(b"xref\n")
1540 stream.write(f"0 {len(self._objects) + 1}\n".encode())
1541 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
1542 free_idx = 1
1543 for offset in object_positions:
1544 if offset > 0:
1545 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
1546 else:
1547 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())
1548 free_idx += 1
1549 return xref_location
def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    The trailer always holds the size and root entries; the info, ID and
    encrypt entries are added only when the writer has them.
    """
    stream.write(b"trailer\n")
    entries = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    info = self._info
    if info is not None:
        entries[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        entries[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        entries[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    entries.write_to_stream(stream)
    footer = f"\nstartxref\n{xref_location}\n%%EOF\n"
    stream.write(footer.encode())  # eof
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    document_information = super().metadata
    return document_information

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # None removes the info dictionary altogether.
    if value is None:
        self._info = None
        return
    # Otherwise existing entries are dropped before the new ones are added.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)
def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is resolved (when it is a PDF object), converted with
    ``str()`` and stored as a string object under its key.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for name, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        converted[NameObject(name)] = create_string_object(str(resolved))

    # Create the info dictionary on first use.
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)
# Sentinel default for the deprecated ``remove_identicals`` / ``remove_orphans``
# parameters of ``compress_identical_objects``; lets that method tell
# "not passed" apart from any real value.
_UNSET = object()
def compress_identical_objects(
    self,
    remove_identicals: Any = _UNSET,
    remove_orphans: Any = _UNSET,
    *,
    remove_duplicates: bool = True,
    remove_unreferenced: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Deprecated.
        remove_orphans: Deprecated.
        remove_duplicates: Remove duplicate objects.
        remove_unreferenced: Remove unreferenced objects.

    """
    # The sentinel is compared by identity, so no user value can be mistaken for it.
    if remove_identicals is not self._UNSET:
        deprecate_with_replacement("remove_identicals", "remove_duplicates", "7.0.0")
        assert isinstance(remove_identicals, bool)
        remove_duplicates = remove_identicals
    if remove_orphans is not self._UNSET:
        deprecate_with_replacement("remove_orphans", "remove_unreferenced", "7.0.0")
        assert isinstance(remove_orphans, bool)
        remove_unreferenced = remove_orphans

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk a dictionary/array, flag every referenced object number and
        # redirect references to removed duplicates to the kept copy.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                unreferenced[v.idnum - 1] = False
                if v in crossref:
                    kept = crossref[v]
                    obj[k] = kept
                    # The kept copy is now referenced from here. Without this
                    # flag it could be deleted as "unreferenced" when only its
                    # duplicates had been referenced so far.
                    unreferenced[kept.idnum - 1] = False
            else:
                """The filtering on DictionaryObject and ArrayObject only
                will be performed within replace_in_obj"""
                replace_in_obj(v, crossref)

    # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
    self._idnum_hash = {}
    unreferenced = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_duplicates and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if remove_unreferenced:
        # The root, /Info and /ID (when it is an indirect object) are always kept.
        unreferenced[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

        if not is_null_or_none(self._info):
            unreferenced[self._info.indirect_reference.idnum - 1] = False  # type: ignore

        try:
            unreferenced[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
        except AttributeError:
            pass

        for i in compress(range(len(self._objects)), unreferenced):
            self._objects[i] = None
def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference (generation 0) to ``obj``, located by its
    position in the writer's object list.
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference
def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the catalog, creating and registering an
    empty one when the catalog has none.

    An existing outline entry that is not a ``TreeObject`` is wrapped in one,
    which then replaces it under the same object number.
    """
    if CO.OUTLINES not in self._root_object:
        fresh_root = TreeObject()
        fresh_root.update({})
        fresh_ref = self._add_object(fresh_root)
        self._root_object[NameObject(CO.OUTLINES)] = fresh_ref
        return fresh_root

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        wrapped = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, wrapped)
        outline = wrapped
    # Sanity check: the outline must be one of the writer's own objects.
    position = self._objects.index(outline)
    outline_ref = IndirectObject(position + 1, 0, self)
    assert outline_ref.get_object() == outline
    return outline
def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    catalog = self._root_object
    if CO.THREADS not in catalog:
        # No thread list yet: store an empty array in the catalog and return it.
        created = ArrayObject()
        catalog[NameObject(CO.THREADS)] = created
        return created
    # Entries in the catalog dictionary
    return cast(ArrayObject, catalog[CO.THREADS])
@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The item to insert. A ``PageObject`` is first wrapped
            in a ``Destination`` titled ``"page #<page number>"`` that uses
            ``Fit.fit()``.
        parent: The outline item to insert under; the outline root when None.
        before: Passed (as its indirect reference) to the parent's
            ``insert_child``; presumably the sibling the new item is placed
            in front of — confirm against ``TreeObject.insert_child``.
        is_open: Stored on the item as ``/%is_open%``; when False, a
            counter function that always returns 0 is passed to
            ``insert_child`` instead of the item's own counter.

    Returns:
        The indirect reference of the inserted outline item.
    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open too: they used to be dropped here, so
        # a page passed with an explicit parent always landed at the root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Copy ``outline_item`` into a new ``TreeObject`` and insert it through
    :meth:`add_outline_item_destination`, forwarding ``parent``, ``before``
    and ``is_open`` unchanged.

    Returns:
        The indirect reference returned by ``add_outline_item_destination``.
    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # Disabled in the original (marked "code currently unreachable"):
    # if "/A" in outline_item:
    #     action = DictionaryObject()
    #     a_dict = cast(DictionaryObject, outline_item["/A"])
    #     for k, v in list(a_dict.items()):
    #         action[NameObject(str(k))] = v
    #     action_ref = self._add_object(action)
    #     outline_item_object[NameObject("/A")] = action_ref
    inserted_ref = self.add_outline_item_destination(
        tree_item, parent, before, is_open
    )
    return inserted_ref
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            May also be a page object or an indirect reference; with None,
            no GoTo action is created.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination`;
            presumably the sibling the new item is inserted in front of —
            confirm against ``TreeObject.insert_child``.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    # Start as None so that an unsupported ``page_number`` type ends in the
    # "can not find reference" warning below instead of an UnboundLocalError.
    page_ref: Union[None, NullObject, IndirectObject, NumberObject] = None
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Page does not exist: keep the plain number as target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_outline(self) -> None:
    """Placeholder that always raises ``NotImplementedError``."""
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destination list.

    The list returned by ``get_named_dest_root`` is read as alternating
    name and destination entries. The new pair is placed in front of the
    first name that compares greater than ``title``; if there is none, the
    pair is appended at the end.

    Args:
        title: Name of the destination.
        destination: Destination stored right after the name.

    """
    names = self.get_named_dest_root()
    # names sit at the even positions, their destinations at the odd ones
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # inserting the destination first leaves the name in front of it
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])
def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    The destination's ``dest_array`` is added to the writer and the
    resulting reference is stored under the destination's ``/Title``.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title`` entry.

    Returns:
        The reference returned by ``_add_object`` for the destination array.

    """
    dest_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_ref)
    return dest_ref
def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a ``/GoTo`` action for a page and register it under ``title``.

    The page reference is taken by index from the ``PagesAttributes.KIDS``
    entry of the object behind ``self._pages``.

    Args:
        title: Name of the destination; wrapped in a ``TextStringObject``
            unless it already is one.
        page_number: Index into the ``PagesAttributes.KIDS`` array.

    Returns:
        The reference returned by ``_add_object`` for the action dictionary.

    """
    kids = self.get_object(self._pages)[PagesAttributes.KIDS]  # type: ignore
    page_ref = kids[page_number]

    action = DictionaryObject()
    action.update(
        {
            NameObject(GoToActionArguments.D): ArrayObject(
                [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
            ),
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(action)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))

    self.add_named_destination_array(title, action_ref)
    return action_ref
def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page is passed to :meth:`remove_objects_from_page` with
    ``ObjectDeletionFlag.ALL_ANNOTATIONS``.
    """
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page,
            ObjectDeletionFlag.ALL_ANNOTATIONS,
        )
def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    # the per-page helper does the filtering; this only walks the pages
    for current_page in self.pages:
        self._remove_annots_from_page(current_page, subtypes)
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete matching entries from a page's annotation array.

    Args:
        page: Page (or a reference to it) whose ``PG.ANNOTS`` array is edited.
        subtypes: ``/Subtype`` values to delete; ``None`` deletes every entry.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    idx = 0
    while idx < len(cast(ArrayObject, page[PG.ANNOTS])):
        annot = cast(ArrayObject, page[PG.ANNOTS])[idx]
        annot_dict = cast(DictionaryObject, annot.get_object())
        if subtypes is not None and cast(str, annot_dict["/Subtype"]) not in subtypes:
            # entry is kept: move on to the next one
            idx += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        # the following entries shift down, so the index must not advance
        del page[PG.ANNOTS][idx]  # type:ignore
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for flag in to_delete:
            # text_filters must travel with every flag; without it a TEXT flag
            # inside a list would delete all text instead of the filtered fonts
            self.remove_objects_from_page(page, flag, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation flags are handled by the annotation helper; the first
    # matching flag wins and the content stream is left untouched.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators that are dropped while cleaning
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = [
            b"w", b"J", b"j", b"M", b"d", b"i",
            b"W", b"W*",
            b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
            b"m", b"l", b"c", b"v", b"y", b"h", b"re",
            b"sh"
        ]
    if to_delete & ObjectDeletionFlag.TEXT:
        # replaces (does not extend) the drawing operators selected above
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = self._remove_objects_from_page__clean_forms(
            elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters,
        )

        self._remove_objects_from_page__clean(
            content=content, images=images, forms=forms,
            jump_operators=jump_operators, to_delete=to_delete,
            text_filters=text_filters
        )
        page.replace_contents(content)
    # historical return value, kept as-is although the annotation says None
    return [], []  # type: ignore[return-value]
def _remove_objects_from_page__clean(
    self,
    content: ContentStream,
    images: list[str],
    forms: list[str],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> None:
    """
    Drop the selected operations from ``content.operations`` in place.

    An operation is a removal candidate when it is an inline image (and
    ``INLINE_IMAGES`` is requested), when its operator is listed in
    ``jump_operators``, or when it is a ``Do`` on one of ``images`` (and
    ``XOBJECT_IMAGES`` is requested). When ``TEXT`` is requested together
    with non-empty ``text_filters``, a candidate is only removed while the
    font selected by the latest ``Tf`` is listed in ``font_ids``.

    Args:
        content: Content stream to edit.
        images: XObject names considered images.
        forms: XObject names considered forms (not read here).
        jump_operators: Operators to drop.
        to_delete: Flags selecting what is removed.
        text_filters: Optional filter dictionary; only ``font_ids`` is read.

    """
    wanted_fonts = []
    if text_filters and to_delete & ObjectDeletionFlag.TEXT:
        wanted_fonts = text_filters.get("font_ids", [])

    current_font = None  # font set by the most recent Tf operator
    pos = 0
    while pos < len(content.operations):
        operands, operator = content.operations[pos]
        if operator == b"Tf":
            current_font = operands[0]

        is_candidate = (
            (
                operator == b"INLINE IMAGE"
                and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
            )
            or (operator in jump_operators)
            or (
                operator == b"Do"
                and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                and (operands[0] in images)
            )
        )
        if not is_candidate:
            pos += 1
            continue

        if (
            not to_delete & ObjectDeletionFlag.TEXT
            or not text_filters
            or current_font in wanted_fonts
        ):
            # the next operation slides into ``pos``: do not advance
            del content.operations[pos]
        else:
            pos += 1
    content.get_data()  # this ensures ._data is rebuilt from the .operations
def _remove_objects_from_page__clean_forms(
    self,
    elt: DictionaryObject,
    stack: list[DictionaryObject],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> tuple[list[str], list[str]]:
    """
    Clean the XObjects referenced by ``elt`` and, for forms, ``elt`` itself.

    Walks ``elt["/Resources"]["/XObject"]``: image XObjects are replaced by
    a ``NullObject`` when ``XOBJECT_IMAGES`` is requested, form XObjects are
    turned into ``ContentStream`` objects and cleaned recursively.

    Args:
        elt: Page or form whose resources are inspected.
        stack: Elements already being processed, used to stop recursion.
        jump_operators: Operators to drop, forwarded to the cleaning helper.
        to_delete: Flags selecting what is removed.
        text_filters: Optional filter dictionary, forwarded unchanged.

    Returns:
        The names of the image and of the form XObjects found in ``elt``.

    """
    # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
    already_seen = (elt in stack) or (
        hasattr(elt, "indirect_reference") and any(
            elt.indirect_reference == getattr(visited, "indirect_reference", -1)
            for visited in stack
        )
    )
    if already_seen:
        # to prevent infinite looping
        return [], []  # pragma: no cover

    try:
        xobjects = cast(
            dict[Any, Any],
            cast(DictionaryObject, elt["/Resources"])["/XObject"],
        )
    except KeyError:
        xobjects = {}

    images = []
    forms = []
    for name, ref in xobjects.items():
        obj = ref.get_object()
        try:
            replacement: Any = None
            if (
                to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                and obj["/Subtype"] == "/Image"
            ):
                replacement = NullObject()  # to delete the image keeping the entry
                images.append(name)
            if obj["/Subtype"] == "/Form":
                forms.append(name)
                if isinstance(obj, ContentStream):
                    replacement = obj
                else:
                    replacement = ContentStream(obj, self)
                    replacement.update(
                        {
                            key: value
                            for key, value in obj.items()
                            if key not in ("/Length", "/Filter", "/DecodeParms")
                        }
                    )
                    try:
                        replacement.indirect_reference = obj.indirect_reference
                    except AttributeError:  # pragma: no cover
                        pass
                stack.append(elt)

                # clean subforms
                self._remove_objects_from_page__clean_forms(
                    elt=replacement,
                    stack=stack,
                    jump_operators=jump_operators,
                    to_delete=to_delete,
                    text_filters=text_filters,
                )
            if replacement is not None:
                if isinstance(ref, IndirectObject):
                    self._objects[ref.idnum - 1] = replacement
                else:
                    # should only occur in a PDF not respecting PDF spec
                    # where streams must be indirected.
                    xobjects[name] = self._add_object(replacement)  # pragma: no cover
        except (TypeError, KeyError):
            # entries without a usable /Subtype are left untouched
            pass

    for image_name in images:
        del xobjects[image_name]  # for clean-up

    if isinstance(elt, StreamObject):  # for /Form
        if not isinstance(elt, ContentStream):  # pragma: no cover
            as_stream = ContentStream(elt, self)
            as_stream.update(elt.items())
            elt = as_stream
        # clean the content
        self._remove_objects_from_page__clean(
            content=elt,
            images=images,
            forms=forms,
            jump_operators=jump_operators,
            to_delete=to_delete,
            text_filters=text_filters,
        )
    return images, forms
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types)

    """
    if isinstance(to_delete, bool):
        # a boolean argument is treated as "all image types"
        to_delete = ImageType.ALL

    # translate the ImageType bits into the equally named deletion flags
    flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            flags |= ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, flags)
def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    if not font_names:
        font_names = []

    # Recursively walk page objects to map normalized font names to the
    # resource keys under which the fonts were found.
    def collect_fonts(
        obj: Any,
        found: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                if normalized not in found:
                    found[normalized] = {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    }
                if key not in found[normalized]["resource_ids"]:
                    found[normalized]["resource_ids"].append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if font_names:
            font_info = collect_fonts(page.get("/Resources"))
            for wanted in font_names:
                if wanted in font_info:
                    resource_ids_to_remove.extend(font_info[wanted]["resource_ids"])

        # an empty filter dictionary means "remove all text"
        text_filters = {"font_ids": resource_ids_to_remove} if font_names else {}
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. No border will be
            drawn if this argument is omitted.

    Raises:
        ValueError: ``rect`` is a string containing a non-numeric coordinate.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # optional fourth entry: the dash pattern array
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # The documented string form is "[ xLL yLL xUR yUR ]"; a single
        # NumberObject cannot hold it, so parse the coordinates instead.
        coordinates = [float(value) for value in rect.strip().strip("[]").split()]
        rect = RectangleObject(coordinates)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    uri_action = DictionaryObject()
    uri_action.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    link = DictionaryObject()
    link.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): uri_action,
        }
    )
    link_ref = self._add_object(link)

    # append to the page's annotation array, creating it when missing
    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(link_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([link_ref])
# Layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)
def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the root object, or ``None`` if it is missing."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; a
    warning is logged first when it is not one of ``_valid_layouts``.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # the separator must be the string ', ' so the message lists the
            # names instead of rendering a Python tuple
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout.

    Thin public wrapper: the value is handed to ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)
@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading delegates to ``_get_page_layout``; assigning delegates to
    ``_set_page_layout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # same code path as set_page_layout()
    self._set_page_layout(layout)
# Mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)
def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the root object, or ``None`` if it is missing."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading delegates to ``_get_page_mode``. Assigning stores the value
    under ``/PageMode`` in the root object; a value that is not a
    ``NameObject`` is converted, with a warning logged first when it is
    not one of ``_valid_modes``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        # unknown names are still written, the warning is informational
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})
def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = to_add.get("/Subtype") == "/Link" and "/Dest" in to_add
    if is_internal_link:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        link_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )  # the dict() copy is kept from the original; its necessity is unclear
        destination = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = destination.dest_array

    page.annotations.append(self._add_object(to_add))

    # a popup is also registered on the annotation it belongs to
    if to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add:
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    Args:
        page: Page, or a reference to it, whose annotations are inspected.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        # no name in /Dest: look at the /D entry of the action instead
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``) that is opened and read, a
            ``PdfReader`` whose stream is copied (its position is restored
            afterwards), or any object offering ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and the reader's ``_encryption`` object; the
        latter is ``None`` unless ``fileobj`` is a ``PdfReader`` with a
        truthy ``_encryption``.

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            copied: IOBase = BytesIO(handle.read())
        return copied, None

    if isinstance(fileobj, PdfReader):
        encryption = None
        if fileobj._encryption:
            encryption = fileobj._encryption
        source = fileobj.stream
        saved_position = source.tell()
        source.seek(0)
        copied = BytesIO(source.read())
        # reset the stream to its original location
        source.seek(saved_position)
        return copied, encryption

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    bookmark = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection in the ``outline_item`` slot: the arguments are
        # shifted one position to the left and no bookmark is created.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        bookmark = None

    self.merge(
        None,
        fileobj,
        bookmark,
        pages,
        import_outline,
        excluded_fields,
    )
def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. ``None`` adds the pages
            with ``add_page`` instead of ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    # --- obtain a reader ------------------------------------------------
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()

    # --- find the range of pages to merge --------------------------------
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # --- clone the pages; srcpages maps source idnum -> cloned page -------
    srcpages = {}
    for page in pages:
        pg = page if isinstance(page, PageObject) else reader.pages[page]
        assert pg.indirect_reference is not None
        # numbers in the exclude list identifies that the exclusion is
        # only applicable to 1st level of cloning
        first_level_exclusions = [*list(excluded_fields), 1, "/B", 1, "/Annots"]
        if position is None:
            cloned = self.add_page(pg, first_level_exclusions)  # type: ignore
        else:
            cloned = self.insert_page(pg, position, first_level_exclusions)  # type: ignore
            position += 1
        srcpages[pg.indirect_reference.idnum] = cloned
        cloned.original_page = pg

    # --- named destinations -----------------------------------------------
    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    for dest in reader._named_destinations.values():
        self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages)

    # --- outline ------------------------------------------------------------
    outline_item_typ: TreeObject
    if outline_item is None:
        outline_item_typ = self.get_outline_root()
    else:
        # the new bookmark points at the first merged page
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    # --- annotations --------------------------------------------------------
    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    # --- interactive form fields ------------------------------------------
    if "/AcroForm" in _ro and not is_null_or_none(_ro["/AcroForm"]):
        if "/AcroForm" not in self._root_object:
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    # --- article threads ----------------------------------------------------
    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)
def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Copy one named destination of ``reader`` into this writer when applicable.

    Nothing is added when the title is already listed under
    ``/Names`` -> ``/Dests`` -> ``/Names`` of the root object, when the
    destination has no page, or when its page is not part of ``srcpages``.

    Args:
        dest: Destination taken from the reader's named destinations.
        reader: Document the destination comes from.
        srcpages: Mapping from source page ``idnum`` to the cloned page.

    """
    arr: Any = dest.dest_array

    root = self._root_object
    if "/Names" in root:
        title = dest["/Title"]
        names_entry = cast(DictionaryObject, root["/Names"])
        dests_entry = cast(
            DictionaryObject,
            names_entry.get("/Dests", DictionaryObject()),
        )
        known_titles = cast(list[Any], dests_entry.get("/Names", DictionaryObject()))
        if title in known_titles:
            # already exists: should not duplicate it
            return

    if dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
        return

    if isinstance(dest["/Page"], int):
        # the page reference is a page number normally not a PDF Reference
        # page numbers as int are normally accepted only in external goto
        try:
            source_page = reader.pages[dest["/Page"]]
        except IndexError:
            return
        assert source_page.indirect_reference is not None
        try:
            arr[NumberObject(0)] = NumberObject(
                srcpages[source_page.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], arr)
        except KeyError:
            # the target page was not merged
            pass
        return

    source_idnum = dest["/Page"].indirect_reference.idnum
    if source_idnum in srcpages:
        # retarget the destination to the cloned page
        arr[NumberObject(0)] = srcpages[
            dest["/Page"].indirect_reference.idnum
        ].indirect_reference
        self.add_named_destination_array(dest["/Title"], arr)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry and appended
    to ``self.threads``. The articles of the source thread are then walked
    through their ``/N`` links, starting at ``thread["/F"]``, until the
    first article is reached again. For every article whose page is found
    by ``_get_cloned_page``, a new article dictionary is created, linked to
    the previous new one (``/V`` / ``/N``) and appended to the ``/B`` array
    of the cloned page.

    Args:
        thread: The thread dictionary of the source document.
        pages: Passed to ``_get_cloned_page`` to look up the cloned page of
            each article.
        reader: Passed to ``_get_cloned_page``.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    # last article created in the writer; None until the first one is created
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        # articles whose page cannot be found are skipped
        if pag is not None:
            if new_article is None:
                # first kept article: it becomes the /F entry of the new thread
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # following article: /V points back to the previous new article,
                # whose /N is set to point to this one
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # back at the start: close the ring (last /N -> first, first /V -> last)
            # NOTE(review): if no article was kept, new_article is still None
            # here and the assignment below fails; the "type: ignore" hides it.
            new_article[NameObject("/N")] = new_first.indirect_reference  # type: ignore
            new_first[NameObject("/V")] = new_article.indirect_reference  # type: ignore
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern (or pattern source) searched in the thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every entry of the ``/B`` array of the original page of each value in
    ``pages`` is inspected. Its thread (``/T``) is added through
    ``_add_articles_thread`` when the thread is not yet recorded in the
    translation table of ``reader`` and ``fltr`` is found in the
    ``/Title`` of the thread's ``/I`` dictionary.

    Args:
        fltr: Compiled pattern, or a string compiled with ``re.compile``.
            Any other value is replaced by the empty pattern.
        pages: The merged pages; ``original_page`` of each value is read.
        reader: The document the pages come from.

    """
    if isinstance(fltr, str):
        title_pattern = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        title_pattern = fltr
    else:
        title_pattern = re.compile("")

    for merged_page in pages.values():
        source_page = merged_page.original_page
        for article_ref in source_page.get("/B", ()):
            article = article_ref.get_object()
            if is_null_or_none(article):
                continue
            thread = article.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # the thread has already been copied
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if title_pattern.search(title):
                self._add_articles_thread(thread, pages, reader)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the page of ``pages`` matching ``page``.

    Args:
        page: A page dictionary (``/Type`` equal to ``/Page``) or an
            indirect reference. Any other value gives ``None``.
        pages: Pages keyed by the object number used for the lookup.
        reader: Not used by this method.

    Returns:
        ``indirect_reference`` of the matching entry of ``pages``, or
        ``None`` when no entry can be found.

    """
    if isinstance(page, NullObject):
        return None

    source_ref: Optional[IndirectObject]
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # None or an unrelated object: nothing to look up. This used to
        # work only because the unbound local was swallowed by a broad
        # "except Exception".
        return None

    if source_ref is None:
        return None
    try:
        return pages[source_ref.idnum].indirect_reference
    except (KeyError, AttributeError):
        # page not part of the mapping, or entry without a reference
        return None
def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations that remain valid in this writer.

    Annotations without any destination are cloned as they are. For
    annotations with a ``/Dest`` entry, and for ``/Link`` annotations with
    a ``/GoTo`` action, the destination is checked first: a named
    destination must be known to ``get_named_dest_root()``, and an explicit
    destination must point to a page found by ``_get_cloned_page``, in
    which case the destination is rewritten to the found page.

    Args:
        annots: The annotations, or an indirect reference to them.
        page: Not used by this method.
        pages: Passed to ``_get_cloned_page``.
        reader: Passed to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` holding the kept annotations; empty if
        ``annots`` is ``None`` or not a list.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for entry in annots:
        annot = cast("DictionaryObject", entry.get_object())
        not_a_goto_action_link = (
            annot["/Subtype"] != "/Link"  # type: ignore[comparison-overlap]
            or "/A" not in annot
            or cast("DictionaryObject", annot["/A"])["/S"] != "/GoTo"  # type: ignore[comparison-overlap]
            or "/Dest" in annot
        )

        if not_a_goto_action_link:
            if "/Dest" not in annot:
                # no destination to check: keep the annotation as it is
                kept.append(self._add_object(annot.clone(self)))
                continue
            target = annot["/Dest"]
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            new_page = self._get_cloned_page(target[0], pages, reader)
            if new_page is not None:
                cloned = annot.clone(self, ignore_fields=("/Dest",))
                cloned[NameObject("/Dest")] = ArrayObject([new_page, *target[1:]])
                kept.append(self._add_object(cloned))
            continue

        # /Link annotation with a /GoTo action: the destination is in /A /D
        target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
        if is_null_or_none(target):
            continue
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        new_page = self._get_cloned_page(target[0], pages, reader)
        if new_page is not None:
            cloned = annot.clone(self, ignore_fields=("/D",))
            cast("DictionaryObject", cloned["/A"])[
                NameObject("/D")
            ] = ArrayObject([new_page, *target[1:]])
            kept.append(self._add_object(cloned))
    return kept
def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is of type ``/Outlines`` or has no ``/Title`` is only a
    container: the result is built from its ``/First`` entry. Otherwise the
    node and its ``/Next`` siblings are converted with
    ``reader._build_outline_item``; an item is kept when its page is found
    by ``_get_cloned_page`` or when at least one of its children is kept.

    Args:
        node: The outline node to start from; ``None`` and null objects
            are handled as an empty dictionary.
        pages: Passed to ``_get_cloned_page``.
        reader: The document the outline comes from.

    Returns:
        A list of destination objects. The kept children of each item are
        stored in its ``_filtered_children`` attribute.

    """
    kept = []
    current = NullObject() if node is None else node
    current = current.get_object()
    if is_null_or_none(current):
        current = DictionaryObject()

    if current.get("/Type", "") == "/Outlines" or "/Title" not in current:
        first_child = current.get("/First", None)
        if first_child is not None:
            kept.extend(
                self._get_filtered_outline(first_child.get_object(), pages, reader)
            )
        return kept

    cloned_ref: Union[None, IndirectObject, NullObject]
    while current is not None:
        current = current.get_object()
        item = cast("Destination", reader._build_outline_item(current))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        # a null page marks an item whose page is not available
        item[NameObject("/Page")] = NullObject() if cloned_ref is None else cloned_ref
        item._filtered_children = (
            self._get_filtered_outline(current["/First"], pages, reader)
            if "/First" in current
            else []
        )
        if not isinstance(item["/Page"], NullObject) or item._filtered_children:
            kept.append(item)
        current = current.get("/Next", None)
    return kept
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create the outline item of this writer corresponding to ``dest``.

    The new item is registered with ``_add_object`` and receives the title
    of ``dest``. If the page of ``dest`` is not a null object, it also gets
    the cloned ``/A`` entry of ``dest.node`` when there is one, or else the
    destination array as ``/Dest``. ``/F`` and ``/C`` are taken from
    ``dest.node`` (defaults: ``0`` and three ``0.0`` values).

    Args:
        dest: The destination to convert.

    Returns:
        The new outline item.

    """
    outline_item = TreeObject()
    self._add_object(outline_item)
    outline_item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    has_page = not isinstance(dest["/Page"], NullObject)
    if has_page and source_node is not None and "/A" in source_node:
        outline_item[NameObject("/A")] = source_node["/A"].clone(self)
    elif has_page:
        outline_item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE

    if source_node is not None:
        outline_item[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        default_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        outline_item[NameObject("/C")] = ArrayObject(
            source_node.get("/C", default_color)
        )
    return outline_item
def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert the given outline items, and their kept children, below ``parent``.

    Args:
        outlines: The items to insert; the ``_filtered_children`` of each
            item are inserted recursively.
        parent: The outline item receiving the new items.
        before: Passed to ``insert_child`` for the items of this level;
            the children are always inserted with ``None``.

    """
    for item in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = item.get("/Type", "") == "/Outlines" or "/Title" not in item
        if is_container:
            # nothing to create: the children go directly below the parent
            children_parent = parent
        else:
            children_parent = self._clone_outline(item)
            parent_obj = cast(TreeObject, parent.get_object())
            parent_obj.insert_child(children_parent, before, self)
        self._insert_filtered_outline(item._filtered_children, children_parent, None)
def close(self) -> None:
    """Do nothing; the method only exists for API harmonization."""
def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Search an outline item and return its position in the outline.

    Args:
        outline_item: The value to find; an entry matches when either its
            ``indirect_reference`` or its ``/Title`` is equal to it.
        root: The entry to start from. Defaults to the outline root.

    Returns:
        The list of indexes leading to the entry (one index per level;
        entries without ``/Title`` add no index for their children), or
        ``None`` if nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    position = 0
    while node is not None:
        if outline_item in (node.indirect_reference, node.get("/Title", None)):
            return [position]
        if "/First" in node:
            below = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if below:
                prefix = [position] if "/Title" in node else []
                return prefix + below
        if "/Next" not in node:
            return None
        position += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover
def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: If ``reader`` is neither ``None``, a ``PdfReader`` nor
            an ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
        return

    # the tables are keyed by the id() of the source document
    if isinstance(reader, PdfReader):
        table_key = id(reader)
    elif isinstance(reader, IndirectObject):
        table_key = id(reader.pdf)
    else:
        # the message was missing the f-prefix and never showed the value
        raise Exception(f"invalid parameter {reader}")

    # a document without a table is silently accepted
    self._id_translated.pop(table_key, None)
def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               ``0`` (the default) and ``None`` are accepted as they are;
               any other value must be greater than or equal to 1.

    Raises:
        ValueError: If neither ``style`` nor ``prefix`` is given, if the
            range is invalid or exceeds the number of pages, or if
            ``start`` is lower than 1 (and not ``0``).

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError("page_index_to must be greater or equal than page_index_from")
    page_count = len(self.pages)
    if page_index_to >= page_count:
        raise ValueError("page_index_to exceeds number of pages")
    start_is_given = start is not None and start != 0
    if start_is_given and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               With ``0`` (the default) or ``None``, no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None is allowed by the signature; it must not reach NumberObject()
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no labels yet: start with the default label from the first page
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # the pages after the range go back to the default label, unless a label
    # already starts right after the range or the range ends the document
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    The document is written to an in-memory buffer, which is returned
    under the ``application/pdf`` mime-type.

    Args:
        include: If given, only the mime-types it contains are returned.
        exclude: If given, the mime-types it contains are removed.

    Returns:
        A dictionary that maps a mime-type to its representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    buffer = BytesIO()
    self.write(buffer)
    bundle: dict[str, Any] = {"application/pdf": buffer}

    if include is not None:
        # Keep only the requested representations
        for mime_type in tuple(bundle):
            if mime_type not in include:
                del bundle[mime_type]

    if exclude is not None:
        # Drop the unwanted representations
        for mime_type in tuple(bundle):
            if mime_type in exclude:
                del bundle[mime_type]

    return bundle
def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a Python value to the corresponding PDF object.

    ``PdfObject`` instances are returned unchanged. Dictionaries and lists
    are converted recursively, strings starting with ``/`` become names,
    other strings become text strings, and numbers become ``FloatObject``.

    Raises:
        NotImplementedError: If the type of ``obj`` is not supported.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )
def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item dictionary.

    Args:
        action_ref: Stored as ``/A`` unless it is ``None``.
        title: Stored as ``/Title``.
        color: Stored as ``/C`` if given; a string is first converted with
            ``hex_to_rgb``.
        italic: Adds ``OutlineFontFlag.italic`` to ``/F``.
        bold: Adds ``OutlineFontFlag.bold`` to ``/F``.

    Returns:
        The new outline item. ``/F`` is only present if ``italic`` or
        ``bold`` is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        requested = ((italic, OutlineFontFlag.italic), (bold, OutlineFontFlag.bold))
        format_flag = sum(flag for enabled, flag in requested if enabled)
        item.update({NameObject("/F"): NumberObject(format_flag)})
    return item