Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 20%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import decimal
31import enum
32import hashlib
33import re
34import struct
35import uuid
36from collections.abc import Iterable, Mapping
37from io import BytesIO, FileIO, IOBase
38from itertools import compress
39from pathlib import Path
40from re import Pattern
41from types import TracebackType
42from typing import (
43 IO,
44 Any,
45 Callable,
46 Optional,
47 Union,
48 cast,
49)
51from ._cmap import _default_fonts_space_width, build_char_map_from_dict
52from ._doc_common import DocumentInformation, PdfDocCommon
53from ._encryption import EncryptAlgorithm, Encryption
54from ._page import PageObject, Transformation
55from ._page_labels import nums_clear_range, nums_insert, nums_next
56from ._reader import PdfReader
57from ._utils import (
58 StrByteType,
59 StreamType,
60 _get_max_pdf_version_header,
61 deprecation_no_replacement,
62 logger_warning,
63)
64from .constants import AnnotationDictionaryAttributes as AA
65from .constants import CatalogAttributes as CA
66from .constants import (
67 CatalogDictionary,
68 GoToActionArguments,
69 ImageType,
70 InteractiveFormDictEntries,
71 OutlineFontFlag,
72 PageLabelStyle,
73 PagesAttributes,
74 TypFitArguments,
75 UserAccessPermissions,
76)
77from .constants import Core as CO
78from .constants import FieldDictionaryAttributes as FA
79from .constants import PageAttributes as PG
80from .constants import TrailerKeys as TK
81from .errors import PdfReadError, PyPdfError
82from .generic import (
83 PAGE_FIT,
84 ArrayObject,
85 BooleanObject,
86 ByteStringObject,
87 ContentStream,
88 DecodedStreamObject,
89 Destination,
90 DictionaryObject,
91 EmbeddedFile,
92 Fit,
93 FloatObject,
94 IndirectObject,
95 NameObject,
96 NullObject,
97 NumberObject,
98 PdfObject,
99 RectangleObject,
100 ReferenceLink,
101 StreamObject,
102 TextStringObject,
103 TreeObject,
104 ViewerPreferences,
105 create_string_object,
106 extract_links,
107 hex_to_rgb,
108 is_null_or_none,
109)
110from .pagerange import PageRange, PageRangeSpec
111from .types import (
112 AnnotationSubtype,
113 BorderArrayType,
114 LayoutType,
115 OutlineItemType,
116 OutlineType,
117 PagemodeType,
118)
119from .xmp import XmpInformation
# Value returned by ``UserAccessPermissions.all()``.
# NOTE(review): presumably the mask with every user-access permission set — confirm.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
# Font height applied to multiline form fields when the font height
# otherwise resolves to 0.
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12
class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags identifying categories of PDF content.

    Each category owns one bit so that several of them can be combined
    with ``|``; ``IMAGES`` is the combination of the three image flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite mask: every image flavour at once.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
138def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
139 hash = hashlib.md5(usedforsecurity=False)
140 for block in iter(lambda: stream.read(blocksize), b""):
141 hash.update(block)
142 return hash.hexdigest()
145class PdfWriter(PdfDocCommon):
146 """
147 Write a PDF file out, given pages produced by another class or through
148 cloning a PDF file during initialization.
150 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
152 Args:
153 clone_from: identical to fileobj (for compatibility)
155 incremental: If true, loads the document and set the PdfWriter in incremental mode.
157 When writing incrementally, the original document is written first and new/modified
158 content is appended. To be used for signed document/forms to keep signature valid.
160 full: If true, loads all the objects (always full if incremental = True).
161 This parameter may allow loading large PDFs.
163 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.
164 If false, pypdf will try to be forgiving and do something reasonable, but it will log
165 a warning message. It is a best-effort approach.
167 """
    def __init__(
        self,
        fileobj: Union[None, PdfReader, StrByteType, Path] = "",
        clone_from: Union[None, PdfReader, StrByteType, Path] = None,
        incremental: bool = False,
        full: bool = False,
        strict: bool = False,
    ) -> None:
        # Initialise the writer state and then either clone an existing
        # document or create an empty catalog with an empty page tree.
        self.strict = strict
        """
        If true, pypdf will raise an exception if a PDF does not follow the specification.
        If false, pypdf will try to be forgiving and do something reasonable, but it will log
        a warning message. It is a best-effort approach.
        """

        # ``full`` temporarily switches the incremental flag on as well; it is
        # reset to False at the end of this method when only ``full`` was
        # requested.
        self.incremental = incremental or full
        """
        Returns if the PdfWriter object has been started in incremental mode.
        """

        self._objects: list[Optional[PdfObject]] = []
        """
        The indirect objects in the PDF.
        For the incremental case, it will be filled with None
        in clone_reader_document_root.
        """

        self._original_hash: list[int] = []
        """
        List of hashes after import; used to identify changes.
        """

        self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
        """
        Maps hash values of indirect objects to the list of IndirectObjects.
        This is used for compression.
        """

        self._id_translated: dict[int, dict[int, int]] = {}
        """List of already translated IDs.
        dict[id(pdf)][(idnum, generation)]
        """

        # Only declared here; it is assigned in the non-incremental branch
        # below.
        # NOTE(review): nothing in this method assigns it on the incremental
        # path — presumably clone_document_from_reader does; confirm.
        self._info_obj: Optional[PdfObject]
        """The PDF files's document information dictionary,
        the Info entry in the PDF file's trailer dictionary."""

        self._ID: Union[ArrayObject, None] = None
        """The PDF file identifier,
        defined by the ID in the PDF file's trailer dictionary."""

        self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
        "Tracks links in pages added to the writer for resolving later."
        self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
        "Tracks pages added to the writer and what page they turned into."

        if self.incremental:
            # Normalise the source step by step: path -> in-memory bytes ->
            # PdfReader. Anything that is not a PdfReader afterwards is
            # rejected.
            # NOTE(review): ``clone_from`` is not consulted in this branch, so
            # ``incremental``/``full`` with only ``clone_from`` given leaves
            # ``fileobj`` at its default "" and reaches ``open("")`` — confirm
            # this is intended.
            if isinstance(fileobj, (str, Path)):
                with open(fileobj, "rb") as f:
                    fileobj = BytesIO(f.read(-1))
            if isinstance(fileobj, BytesIO):
                fileobj = PdfReader(fileobj)
            if not isinstance(fileobj, PdfReader):
                raise PyPdfError("Invalid type for incremental mode")
            self._reader = fileobj  # prev content is in _reader.stream
            self._header = fileobj.pdf_header.encode()
            self._readonly = True  # TODO: to be analysed
        else:
            # Fresh document: default header and an /Info dictionary that
            # only names the producer.
            self._header = b"%PDF-1.3"
            self._info_obj = self._add_object(
                DictionaryObject(
                    {NameObject("/Producer"): create_string_object("pypdf")}
                )
            )

        def _get_clone_from(
            fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
            clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
            # Decide which source (if any) the writer is cloned from.
            # For path/stream-like ``fileobj`` values, an empty string or an
            # explicitly supplied ``clone_from`` means ``clone_from`` is
            # returned unchanged (possibly None).
            if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
                fileobj == "" or clone_from is not None
            ):
                return clone_from
            cloning = True
            # A path that does not exist or points to an empty file is not
            # cloned from.
            if isinstance(fileobj, (str, Path)) and (
                not Path(str(fileobj)).exists()
                or Path(str(fileobj)).stat().st_size == 0
            ):
                cloning = False
            # Same for an empty stream: seek to the end to measure it, then
            # restore the caller's position.
            if isinstance(fileobj, (IOBase, BytesIO)):
                t = fileobj.tell()
                if fileobj.seek(0, 2) == 0:
                    cloning = False
                fileobj.seek(t, 0)
            if cloning:
                clone_from = fileobj
            return clone_from

        clone_from = _get_clone_from(fileobj, clone_from)
        # To prevent overwriting
        self.temp_fileobj = fileobj
        self.fileobj = ""
        self._with_as_usage = False
        self._cloned = False
        # The root of our page tree node
        pages = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
                NameObject(PagesAttributes.COUNT): NumberObject(0),
                NameObject(PagesAttributes.KIDS): ArrayObject(),
            }
        )
        self.flattened_pages = []
        self._encryption: Optional[Encryption] = None
        self._encrypt_entry: Optional[DictionaryObject] = None

        if clone_from is not None:
            # Anything that is not already a reader is wrapped in one first.
            if not isinstance(clone_from, PdfReader):
                clone_from = PdfReader(clone_from)
            self.clone_document_from_reader(clone_from)
            self._cloned = True
        else:
            # No source: register the empty page tree and a catalog that
            # points to it.
            self._pages = self._add_object(pages)
            self._root_object = DictionaryObject(
                {
                    NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                    NameObject(CO.PAGES): self._pages,
                }
            )
            self._add_object(self._root_object)
        # ``full`` alone only borrowed the incremental flag during loading.
        if full and not incremental:
            self.incremental = False
        # Store both ID entries as byte strings when they are text strings.
        if isinstance(self._ID, list):
            if isinstance(self._ID[0], TextStringObject):
                self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
            if isinstance(self._ID[1], TextStringObject):
                self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())
307 # for commonality
308 @property
309 def is_encrypted(self) -> bool:
310 """
311 Read-only boolean property showing whether this PDF file is encrypted.
313 Note that this property, if true, will remain true even after the
314 :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
315 """
316 return False
318 @property
319 def root_object(self) -> DictionaryObject:
320 """
321 Provide direct access to PDF Structure.
323 Note:
324 Recommended only for read access.
326 """
327 return self._root_object
329 @property
330 def _info(self) -> Optional[DictionaryObject]:
331 """
332 Provide access to "/Info". Standardized with PdfReader.
334 Returns:
335 /Info Dictionary; None if the entry does not exist
337 """
338 return (
339 None
340 if self._info_obj is None
341 else cast(DictionaryObject, self._info_obj.get_object())
342 )
344 @_info.setter
345 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
346 if value is None:
347 try:
348 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore
349 except (KeyError, AttributeError):
350 pass
351 self._info_obj = None
352 else:
353 if self._info_obj is None:
354 self._info_obj = self._add_object(DictionaryObject())
355 obj = cast(DictionaryObject, self._info_obj.get_object())
356 obj.clear()
357 obj.update(cast(DictionaryObject, value.get_object()))
359 @property
360 def xmp_metadata(self) -> Optional[XmpInformation]:
361 """XMP (Extensible Metadata Platform) data."""
362 return cast(XmpInformation, self.root_object.xmp_metadata)
364 @xmp_metadata.setter
365 def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
366 """XMP (Extensible Metadata Platform) data."""
367 if value is None:
368 if "/Metadata" in self.root_object:
369 del self.root_object["/Metadata"]
370 return
372 metadata = self.root_object.get("/Metadata", None)
373 if not isinstance(metadata, IndirectObject):
374 if metadata is not None:
375 del self.root_object["/Metadata"]
376 metadata_stream = StreamObject()
377 stream_reference = self._add_object(metadata_stream)
378 self.root_object[NameObject("/Metadata")] = stream_reference
379 else:
380 metadata_stream = cast(StreamObject, metadata.get_object())
382 if isinstance(value, XmpInformation):
383 bytes_data = value.stream.get_data()
384 else:
385 bytes_data = value
386 metadata_stream.set_data(bytes_data)
388 @property
389 def with_as_usage(self) -> bool:
390 deprecation_no_replacement("with_as_usage", "5.0")
391 return self._with_as_usage
393 @with_as_usage.setter
394 def with_as_usage(self, value: bool) -> None:
395 deprecation_no_replacement("with_as_usage", "5.0")
396 self._with_as_usage = value
398 def __enter__(self) -> "PdfWriter":
399 """Store how writer is initialized by 'with'."""
400 c: bool = self._cloned
401 t = self.temp_fileobj
402 self.__init__() # type: ignore
403 self._cloned = c
404 self._with_as_usage = True
405 self.fileobj = t # type: ignore
406 return self
408 def __exit__(
409 self,
410 exc_type: Optional[type[BaseException]],
411 exc: Optional[BaseException],
412 traceback: Optional[TracebackType],
413 ) -> None:
414 """Write data to the fileobj."""
415 if self.fileobj and not self._cloned:
416 self.write(self.fileobj)
418 @property
419 def pdf_header(self) -> str:
420 """
421 Read/Write property of the PDF header that is written.
423 This should be something like ``'%PDF-1.5'``. It is recommended to set
424 the lowest version that supports all features which are used within the
425 PDF file.
427 Note: `pdf_header` returns a string but accepts bytes or str for writing
428 """
429 return self._header.decode()
431 @pdf_header.setter
432 def pdf_header(self, new_header: Union[str, bytes]) -> None:
433 if isinstance(new_header, str):
434 new_header = new_header.encode()
435 self._header = new_header
437 def _add_object(self, obj: PdfObject) -> IndirectObject:
438 if (
439 getattr(obj, "indirect_reference", None) is not None
440 and obj.indirect_reference.pdf == self # type: ignore
441 ):
442 return obj.indirect_reference # type: ignore
443 # check for /Contents in Pages (/Contents in annotations are strings)
444 if isinstance(obj, DictionaryObject) and isinstance(
445 obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)
446 ):
447 obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
448 self._objects.append(obj)
449 obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
450 return obj.indirect_reference
452 def get_object(
453 self,
454 indirect_reference: Union[int, IndirectObject],
455 ) -> PdfObject:
456 if isinstance(indirect_reference, int):
457 obj = self._objects[indirect_reference - 1]
458 elif indirect_reference.pdf != self:
459 raise ValueError("PDF must be self")
460 else:
461 obj = self._objects[indirect_reference.idnum - 1]
462 assert obj is not None, "mypy"
463 return obj
465 def _replace_object(
466 self,
467 indirect_reference: Union[int, IndirectObject],
468 obj: PdfObject,
469 ) -> PdfObject:
470 if isinstance(indirect_reference, IndirectObject):
471 if indirect_reference.pdf != self:
472 raise ValueError("PDF must be self")
473 indirect_reference = indirect_reference.idnum
474 gen = self._objects[indirect_reference - 1].indirect_reference.generation # type: ignore
475 if (
476 getattr(obj, "indirect_reference", None) is not None
477 and obj.indirect_reference.pdf != self # type: ignore
478 ):
479 obj = obj.clone(self)
480 self._objects[indirect_reference - 1] = obj
481 obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
483 assert isinstance(obj, PdfObject), "mypy"
484 return obj
486 def _add_page(
487 self,
488 page: PageObject,
489 index: int,
490 excluded_keys: Iterable[str] = (),
491 ) -> PageObject:
492 if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
493 raise ValueError("Invalid page object")
494 assert self.flattened_pages is not None, "for mypy"
495 page_org = page
496 excluded_keys = list(excluded_keys)
497 excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
498 # Acrobat does not accept two indirect references pointing on the same
499 # page; therefore in order to add multiple copies of the same
500 # page, we need to create a new dictionary for the page, however the
501 # objects below (including content) are not duplicated:
502 try: # delete an already existing page
503 del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore
504 page_org.indirect_reference.idnum # type: ignore
505 ]
506 except Exception:
507 pass
509 page = cast(
510 "PageObject", page_org.clone(self, False, excluded_keys).get_object()
511 )
512 if page_org.pdf is not None:
513 other = page_org.pdf.pdf_header
514 self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
516 node, idx = self._get_page_in_node(index)
517 page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference
519 if idx >= 0:
520 cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
521 self.flattened_pages.insert(index, page)
522 else:
523 cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
524 self.flattened_pages.append(page)
525 recurse = 0
526 while not is_null_or_none(node):
527 node = cast(DictionaryObject, node.get_object())
528 node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
529 node = node.get(PagesAttributes.PARENT, None) # type: ignore[assignment] # TODO: Fix.
530 recurse += 1
531 if recurse > 1000:
532 raise PyPdfError("Too many recursive calls!")
534 if page_org.pdf is not None:
535 # the page may contain links to other pages, and those other
536 # pages may or may not already be added. we store the
537 # information we need, so that we can resolve the references
538 # later.
539 self._unresolved_links.extend(extract_links(page, page_org))
540 self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
542 return page
544 def set_need_appearances_writer(self, state: bool = True) -> None:
545 """
546 Sets the "NeedAppearances" flag in the PDF writer.
548 The "NeedAppearances" flag indicates whether the appearance dictionary
549 for form fields should be automatically generated by the PDF viewer or
550 if the embedded appearance should be used.
552 Args:
553 state: The actual value of the NeedAppearances flag.
555 Returns:
556 None
558 """
559 # See §12.7.2 and §7.7.2 for more information:
560 # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
561 try:
562 # get the AcroForm tree
563 if CatalogDictionary.ACRO_FORM not in self._root_object:
564 self._root_object[
565 NameObject(CatalogDictionary.ACRO_FORM)
566 ] = self._add_object(DictionaryObject())
568 need_appearances = NameObject(InteractiveFormDictEntries.NeedAppearances)
569 cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])[
570 need_appearances
571 ] = BooleanObject(state)
572 except Exception as exc: # pragma: no cover
573 logger_warning(
574 f"set_need_appearances_writer({state}) catch : {exc}", __name__
575 )
577 def create_viewer_preferences(self) -> ViewerPreferences:
578 o = ViewerPreferences()
579 self._root_object[
580 NameObject(CatalogDictionary.VIEWER_PREFERENCES)
581 ] = self._add_object(o)
582 return o
584 def add_page(
585 self,
586 page: PageObject,
587 excluded_keys: Iterable[str] = (),
588 ) -> PageObject:
589 """
590 Add a page to this PDF file.
592 Recommended for advanced usage including the adequate excluded_keys.
594 The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
595 instance.
597 Args:
598 page: The page to add to the document. Should be
599 an instance of :class:`PageObject<pypdf._page.PageObject>`
600 excluded_keys:
602 Returns:
603 The added PageObject.
605 """
606 assert self.flattened_pages is not None, "mypy"
607 return self._add_page(page, len(self.flattened_pages), excluded_keys)
609 def insert_page(
610 self,
611 page: PageObject,
612 index: int = 0,
613 excluded_keys: Iterable[str] = (),
614 ) -> PageObject:
615 """
616 Insert a page in this PDF file. The page is usually acquired from a
617 :class:`PdfReader<pypdf.PdfReader>` instance.
619 Args:
620 page: The page to add to the document.
621 index: Position at which the page will be inserted.
622 excluded_keys:
624 Returns:
625 The added PageObject.
627 """
628 assert self.flattened_pages is not None, "mypy"
629 if index < 0:
630 index = len(self.flattened_pages) + index
631 if index < 0:
632 raise ValueError("Invalid index value")
633 if index >= len(self.flattened_pages):
634 return self.add_page(page, excluded_keys)
635 return self._add_page(page, index, excluded_keys)
637 def _get_page_number_by_indirect(
638 self, indirect_reference: Union[None, int, NullObject, IndirectObject]
639 ) -> Optional[int]:
640 """
641 Generate _page_id2num.
643 Args:
644 indirect_reference:
646 Returns:
647 The page number or None
649 """
650 # To provide same function as in PdfReader
651 if is_null_or_none(indirect_reference):
652 return None
653 assert indirect_reference is not None, "mypy"
654 if isinstance(indirect_reference, int):
655 indirect_reference = IndirectObject(indirect_reference, 0, self)
656 obj = indirect_reference.get_object()
657 if isinstance(obj, PageObject):
658 return obj.page_number
659 return None
661 def add_blank_page(
662 self, width: Optional[float] = None, height: Optional[float] = None
663 ) -> PageObject:
664 """
665 Append a blank page to this PDF file and return it.
667 If no page size is specified, use the size of the last page.
669 Args:
670 width: The width of the new page expressed in default user
671 space units.
672 height: The height of the new page expressed in default
673 user space units.
675 Returns:
676 The newly appended page.
678 Raises:
679 PageSizeNotDefinedError: if width and height are not defined
680 and previous page does not exist.
682 """
683 page = PageObject.create_blank_page(self, width, height)
684 return self.add_page(page)
686 def insert_blank_page(
687 self,
688 width: Optional[Union[float, decimal.Decimal]] = None,
689 height: Optional[Union[float, decimal.Decimal]] = None,
690 index: int = 0,
691 ) -> PageObject:
692 """
693 Insert a blank page to this PDF file and return it.
695 If no page size is specified, use the size of the last page.
697 Args:
698 width: The width of the new page expressed in default user
699 space units.
700 height: The height of the new page expressed in default
701 user space units.
702 index: Position to add the page.
704 Returns:
705 The newly inserted page.
707 Raises:
708 PageSizeNotDefinedError: if width and height are not defined
709 and previous page does not exist.
711 """
712 if width is None or (height is None and index < self.get_num_pages()):
713 oldpage = self.pages[index]
714 width = oldpage.mediabox.width
715 height = oldpage.mediabox.height
716 page = PageObject.create_blank_page(self, width, height)
717 self.insert_page(page, index)
718 return page
720 @property
721 def open_destination(
722 self,
723 ) -> Union[None, Destination, TextStringObject, ByteStringObject]:
724 return super().open_destination
726 @open_destination.setter
727 def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
728 if dest is None:
729 try:
730 del self._root_object["/OpenAction"]
731 except KeyError:
732 pass
733 elif isinstance(dest, str):
734 self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)
735 elif isinstance(dest, Destination):
736 self._root_object[NameObject("/OpenAction")] = dest.dest_array
737 elif isinstance(dest, PageObject):
738 self._root_object[NameObject("/OpenAction")] = Destination(
739 "Opening",
740 dest.indirect_reference
741 if dest.indirect_reference is not None
742 else NullObject(),
743 PAGE_FIT,
744 ).dest_array
746 def add_js(self, javascript: str) -> None:
747 """
748 Add JavaScript which will launch upon opening this PDF.
750 Args:
751 javascript: Your JavaScript.
753 >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
754 # Example: This will launch the print window when the PDF is opened.
756 """
757 # Names / JavaScript preferred to be able to add multiple scripts
758 if "/Names" not in self._root_object:
759 self._root_object[NameObject(CA.NAMES)] = DictionaryObject()
760 names = cast(DictionaryObject, self._root_object[CA.NAMES])
761 if "/JavaScript" not in names:
762 names[NameObject("/JavaScript")] = DictionaryObject(
763 {NameObject("/Names"): ArrayObject()}
764 )
765 js_list = cast(
766 ArrayObject, cast(DictionaryObject, names["/JavaScript"])["/Names"]
767 )
768 # We need a name for parameterized JavaScript in the PDF file,
769 # but it can be anything.
770 js_list.append(create_string_object(str(uuid.uuid4())))
772 js = DictionaryObject(
773 {
774 NameObject(PagesAttributes.TYPE): NameObject("/Action"),
775 NameObject("/S"): NameObject("/JavaScript"),
776 NameObject("/JS"): TextStringObject(f"{javascript}"),
777 }
778 )
779 js_list.append(self._add_object(js))
781 def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
782 """
783 Embed a file inside the PDF.
785 Reference:
786 https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
787 Section 7.11.3
789 Args:
790 filename: The filename to display.
791 data: The data in the file.
793 Returns:
794 EmbeddedFile instance for the newly created embedded file.
796 """
797 return EmbeddedFile._create_new(self, filename, data)
799 def append_pages_from_reader(
800 self,
801 reader: PdfReader,
802 after_page_append: Optional[Callable[[PageObject], None]] = None,
803 ) -> None:
804 """
805 Copy pages from reader to writer. Includes an optional callback
806 parameter which is invoked after pages are appended to the writer.
808 ``append`` should be preferred.
810 Args:
811 reader: a PdfReader object from which to copy page
812 annotations to this writer object. The writer's annots
813 will then be updated.
814 after_page_append:
815 Callback function that is invoked after each page is appended to
816 the writer. Signature includes a reference to the appended page
817 (delegates to append_pages_from_reader). The single parameter of
818 the callback is a reference to the page just appended to the
819 document.
821 """
822 reader_num_pages = len(reader.pages)
823 # Copy pages from reader to writer
824 for reader_page_number in range(reader_num_pages):
825 reader_page = reader.pages[reader_page_number]
826 writer_page = self.add_page(reader_page)
827 # Trigger callback, pass writer page as parameter
828 if callable(after_page_append):
829 after_page_append(writer_page)
831 def _merge_content_stream_to_page(
832 self,
833 page: PageObject,
834 new_content_data: bytes,
835 ) -> None:
836 """
837 Combines existing content stream(s) with new content (as bytes).
839 Args:
840 page: The page to which the new content data will be added.
841 new_content_data: A binary-encoded new content stream, for
842 instance the commands to draw an XObject.
843 """
844 # First resolve the existing page content. This always is an IndirectObject:
845 # PDF Explained by John Whitington
846 # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
847 if NameObject("/Contents") in page:
848 existing_content_ref = page[NameObject("/Contents")]
849 existing_content = existing_content_ref.get_object()
851 if isinstance(existing_content, ArrayObject):
852 # Create a new StreamObject for the new_content_data
853 new_stream_obj = StreamObject()
854 new_stream_obj.set_data(new_content_data)
855 existing_content.append(self._add_object(new_stream_obj))
856 page[NameObject("/Contents")] = self._add_object(existing_content)
857 if isinstance(existing_content, StreamObject):
858 # Merge new content to existing StreamObject
859 merged_data = existing_content.get_data() + b"\n" + new_content_data
860 new_stream = StreamObject()
861 new_stream.set_data(merged_data)
862 page[NameObject("/Contents")] = self._add_object(new_stream)
863 else:
864 # If no existing content, then we have an empty page.
865 # Create a new StreamObject in a new /Contents entry.
866 new_stream = StreamObject()
867 new_stream.set_data(new_content_data)
868 page[NameObject("/Contents")] = self._add_object(new_stream)
870 def _add_apstream_object(
871 self,
872 page: PageObject,
873 appearance_stream_obj: StreamObject,
874 object_name: str,
875 x_offset: float,
876 y_offset: float,
877 font_res: Optional[DictionaryObject] = None
878 ) -> None:
879 """
880 Adds an appearance stream to the page content in the form of
881 an XObject.
883 Args:
884 page: The page to which to add the appearance stream.
885 appearance_stream_obj: The appearance stream.
886 object_name: The name of the appearance stream.
887 x_offset: The horizontal offset for the appearance stream.
888 y_offset: The vertical offset for the appearance stream.
889 font_res: The appearance stream's font resource (if given).
890 """
891 # Prepare XObject resource dictionary on the page
892 pg_res = cast(DictionaryObject, page[PG.RESOURCES])
893 if font_res is not None:
894 font_name = font_res["/BaseFont"] # [/"Name"] often also exists, but is deprecated
895 if "/Font" not in pg_res:
896 pg_res[NameObject("/Font")] = DictionaryObject()
897 pg_ft_res = cast(DictionaryObject, pg_res[NameObject("/Font")])
898 if font_name not in pg_ft_res:
899 pg_ft_res[NameObject(font_name)] = font_res
900 # Always add the resolved stream object to the writer to get a new IndirectObject.
901 # This ensures we have a valid IndirectObject managed by *this* writer.
902 xobject_ref = self._add_object(appearance_stream_obj)
903 xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()
904 if "/XObject" not in pg_res:
905 pg_res[NameObject("/XObject")] = DictionaryObject()
906 pg_xo_res = cast(DictionaryObject, pg_res["/XObject"])
907 if xobject_name not in pg_xo_res:
908 pg_xo_res[xobject_name] = xobject_ref
909 else:
910 logger_warning(
911 f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
912 __name__
913 )
914 xobject_cm = Transformation().translate(x_offset, y_offset)
915 xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
916 self._merge_content_stream_to_page(page, xobject_drawing_commands)
def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Regenerate the normal appearance stream (/AP /N) of a widget annotation.

    Args:
        page: Page owning the widget; only used when ``flatten`` is True.
        field: Field dictionary from which /Ff, /FT and /V are read.
        annotation: Widget annotation whose appearance is rebuilt.
        font_name: Font resource name replacing the one found in /DA.
            An empty string keeps the font named in /DA.
        font_size: Font size replacing the one found in /DA. A negative
            value keeps the /DA size; 0 selects an automatic size.
        flatten: If True, the generated appearance is also handed to
            ``_add_apstream_object`` for the given page.

    Raises:
        ValueError: If the default appearance string has no ``Tf`` operator.

    """
    # Calculate rectangle dimensions
    _rct = cast(RectangleObject, annotation[AA.Rect])
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    acro_form = cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM])

    # Extract font information
    da = annotation.get_inherited(AA.DA, acro_form.get(AA.DA, None))
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    # The two operands preceding "Tf" are the font name and the font size.
    tf_index = font_properties.index("Tf")
    if font_name:
        font_properties[tf_index - 2] = font_name
    else:
        font_name = font_properties[tf_index - 2]
    font_height = (
        font_size if font_size >= 0 else float(font_properties[tf_index - 1])
    )
    if font_height == 0:
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    font_properties[tf_index - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR", acro_form.get("/DR", DictionaryObject())
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary.
        # The default must be a DictionaryObject: a plain dict has no
        # get_object() and would raise when the AcroForm has no /DR.
        dr = cast(DictionaryObject, acro_form.get("/DR", DictionaryObject()))
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    font_available = not is_null_or_none(font_res)
    if font_available:
        font_res = cast(DictionaryObject, font_res.get_object())
        _font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        font_map.pop(-1, None)  # remove width stored in -1 key
        font_full_rev: dict[str, bytes]
        if isinstance(font_encoding, str):
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over entries of the previous appearance, except those that
        # describe the stream that is being replaced.
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary.
    # Same null-aware test as above, so an explicit null font entry is not
    # written into the appearance resources.
    if font_available:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    else:
        ap = cast(DictionaryObject, annotation[AA.AP])
        if "/N" not in ap:
            ap[NameObject("/N")] = self._add_object(dct)
        else:  # [/AP][/N] exists
            previous_ref = getattr(ap["/N"], "indirect_reference", None)
            if previous_ref is None:
                # /N is stored directly in /AP: there is no object slot to
                # reuse, so register the new stream as a new object.
                ap[NameObject("/N")] = self._add_object(dct)
            else:
                # Reuse the object number of the previous appearance.
                n = previous_ref.idnum
                self._objects[n - 1] = dct
                dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)
# Empty flag set; default of ``flags`` in update_page_form_field_values.
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document has no /AcroForm dictionary or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate was already applied above, hence None here.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is its own field dictionary;
        # otherwise the field data is looked up on the parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Store the text on the field dictionary, like the other
                    # value kinds: the appearance generated below reads /V
                    # from parent_annotation, not from the widget.
                    parent_annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            field_type = parent_annotation.get(FA.FT)
            if field_type == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif field_type in ("/Tx", "/Ch"):
                # textbox
                if isinstance(value, tuple):
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif field_type == "/Sig":  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Register widget annotations that are missing from the /AcroForm /Fields array.

    The /AcroForm dictionary and its /Fields array are created when absent.

    Args:
        page: Page whose annotations are inspected.
            When omitted, every page of the writer is inspected.

    Returns:
        The field dictionaries that were appended to /Fields.

    """
    reattached = []
    if page is None:
        for current_page in self.pages:
            reattached += self.reattach_fields(current_page)
        return reattached

    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        known_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = known_fields

    if "/Annots" not in page:
        return reattached
    page_annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(page_annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        # Only widgets that carry their own field type are candidates.
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # Direct annotations are registered as objects first and the
            # page entry is replaced by the resulting reference.
            page_annotations[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached
def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader's document root, with all its sub-elements, into the writer.

    This covers pages, threads, outlines,... For partial insertion,
    ``append`` should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: In strict mode when more objects exist than the
            trailer /Size announces, or when flattening the page tree
            raises an IndexError.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        # Mirror the reader's object numbering: slot ``idnum - 1`` holds
        # the replica of object ``idnum``.
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for idnum in range(1, len(self._objects) + 1):
            source = reader.get_object(idnum)
            if source is not None:
                self._objects[idnum - 1] = source.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    if len(self._objects) > cast(int, reader.trailer["/Size"]):
        message = (
            f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}"
        )
        if self.strict:
            raise PdfReadError(message)
        logger_warning(message, __name__)

    # must be done here before rewriting
    if self.incremental:
        hashes = []
        for obj in self._objects:
            hashes.append(obj.hash_bin() if obj is not None else 0)
        self._original_hash = hashes

    try:
        self._flatten()
    except IndexError:
        raise PdfReadError("Got index error while flattening.")

    assert self.flattened_pages is not None
    relink_pages = not self.incremental
    for flat_page in self.flattened_pages:
        self._replace_object(
            cast(IndirectObject, flat_page.indirect_reference).idnum, flat_page
        )
        if relink_pages:
            flat_page[NameObject("/Parent")] = self._pages
    if relink_pages:
        # The page tree becomes a single level listing every page.
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )
def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF reader: its '/Root', '/Info' and '/ID'.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once for each entry of the cloned page
            tree's /Kids array; its single argument is the page object.

    """
    self.clone_reader_document_root(reader)
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info as the original one.
            info_hash = self._info.hash_bin()
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = info_hash
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if not callable(after_page_append):
        return
    pages_root = cast(DictionaryObject, self._pages.get_object())
    for kid in cast(ArrayObject, pages_root["/Kids"]):
        after_page_append(kid.get_object())
def _compute_document_identifier(self) -> ByteStringObject:
    """Return the rolling checksum of the serialized PDF structure as a byte string."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))
def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".
    """
    has_identifier = bool(self._ID)
    # An existing first identifier is kept; only the second one is renewed.
    permanent_id = self._ID[0] if has_identifier else None
    changing_id = self._compute_document_identifier()
    if not has_identifier:
        permanent_id = changing_id
    self._ID = ArrayObject((permanent_id, changing_id))
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        # "AES-256-R5" maps to the attribute name "AES_256_R5".
        member_name = algorithm.replace("-", "_")
        try:
            chosen = getattr(EncryptAlgorithm, member_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")
    elif use_128bit:
        chosen = EncryptAlgorithm.RC4_128
    else:
        chosen = EncryptAlgorithm.RC4_40

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(chosen, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, owner_password)
    previous_entry = self._encrypt_entry
    if previous_entry:
        # `encrypt` was called before: reuse the object slot of the
        # previous encryption dictionary.
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry
1432 def _resolve_links(self) -> None:
1433 """Patch up links that were added to the document earlier, to
1434 make sure they still point to the same pages.
1435 """
1436 for (new_link, old_link) in self._unresolved_links:
1437 old_page = old_link.find_referenced_page()
1438 if not old_page:
1439 continue
1440 new_page = self._merged_in_pages.get(old_page)
1441 if new_page is None:
1442 continue
1443 new_link.patch_reference(self, new_page)
def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document into ``stream``.

    In incremental mode the reader's original bytes are copied first and
    an increment is appended only when objects were added or modified.
    Otherwise the body, the cross-reference table and the trailer are
    written.

    Args:
        stream: Writable binary stream receiving the PDF.

    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    if self.list_objects_in_increment():
        self._write_increment(stream)  # writes objs, xref stream and startxref
def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method,
        and the stream that was written to. A file opened by this method
        is already closed when it is returned.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if not isinstance(stream, (str, Path)):
        # Caller-owned stream: write and flush, but leave it open.
        self.write_stream(stream)
        stream.flush()
        return False, stream

    handle = FileIO(stream, "wb")
    try:
        self.write_stream(handle)
    finally:
        # The file was opened here, so it is closed here even when
        # writing fails; otherwise the handle would leak.
        handle.close()
    return True, handle
def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Provides the list of new or modified objects that will be written
    in the increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline = len(self._original_hash)
    changed = []
    for position, obj in enumerate(self._objects):
        if obj is None:
            continue
        # Objects beyond the recorded hashes are new; the others count as
        # modified when their hash differs from the recorded one.
        if position >= baseline or obj.hash_bin() != self._original_hash[position]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed
def _write_increment(self, stream: StreamType) -> None:
    """
    Append new and modified objects, then a cross-reference stream, to ``stream``.

    Args:
        stream: Stream already holding the original document bytes.

    """
    offsets = {}
    index_blocks = []
    block_first = -1
    block_next = -2  # object number expected to continue the current block
    baseline = len(self._original_hash)
    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            continue
        if idnum <= baseline and obj.hash_bin() == self._original_hash[idnum - 1]:
            continue  # unchanged since the original was loaded
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        offsets[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # encryption is not operational:
        #   if self._encryption and obj != self._encrypt_entry:
        #       obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: consecutive object numbers share one /Index block
        if idnum != block_next:
            if block_first > 0:
                index_blocks.append([block_first, block_next - block_first])
            block_first = idnum
        block_next = idnum + 1
    assert block_first > 0, "for pytest only"
    index_blocks.append([block_first, block_next - block_first])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    flat_index = []
    for block in index_blocks:
        for number in block:
            flat_index.append(NumberObject(number))
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(flat_index),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is only announced when it is new or modified.
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One entry per written object, laid out as declared in /W.
    entries = [struct.pack(b">BIB", 1, offset, 0) for offset in offsets.values()]
    xr.set_data(b"".join(entries))
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
1593 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:
1594 object_positions = []
1595 free_objects = []
1596 stream.write(self.pdf_header.encode() + b"\n")
1597 stream.write(b"%\xE2\xE3\xCF\xD3\n")
1599 for idnum, obj in enumerate(self._objects, start=1):
1600 if obj is not None:
1601 object_positions.append(stream.tell())
1602 stream.write(f"{idnum} 0 obj\n".encode())
1603 if self._encryption and obj != self._encrypt_entry:
1604 obj = self._encryption.encrypt_object(obj, idnum, 0)
1605 obj.write_to_stream(stream)
1606 stream.write(b"\nendobj\n")
1607 else:
1608 object_positions.append(-1)
1609 free_objects.append(idnum)
1610 free_objects.append(0) # add 0 to loop in accordance with specification
1611 return object_positions, free_objects
1613 def _write_xref_table(
1614 self, stream: StreamType, object_positions: list[int], free_objects: list[int]
1615 ) -> int:
1616 xref_location = stream.tell()
1617 stream.write(b"xref\n")
1618 stream.write(f"0 {len(self._objects) + 1}\n".encode())
1619 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
1620 free_idx = 1
1621 for offset in object_positions:
1622 if offset > 0:
1623 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
1624 else:
1625 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())
1626 free_idx += 1
1627 return xref_location
def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Stream receiving the trailer.
        xref_location: Offset of the cross-reference table, written after
            ``startxref``.

    """
    stream.write(b"trailer\n")
    mandatory = {
        NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(TK.ROOT): self.root_object.indirect_reference,
    }
    trailer = DictionaryObject(mandatory)
    # Optional entries are only emitted when the writer holds them.
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    information = super().metadata
    return information

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Existing entries are discarded before the new ones are added.
    if self._info is not None:
        self._info.clear()

    self.add_metadata(value)
def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Values are converted with ``str`` and stored as string objects; the
    /Info dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    entries = {}
    for key, raw_value in list(infos.items()):
        resolved = raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        text = create_string_object(str(resolved))
        entries[NameObject(key)] = text
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(entries)
def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        """Mark referenced objects as used and redirect merged references."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv_rev: dict[IndirectObject, IndirectObject] = {
        duplicate: first
        for first, duplicates in self._idnum_hash.values()
        for duplicate in duplicates
    }

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        return

    # Objects reachable from the trailer only must be kept explicitly.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    info = self._info
    if info is not None:  # a document may have no /Info dictionary
        orphans[info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    # The encryption dictionary is referenced by the trailer only.
    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None
def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to ``obj``, located in the writer's object list.

    Args:
        obj: Object to look up in ``self._objects``.

    Returns:
        A generation-0 reference whose number is the object's 1-based
        position in the list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference
def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document, creating it when missing.

    An existing root that is not a ``TreeObject`` is replaced by a
    ``TreeObject`` built from it.
    """
    if CO.OUTLINES not in self._root_object:
        root = TreeObject()
        root.update({})
        root_ref = self._add_object(root)
        self._root_object[NameObject(CO.OUTLINES)] = root_ref
        return root

    # Entries in the catalog dictionary
    root = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(root, TreeObject):
        converted = TreeObject(root)
        self._replace_object(root.indirect_reference.idnum, converted)
        root = converted
    # Sanity check: the root must be one of the writer's own objects.
    position = self._objects.index(root)
    root_ref = IndirectObject(position + 1, 0, self)
    assert root_ref.get_object() == root
    return root
def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        # No entry yet: register an empty array in the catalog.
        created = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = created
        return created
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])
@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    threads_root = self.get_threads_root()
    return threads_root
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: Outline item to insert. A ``PageObject`` is
            wrapped in a ``Destination`` named after its page number,
            using ``Fit.fit()``.
        parent: Outline item under which the new item is inserted;
            the outline root when None.
        before: Sibling before which the new item is inserted, if any.
        is_open: Stored on the item as ``/%is_open%``. When False, the
            counter update passed to ``insert_child`` is a function
            returning 0.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward the placement arguments as well; otherwise the item
        # built for a page would always land at the end of the root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item described by a dictionary.

    The entries are copied into a new ``TreeObject`` which is then passed
    to :meth:`add_outline_item_destination`.

    Args:
        outline_item: Entries of the outline item.
        parent: Forwarded to ``add_outline_item_destination``.
        before: Forwarded to ``add_outline_item_destination``.
        is_open: Forwarded to ``add_outline_item_destination``.

    Returns:
        The value returned by ``add_outline_item_destination``.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # code currently unreachable:
    #   if "/A" in outline_item:
    #       action = DictionaryObject()
    #       a_dict = cast(DictionaryObject, outline_item["/A"])
    #       for k, v in list(a_dict.items()):
    #           action[NameObject(str(k))] = v
    #       action_ref = self._add_object(action)
    #       outline_item_object[NameObject("/A")] = action_ref
    return self.add_outline_item_destination(
        tree_item, parent=parent, before=before, is_open=is_open
    )
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject`` or an indirect reference. With ``None`` no
            GoTo action is created for the item.
        parent: A reference to a parent outline item to create nested
            outline items. Defaults to the outline root.
        before: Forwarded to :meth:`add_outline_item_destination` as the
            item before which the new one is inserted.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional layout: re-dispatch with the arguments shifted
        # into the positions of the current signature.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        # No target page: the outline item is created without an action.
        action_ref = None
    else:
        # NOTE(review): page_ref stays unbound if page_number is none of the
        # three types tested below - confirm callers never pass anything else.
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Index outside this writer's pages: keep the raw number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        # The outline item points to its page through a /GoTo action.
        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_outline(self) -> None:
    """
    Placeholder that is not implemented.

    Raises:
        NotImplementedError: Always.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destinations array.

    The array returned by ``get_named_dest_root`` is walked two entries at a
    time (name, destination). The pair is inserted in front of the first name
    that compares greater than ``title``, or appended when there is none.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Insert the value first so the name ends up in front of it.
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])
def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    The object's ``dest_array`` is added to the writer and recorded in the
    named destinations under the object's ``/Title``.

    Args:
        page_destination: Object providing ``dest_array`` and a ``/Title`` entry.

    Returns:
        The reference returned when adding ``dest_array`` to the writer.

    """
    dest_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_ref)

    return dest_ref
def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a ``/GoTo`` action for a page and register it as a named destination.

    Args:
        title: Name of the destination; converted to a ``TextStringObject``
            when it is not one already.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The reference of the newly added action dictionary.

    """
    kids = self.get_object(self._pages)[PagesAttributes.KIDS]  # type: ignore
    page_ref = kids[page_number]

    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    action = DictionaryObject()
    action.update(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(action)

    name = title if isinstance(title, TextStringObject) else TextStringObject(str(title))
    self.add_named_destination_array(name, action_ref)
    return action_ref
def remove_links(self) -> None:
    """Remove links and annotations from every page of this output."""
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, everything)
def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from all pages.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    strip_page = self._remove_annots_from_page
    for current_page in self.pages:
        strip_page(current_page, subtypes)
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of a page whose ``/Subtype`` is in ``subtypes``.

    Args:
        page: Page (or a reference to it) whose annotation array is filtered
            in place.
        subtypes: Subtypes to remove; ``None`` removes every annotation.
            A single subtype given as a plain string is accepted and treated
            as a one-element collection.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return
    if isinstance(subtypes, str):
        # ``x in "some string"`` is a substring test, not a membership test
        subtypes = (subtypes,)
    i = 0
    while i < len(cast(ArrayObject, page[PG.ANNOTS])):
        an = cast(ArrayObject, page[PG.ANNOTS])[i]
        obj = cast(DictionaryObject, an.get_object())
        # An annotation without /Subtype never matches an explicit subtype
        # list; it is simply kept instead of raising KeyError.
        subtype = obj["/Subtype"] if "/Subtype" in obj else None
        if subtypes is None or subtype in subtypes:
            if isinstance(an, IndirectObject):
                self._objects[an.idnum - 1] = NullObject()  # to reduce PDF size
            del page[PG.ANNOTS][i]  # type:ignore
        else:
            # Only advance when nothing was deleted at position i
            i += 1
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    A list or tuple of flags is processed one flag at a time. Annotation
    flags (links, attachments, 3D objects, all annotations) are handled by
    removing annotations and return immediately; the remaining flags are
    handled by filtering the operations of the page content stream and of
    the form XObjects it uses.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # text_filters must be forwarded, otherwise a TEXT flag inside
            # the list would ignore the caller's font restriction.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped unconditionally (subject to
    # the text filter check in clean()).
    # NOTE(review): when both DRAWING_IMAGES and TEXT are set, the TEXT list
    # below replaces the drawing list - confirm this is intended.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: list[str],
        forms: list[str],
        text_filters: Optional[dict[str, Any]] = None
    ) -> None:
        # Delete the matching operations of ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # remember the font selected for the following text operators
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: list[DictionaryObject]
    ) -> tuple[list[str], list[str]]:
        # Walk the /XObject resources of ``elt``: blank out images, clean
        # form XObjects recursively, and return the image and form names.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` value is treated
            as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits into the ObjectDeletionFlag of the same name.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags = deletion_flags | ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)
def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """

    def collect_fonts(
        node: Any,
        found: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        # Recursively gather, per normalized font name, the dictionary keys
        # under which font dictionaries were encountered.
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                short_name = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    short_name,
                    {
                        "normalized_font_name": short_name,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in node:
                found = collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                found = collect_fonts(child, found)
        return found

    wanted_fonts = font_names if font_names else []

    for page in self.pages:
        filters = {}
        if wanted_fonts:
            # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
            # Font names need to be converted to resource names/IDs for easier removal
            page_fonts = collect_fonts(page.get("/Resources"))
            matching_ids = []
            for wanted in wanted_fonts:
                if wanted in page_fonts:
                    matching_ids.extend(page_fonts[wanted]["resource_ids"])
            filters["font_ids"] = matching_ids
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=filters)
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted the
            border array ``[2 2 2]`` is written.

    Raises:
        ValueError: ``rect`` is a string that does not consist of four numbers.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # optional fourth entry: the dash pattern array
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # A single NumberObject cannot hold a four-value rectangle, so the
        # documented "[ xLL yLL xUR yUR ]" form is parsed explicitly.
        coordinates = rect.strip().strip("[]").split()
        if len(coordinates) != 4:
            raise ValueError(
                f"rect must contain four numbers, got {rect!r}"
            )
        rect = RectangleObject(tuple(float(c) for c in coordinates))  # type: ignore
    elif isinstance(rect, RectangleObject):
        pass
    else:
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
# Page layout names that ``_set_page_layout`` accepts without logging a warning.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)
def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the root object, or ``None`` if absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; if it
    is not among ``_valid_layouts`` a warning is logged, but the value is
    still written to the ``/PageLayout`` entry of the root object.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # The separator has to be the string ", "; the former expression
            # built a tuple and printed its repr in the message.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (delegates to ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)
@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout of the document, as returned by ``_get_page_layout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout
@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    """Set the page layout by delegating to ``_set_page_layout``."""
    apply_layout = self._set_page_layout
    apply_layout(layout)
# Page mode names that the ``page_mode`` setter accepts without logging a warning.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)
def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the root object, or ``None`` if absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode of the document, as returned by ``_get_page_mode``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode
@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    """
    Store ``mode`` as the ``/PageMode`` entry of the root object.

    A value that is not a ``NameObject`` is converted to one; a warning is
    logged first when it is not listed in ``_valid_modes``.
    """
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})
def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        link_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )  # I have no clue why this dict-hack is necessary
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        # Link the parent annotation back to its popup.
        popup_ref = to_add.indirect_reference
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = popup_ref

    return to_add
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` that is a ``NameObject`` is converted;
    otherwise the ``/D`` entry of its ``/A`` action is converted when that
    is a ``NameObject``.

    Args:
        page: Page, or reference to the page, to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot in page.get("/Annots", []):
        annot_obj = annot.get_object()
        direct_dest = annot_obj.get("/Dest", None)
        action = annot_obj.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot_obj[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
2623 def _create_stream(
2624 self, fileobj: Union[Path, StrByteType, PdfReader]
2625 ) -> tuple[IOBase, Optional[Encryption]]:
2626 # If the fileobj parameter is a string, assume it is a path
2627 # and create a file object at that location. If it is a file,
2628 # copy the file's contents into a BytesIO stream object; if
2629 # it is a PdfReader, copy that reader's stream into a
2630 # BytesIO stream.
2631 # If fileobj is none of the above types, it is not modified
2632 encryption_obj = None
2633 stream: IOBase
2634 if isinstance(fileobj, (str, Path)):
2635 with FileIO(fileobj, "rb") as f:
2636 stream = BytesIO(f.read())
2637 elif isinstance(fileobj, PdfReader):
2638 if fileobj._encryption:
2639 encryption_obj = fileobj._encryption
2640 orig_tell = fileobj.stream.tell()
2641 fileobj.stream.seek(0)
2642 stream = BytesIO(fileobj.stream.read())
2644 # reset the stream to its original location
2645 fileobj.stream.seek(orig_tell)
2646 elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
2647 fileobj.seek(0)
2648 filecontent = fileobj.read()
2649 stream = BytesIO(filecontent)
2650 else:
2651 raise NotImplementedError(
2652 "Merging requires an object that PdfReader can parse. "
2653 "Typically, that is a Path or a string representing a Path, "
2654 "a file object, or an object implementing .seek and .read. "
2655 "Passing a PdfReader directly works as well."
2656 )
2657 return stream, encryption_obj
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection was passed in the ``outline_item`` slot: shift
        # the following positional arguments one place to the left.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_item = None

    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )
def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Besides the pages themselves, the named destinations, the outline
    (optional), the annotations, the ``/AcroForm`` fields and the article
    threads that relate to the merged pages are carried over.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. With ``None`` the pages
            are added through ``add_page`` instead of ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    # NOTE(review): a tuple with more than 3 elements matches none of the
    # branches below and is then iterated as-is - confirm this is intended.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the object number of each source page to its copy in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            position += 1
        # Keep a link to the source page; used for the annotations below.
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    def _process_named_dests(dest: Any) -> None:
        # Register one named destination of the reader in this writer,
        # retargeted to the copied page, unless its title already exists
        # or its page is not part of the merged pages.
        arr = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            list[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            pass
        elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
            pass
        elif isinstance(dest["/Page"], int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[dest["/Page"]]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                # the referenced page has not been merged
                pass
        elif dest["/Page"].indirect_reference.idnum in srcpages:
            arr[NumberObject(0)] = srcpages[
                dest["/Page"].indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Outline node under which the imported outline is inserted.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # NOTE(review): next(iter(...)) raises StopIteration when no page
        # was merged - confirm callers never combine an outline_item with
        # an empty page selection.
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # First form merged: clone the reader's /AcroForm without its
            # /Fields, which are rebuilt below.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # Maps object numbers of the reader to object numbers of this writer.
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The source bead chain is walked from ``thread["/F"]`` through the
    ``/N`` links until it comes back to the first bead. A bead is
    re-created in the writer only when ``_get_cloned_page`` finds a cloned
    page for its ``/P`` entry; the new beads are linked with ``/N``/``/V``
    and appended to the ``/B`` array of their page.

    Args:
        thread: thread dictionary coming from the reader.
        pages: mapping used by ``_get_cloned_page`` to find cloned pages.
        reader: the reader the thread belongs to.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_first: Optional[DictionaryObject] = None
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first bead kept: it becomes the head of the new chain
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # following bead: created with /V pointing at the previous
                # one, which in turn gets its /N set to the new bead
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source chain: close the new ring.
            # When no bead was kept there is nothing to link; without this
            # guard the code would subscript None / read an unbound name.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (compiled or as text) applied to thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every bead listed in the ``/B`` entry of each page's
    ``original_page``, the owning thread (``/T``) is added through
    ``_add_articles_thread`` when its id is not yet present in the
    translation table of ``reader`` and its ``/I`` ``/Title`` matches
    ``fltr``.

    Args:
        fltr: pattern searched in the thread title; a string is compiled,
            any value that is neither a string nor a pattern is replaced
            by the empty pattern (which matches every title).
        pages: pages whose ``original_page`` is inspected.
        reader: the reader the pages come from.

    """
    if isinstance(fltr, str):
        title_pattern = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        title_pattern = fltr
    else:
        title_pattern = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thread = bead.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # thread already known to the writer
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if title_pattern.search(title):
                self._add_articles_thread(thread, pages, reader)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the cloned page matching a source page.

    Args:
        page: source page, given either as a dictionary whose ``/Type`` is
            ``/Page`` or as an indirect reference.
        pages: mapping indexed by the ``idnum`` of the source reference.
        reader: not used by this method.

    Returns:
        ``indirect_reference`` of the entry found in ``pages``, or None
        when ``page`` is null, of an unsupported kind, has no indirect
        reference, or is absent from ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # None or any other object: nothing to look up
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except (KeyError, AttributeError):
        # page not part of the cloned set, or no indirect reference available
        return None
def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Build the list of annotations to keep for a cloned page.

    Annotations without a destination are cloned as they are. Annotations
    with a destination (``/Dest``, or ``/D`` of a ``/GoTo`` link action)
    are kept only when the destination is a named destination known to the
    writer, or an explicit destination whose page is found by
    ``_get_cloned_page``; in the latter case the page reference is replaced
    by the cloned page.

    Args:
        annots: annotations of the source page (list or reference to it).
        page: not used by this method.
        pages: mapping handed to ``_get_cloned_page``.
        reader: reader handed to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` holding the kept annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for entry in annots:
        annot = cast("DictionaryObject", entry.get_object())
        is_goto_link = (
            annot["/Subtype"] == "/Link"
            and "/A" in annot
            and cast("DictionaryObject", annot["/A"])["/S"] == "/GoTo"
            and "/Dest" not in annot
        )

        if is_goto_link:
            # destination stored in the action dictionary
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            new_page = self._get_cloned_page(target[0], pages, reader)
            if new_page is not None:
                copy = annot.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", copy["/A"])[
                    NameObject("/D")
                ] = ArrayObject([new_page, *target[1:]])
                kept.append(self._add_object(copy))
            continue

        if "/Dest" not in annot:
            # nothing to retarget
            kept.append(self._add_object(annot.clone(self)))
            continue

        # destination stored directly on the annotation
        target = annot["/Dest"]
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        new_page = self._get_cloned_page(target[0], pages, reader)
        if new_page is not None:
            copy = annot.clone(self, ignore_fields=("/Dest",))
            copy[NameObject("/Dest")] = ArrayObject([new_page, *target[1:]])
            kept.append(self._add_object(copy))
    return kept
def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is kept when ``_get_cloned_page`` finds its page or when at
    least one of its children is kept; kept children are stored in the
    ``_filtered_children`` attribute of the item.

    Args:
        node: outline root, outline item, or None.
        pages: mapping handed to ``_get_cloned_page``.
        reader: reader used to build the outline items.

    Returns:
        A list of destination objects.

    """
    kept = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # container without a title: process its first child
        first = node.get("/First", None)
        if first is not None:
            kept.extend(self._get_filtered_outline(first.get_object(), pages, reader))
        return kept

    target: Union[None, IndirectObject, NullObject]
    while node is not None:
        # follow the /Next chain of siblings
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        target = self._get_cloned_page(cast("PageObject", item["/Page"]), pages, reader)
        item[NameObject("/Page")] = NullObject() if target is None else target
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        if not isinstance(item["/Page"], NullObject) or item._filtered_children:
            kept.append(item)
        node = node.get("/Next", None)
    return kept
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in the writer from a destination.

    The title is always copied. When the destination has a page, either
    the ``/A`` entry of the source node is cloned or, if there is none,
    ``dest.dest_array`` is stored as ``/Dest``. ``/F`` and ``/C`` are taken
    from the source node when there is one (defaults: 0 and three 0.0
    values).
    """
    item = TreeObject()
    self._add_object(item)
    item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    has_page = not isinstance(dest["/Page"], NullObject)
    source_node = dest.node
    if has_page:
        if source_node is not None and "/A" in source_node:
            item[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        item[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        default_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        item[NameObject("/C")] = ArrayObject(source_node.get("/C", default_color))
    return item
def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, and their kept children, under ``parent``.

    Entries typed ``/Outlines`` or lacking a ``/Title`` are not inserted
    themselves; their children are attached to ``parent`` directly.

    Args:
        outlines: items to insert; children are read from ``_filtered_children``.
        parent: outline node receiving the items.
        before: passed to ``insert_child`` for the items of this level.

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = dest.get("/Type", "") == "/Outlines" or "/Title" not in dest
        if is_container:
            children_parent = parent
        else:
            children_parent = self._clone_outline(dest)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(children_parent, before, self)
        self._insert_filtered_outline(dest._filtered_children, children_parent, None)
def close(self) -> None:
    """Do nothing; the method is implemented for API harmonization only."""
    return None
def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position in the outline tree.

    A node matches when its ``indirect_reference`` or its ``/Title`` equals
    ``outline_item``. Siblings are visited through ``/Next`` and children
    through ``/First``.

    Args:
        outline_item: value compared with each node's reference and title.
        root: node to start from; the outline root when None.

    Returns:
        The list of sibling indexes leading to the item (levels without a
        ``/Title`` contribute no index), or None when it is not found.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if node.indirect_reference == outline_item or node.get("/Title", None) == outline_item:
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast("OutlineType", node["/First"])
            )
            if sub_path:
                prefix = [index] if "/Title" in node else []
                return prefix + sub_path
        if "/Next" not in node:
            # last sibling reached without a match
            return None
        index += 1
        node = cast("TreeObject", node["/Next"])
def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: if ``reader`` is of any other type.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        source = reader
    elif isinstance(reader, IndirectObject):
        source = reader.pdf
    else:
        # f-string so that the offending value really shows up in the message
        raise Exception(f"invalid parameter {reader}")
    # a reader without a table is simply ignored
    self._id_translated.pop(id(source), None)
def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (included) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) and ``None`` both mean "not given": no
               explicit start value is stored.

    Raises:
        ValueError: if neither style nor prefix is given, if the page range
            is invalid, or if ``start`` is given and lower than 1.

    """
    if style is None and prefix is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")
    if start is None:
        # None is accepted by the signature; hand over the "not given"
        # marker (0) instead of a value that cannot be turned into a number
        start = 0
    elif start != 0 and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation of the arguments is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (included) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` (the default) or ``None`` no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None must be skipped as well: it cannot be converted to a number
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no page labels yet: create them with the default label at index 0
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # if pages remain after the range and no label starts right after it,
    # put the default label there
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
3353 def _repr_mimebundle_(
3354 self,
3355 include: Union[None, Iterable[str]] = None,
3356 exclude: Union[None, Iterable[str]] = None,
3357 ) -> dict[str, Any]:
3358 """
3359 Integration into Jupyter Notebooks.
3361 This method returns a dictionary that maps a mime-type to its
3362 representation.
3364 .. seealso::
3366 https://ipython.readthedocs.io/en/stable/config/integrating.html
3367 """
3368 pdf_data = BytesIO()
3369 self.write(pdf_data)
3370 data = {
3371 "application/pdf": pdf_data,
3372 }
3374 if include is not None:
3375 # Filter representations based on include list
3376 data = {k: v for k, v in data.items() if k in include}
3378 if exclude is not None:
3379 # Remove representations based on exclude list
3380 data = {k: v for k, v in data.items() if k not in exclude}
3382 return data
def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into a PdfObject.

    PdfObject instances are returned unchanged. Dictionaries and lists are
    converted recursively, strings starting with ``/`` become names, other
    strings become text strings, and numbers become ``FloatObject``.

    Raises:
        NotImplementedError: for any other type.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(_pdf_objectify(item) for item in obj)
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )
def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item.

    Args:
        action_ref: stored as ``/A`` when not None.
        title: stored as ``/Title``.
        color: stored as ``/C``; a string is first converted with
            ``hex_to_rgb``. Nothing is stored for a falsy value.
        italic: adds the italic flag to ``/F``.
        bold: adds the bold flag to ``/F``.

    Returns:
        The new outline item.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        # /F is only written when at least one flag is set
        flags = 0
        if italic:
            flags += OutlineFontFlag.italic
        if bold:
            flags += OutlineFontFlag.bold
        item.update({NameObject("/F"): NumberObject(flags)})
    return item
def generate_appearance_stream(
    txt: str,
    sel: list[str],
    da: str,
    font_full_rev: dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing ``txt`` line by line.

    Args:
        txt: text to show; ``\\n`` and ``\\r`` both separate lines.
        sel: lines equal to one of these entries get a box drawn around them.
        da: operators inserted as they are after ``BT`` and after each box
            (presumably the field's default appearance string).
        font_full_rev: mapping from a character to its encoded bytes;
            characters missing from it are encoded as UTF-16-BE.
        rct: rectangle providing ``width`` and ``height``.
        font_height: font height; lines are spaced by 1.4 times this value.
        y_offset: vertical position of the first line.

    Returns:
        The content stream.

    """
    chunks: list[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            chunks.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            chunks.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            chunks.append(f"0 {-font_height * 1.4} Td\n".encode())
        enc_line: list[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        encoded = b"".join(enc_line)
        if any(len(c) >= 2 for c in enc_line):
            # multi-byte codes: written as a hexadecimal string
            chunks.append(b"<" + encoded.hex().encode() + b"> Tj\n")
        else:
            # Backslash and parentheses delimit/escape literal strings and
            # must be escaped, otherwise the string ends early or never.
            escaped = (
                encoded.replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            chunks.append(b"(" + escaped + b") Tj\n")
    chunks.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(chunks)