Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import decimal
31import enum
32import hashlib
33import re
34import struct
35import uuid
36from io import BytesIO, FileIO, IOBase
37from itertools import compress
38from pathlib import Path
39from types import TracebackType
40from typing import (
41 IO,
42 Any,
43 Callable,
44 Dict,
45 Iterable,
46 List,
47 Optional,
48 Pattern,
49 Tuple,
50 Type,
51 Union,
52 cast,
53)
55from ._cmap import _default_fonts_space_width, build_char_map_from_dict
56from ._doc_common import DocumentInformation, PdfDocCommon
57from ._encryption import EncryptAlgorithm, Encryption
58from ._page import PageObject, Transformation
59from ._page_labels import nums_clear_range, nums_insert, nums_next
60from ._reader import PdfReader
61from ._utils import (
62 StrByteType,
63 StreamType,
64 _get_max_pdf_version_header,
65 deprecate,
66 deprecate_no_replacement,
67 deprecation_with_replacement,
68 logger_warning,
69)
70from .constants import AnnotationDictionaryAttributes as AA
71from .constants import CatalogAttributes as CA
72from .constants import (
73 CatalogDictionary,
74 GoToActionArguments,
75 ImageType,
76 InteractiveFormDictEntries,
77 OutlineFontFlag,
78 PageLabelStyle,
79 TypFitArguments,
80 UserAccessPermissions,
81)
82from .constants import Core as CO
83from .constants import FieldDictionaryAttributes as FA
84from .constants import PageAttributes as PG
85from .constants import PagesAttributes as PA
86from .constants import TrailerKeys as TK
87from .errors import PyPdfError
88from .generic import (
89 PAGE_FIT,
90 ArrayObject,
91 BooleanObject,
92 ByteStringObject,
93 ContentStream,
94 DecodedStreamObject,
95 Destination,
96 DictionaryObject,
97 EmbeddedFile,
98 Fit,
99 FloatObject,
100 IndirectObject,
101 NameObject,
102 NullObject,
103 NumberObject,
104 PdfObject,
105 RectangleObject,
106 StreamObject,
107 TextStringObject,
108 TreeObject,
109 ViewerPreferences,
110 create_string_object,
111 hex_to_rgb,
112 is_null_or_none,
113)
114from .pagerange import PageRange, PageRangeSpec
115from .types import (
116 AnnotationSubtype,
117 BorderArrayType,
118 LayoutType,
119 OutlineItemType,
120 OutlineType,
121 PagemodeType,
122)
123from .xmp import XmpInformation
# Value returned by UserAccessPermissions.all().
# NOTE(review): presumably the "every permission granted" mask used as a
# default elsewhere in this module — not referenced in the visible part.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
# Font height used by PdfWriter._update_field_annotation when the resolved
# font height is 0 and the field has the Multiline flag set.
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12
class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags identifying kinds of PDF objects.

    Every single-kind member is a distinct power of two, so members can be
    combined with ``|``. ``IMAGES`` is the union of the three image kinds.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite covering every image flavour.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
142def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
143 hash = hashlib.md5()
144 for block in iter(lambda: stream.read(blocksize), b""):
145 hash.update(block)
146 return hash.hexdigest()
149class PdfWriter(PdfDocCommon):
150 """
151 Write a PDF file out, given pages produced by another class or through
152 cloning a PDF file during initialization.
154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
156 Args:
157 clone_from: identical to fileobj (for compatibility)
159 incremental: If true, loads the document and set the PdfWriter in incremental mode.
161 When writing incrementally, the original document is written first and new/modified
162 content is appended. To be used for signed document/forms to keep signature valid.
164 full: If true, loads all the objects (always full if incremental = True).
165 This parameter may allow loading large PDFs.
167 """
def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Set up the writer, optionally cloning an existing document.

    Args:
        fileobj: Path, stream or PdfReader. It is remembered in
            ``temp_fileobj`` and, unless ``clone_from`` takes precedence
            or it is empty/missing, used as the document to clone.
        clone_from: Explicit document to clone; used when ``fileobj`` is a
            path/stream and either ``fileobj == ""`` or ``clone_from`` is
            not None.
        incremental: Start the writer in incremental mode.
        full: Run the same loading path as incremental mode; when
            ``incremental`` is False the ``incremental`` attribute is reset
            to False at the end of initialisation.

    Raises:
        PyPdfError: if ``incremental`` or ``full`` is set and ``fileobj``
            cannot be turned into a PdfReader (only str, Path, BytesIO and
            PdfReader are accepted on that path).
    """
    # Both flags take the "incremental" loading path below; the attribute
    # is corrected near the end when only `full` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Declared only; it is assigned in the non-incremental branch below.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if self.incremental:
        # Normalise the input step by step: path -> in-memory bytes ->
        # PdfReader. Anything that is not a PdfReader afterwards is rejected.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header plus an Info dictionary that only
        # carries the /Producer entry.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which object (if any) the writer should be cloned from.
        # A path/stream `fileobj` that is "" or that comes with an explicit
        # `clone_from` defers to `clone_from` unchanged.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A path that does not exist or points to an empty file is an
        # output target, not something to clone.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        # Same for an empty stream; the stream position is restored after
        # seeking to the end to measure it.
        if isinstance(fileobj, (IOBase, BytesIO)):
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # No source document: register the empty page tree and a catalog
        # pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # `full` alone only borrowed the incremental loading path.
    if full and not incremental:
        self.incremental = False
    # Store both file identifier entries as byte strings.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())
294 # for commonality
@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property; for the writer it always returns ``False``.

    NOTE(review): the preceding comment ("for commonality") suggests the
    property exists so the writer offers the same attribute as the reader.
    """
    return False
@property
def root_object(self) -> DictionaryObject:
    """
    Provide direct access to PDF Structure.

    Returns:
        The dictionary stored in ``self._root_object``.

    Note:
        Recommended only for read access.

    """
    return self._root_object
@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Resolve the document information ("/Info") dictionary.

    Returns:
        The resolved dictionary, or None when no info object is set.

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    if value is not None:
        # Reuse the existing info object (creating it on demand) and
        # replace its entries with those of the resolved value.
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        target = cast(DictionaryObject, self._info_obj.get_object())
        target.clear()
        target.update(cast(DictionaryObject, value.get_object()))
        return
    # Clearing: blank the slot of the info object, if one can be located.
    try:
        self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
    except (KeyError, AttributeError):
        pass
    self._info_obj = None
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """Store ``value`` under "/Metadata"; ``None`` removes the entry."""
    if value is not None:
        self.root_object[NameObject("/Metadata")] = value
    elif "/Metadata" in self.root_object:
        del self.root_object["/Metadata"]

    return self.root_object.xmp_metadata  # type: ignore
@property
def with_as_usage(self) -> bool:
    """
    Value of ``_with_as_usage`` (set to True by ``__enter__``).

    Each access first calls ``deprecate_no_replacement("with_as_usage", "6.0")``.
    """
    deprecate_no_replacement("with_as_usage", "6.0")
    return self._with_as_usage

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    # Same deprecation call as the getter, then store the flag.
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value
372 def __enter__(self) -> "PdfWriter":
373 """Store how writer is initialized by 'with'."""
374 c: bool = self._cloned
375 t = self.temp_fileobj
376 self.__init__() # type: ignore
377 self._cloned = c
378 self._with_as_usage = True
379 self.fileobj = t # type: ignore
380 return self
382 def __exit__(
383 self,
384 exc_type: Optional[Type[BaseException]],
385 exc: Optional[BaseException],
386 traceback: Optional[TracebackType],
387 ) -> None:
388 """Write data to the fileobj."""
389 if self.fileobj and not self._cloned:
390 self.write(self.fileobj)
@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    The value looks like ``'%PDF-1.5'``. Reading decodes the stored bytes
    to ``str``; writing accepts ``str`` (encoded before storing) or bytes
    (stored unchanged).
    """
    header_bytes = self._header
    return header_bytes.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    self._header = (
        new_header.encode() if isinstance(new_header, str) else new_header
    )
def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Args:
        obj: Object to register.

    Returns:
        The object's existing reference when it already belongs to this
        writer, otherwise a new reference (generation 0) whose number is
        the object's 1-based position in ``_objects``.
    """
    existing_ref = getattr(obj, "indirect_reference", None)
    if existing_ref is not None and existing_ref.pdf == self:  # type: ignore
        return obj.indirect_reference  # type: ignore
    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        direct_contents = obj.get(PG.CONTENTS, None)
        if isinstance(direct_contents, (ArrayObject, DictionaryObject)):
            # Directly stored contents are registered first and replaced
            # by their reference.
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference
def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an object of this writer.

    Args:
        indirect_reference: 1-based object number, or an IndirectObject
            belonging to this writer.

    Returns:
        The object stored at that number.

    Raises:
        ValueError: if the reference belongs to another document.
    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj
def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` in the slot designated by ``indirect_reference``.

    The generation number of the object previously held in the slot is
    kept. An object that carries a reference to another document is cloned
    into this writer first.

    Args:
        indirect_reference: 1-based object number, or an IndirectObject
            belonging to this writer.
        obj: Replacement object.

    Returns:
        The object now stored in the slot (the clone, when one was made).

    Raises:
        ValueError: if the reference belongs to another document.
    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf != self:  # type: ignore
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj
def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link the clone into the page tree.

    Args:
        page: Page to add; must be a PageObject whose type is ``/Page``.
        index: Position in the flattened page list.
        excluded_keys: Keys not to copy when cloning; the parent key and
            "/StructParents" are always excluded as well.

    Returns:
        The cloned page that now belongs to this writer.

    Raises:
        ValueError: if ``page`` is not a valid page object.
        PyPdfError: if more than 1000 nodes are visited while updating the
            page counts of the ancestors.
    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    skipped_keys = [*excluded_keys, PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        source_ref = source_page.indirect_reference
        del self._id_translated[id(source_ref.pdf)][source_ref.idnum]  # type: ignore
    except Exception:
        pass
    page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )
    if source_page.pdf is not None:
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(
            self.pdf_header, source_header
        )
    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx >= 0:
        kids.insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)
    # Walk up through the parents, incrementing every page count on the way.
    visited = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        visited += 1
        if visited > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page
def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag of the document's AcroForm dictionary.

    The AcroForm dictionary is created in the catalog when missing. Any
    exception raised while doing so is logged as a warning, not propagated.

    Args:
        state: The actual value of the NeedAppearances flag.

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )
def create_viewer_preferences(self) -> ViewerPreferences:
    """Create a new ViewerPreferences object, store it in the catalog and return it."""
    preferences = ViewerPreferences()
    catalog_key = NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    self._root_object[catalog_key] = self._add_object(preferences)
    return preferences
def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys passed on unchanged to ``_add_page``.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_index = len(self.flattened_pages)
    return self._add_page(page, end_index, excluded_keys)
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative
            values count from the end; a position at or past the end
            appends the page.
        excluded_keys: Keys passed on unchanged when the page is added.

    Returns:
        The added PageObject.

    Raises:
        ValueError: if a negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index < page_count:
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page a reference points to.

    Args:
        indirect_reference: Object number (taken with generation 0 in this
            writer), an indirect reference, or a null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a PageObject.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    resolved = indirect_reference.get_object()
    return resolved.page_number if isinstance(resolved, PageObject) else None
def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))
def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page already exists at ``index``, the
    size of that page is used. Otherwise the missing size is left to
    ``PageObject.create_blank_page``, which according to its documentation
    uses the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the page object returned by
        :meth:`insert_page`.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Only look at the page at `index` when it exists. The previous
    # condition, `width is None or (height is None and index < n)`,
    # indexed `self.pages` whenever width was None, even past the end.
    size_missing = width is None or height is None
    if size_missing and index < self.get_num_pages():
        reference_page = self.pages[index]
        width = reference_page.mediabox.width
        height = reference_page.mediabox.height
    blank_page = PageObject.create_blank_page(self, width, height)
    # Return the page that was actually added to the document, as
    # add_blank_page does, not the template handed to insert_page.
    return self.insert_page(blank_page, index)
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Destination shown when the document is opened (parent implementation)."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    # None removes the "/OpenAction" entry; a missing entry is not an error.
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        action = dest.dest_array
    elif isinstance(dest, PageObject):
        page_ref = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        # Any other type leaves the catalog untouched.
        return
    self._root_object[NameObject("/OpenAction")] = action
def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is appended to the "/Names" array of the catalog's
    "/JavaScript" name dictionary (both created when missing), preceded by
    a randomly generated name.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    name_dict = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in name_dict:
        name_dict[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_dict = cast(DictionaryObject, name_dict["/JavaScript"])
    script_entries = cast(ArrayObject, javascript_dict["/Names"])
    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    script_entries.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_entries.append(self._add_object(action))
def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    return EmbeddedFile._create_new(self, filename, data)
def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to the end of this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been added. Its
            single argument is the page as added to this writer.

    """
    page_total = len(reader.pages)
    for page_index in range(page_total):
        appended_page = self.add_page(reader.pages[page_index])
        # Trigger callback, pass writer page as parameter
        if callable(after_page_append):
            after_page_append(appended_page)
def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) to the content of ``page``.

    An existing content array gets a new stream appended; an existing
    single stream is replaced by a new stream holding the old data, a
    newline and the new data; a page without "/Contents" gets a new stream.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """

    def stream_with(data: bytes) -> StreamObject:
        # Build a fresh stream object holding ``data``.
        stream = StreamObject()
        stream.set_data(data)
        return stream

    if NameObject("/Contents") not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        page[NameObject("/Contents")] = self._add_object(
            stream_with(new_content_data)
        )
        return

    # First resolve the existing page content. This always is an IndirectObject:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current_content = page[NameObject("/Contents")].get_object()

    if isinstance(current_content, ArrayObject):
        # Append a new stream object holding the new_content_data
        current_content.append(self._add_object(stream_with(new_content_data)))
        page[NameObject("/Contents")] = self._add_object(current_content)
    if isinstance(current_content, StreamObject):
        # Merge new content to existing StreamObject
        combined = current_content.get_data() + b"\n" + new_content_data
        page[NameObject("/Contents")] = self._add_object(stream_with(combined))
def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Draw an appearance stream on a page as a form XObject.

    The stream is registered in the page's "/XObject" resources under a
    name derived from ``object_name``, and drawing commands translated by
    the given offsets are merged into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    page_resources = cast(DictionaryObject, page[PG.RESOURCES])
    if font_res is not None:
        # Make the font available on the page under its /BaseFont name
        # ([/"Name"] often also exists, but is deprecated).
        font_name = font_res["/BaseFont"]
        if "/Font" not in page_resources:
            page_resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, page_resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res
    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()
    if "/XObject" not in page_resources:
        page_resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, page_resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref
    xobject_cm = Transformation().translate(x_offset, y_offset)
    xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, xobject_drawing_commands)
882 def _update_field_annotation(
883 self,
884 page: PageObject,
885 field: DictionaryObject,
886 annotation: DictionaryObject,
887 font_name: str = "",
888 font_size: float = -1,
889 flatten: bool = False,
890 ) -> None:
891 # Calculate rectangle dimensions
892 _rct = cast(RectangleObject, annotation[AA.Rect])
893 rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))
895 # Extract font information
896 da = annotation.get_inherited(
897 AA.DA,
898 cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
899 AA.DA, None
900 ),
901 )
902 if da is None:
903 da = TextStringObject("/Helv 0 Tf 0 g")
904 else:
905 da = da.get_object()
906 font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
907 font_properties = [x for x in font_properties if x != ""]
908 if font_name:
909 font_properties[font_properties.index("Tf") - 2] = font_name
910 else:
911 font_name = font_properties[font_properties.index("Tf") - 2]
912 font_height = (
913 font_size
914 if font_size >= 0
915 else float(font_properties[font_properties.index("Tf") - 1])
916 )
917 if font_height == 0:
918 if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
919 font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
920 else:
921 font_height = rct.height - 2
922 font_properties[font_properties.index("Tf") - 1] = str(font_height)
923 da = " ".join(font_properties)
924 y_offset = rct.height - 1 - font_height
926 # Retrieve font information from local DR ...
927 dr: Any = cast(
928 DictionaryObject,
929 cast(
930 DictionaryObject,
931 annotation.get_inherited(
932 "/DR",
933 cast(
934 DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
935 ).get("/DR", DictionaryObject()),
936 ),
937 ).get_object(),
938 )
939 dr = dr.get("/Font", DictionaryObject()).get_object()
940 # _default_fonts_space_width keys is the list of Standard fonts
941 if font_name not in dr and font_name not in _default_fonts_space_width:
942 # ...or AcroForm dictionary
943 dr = cast(
944 Dict[Any, Any],
945 cast(
946 DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
947 ).get("/DR", {}),
948 )
949 dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
950 font_res = dr.get(font_name, None)
951 if not is_null_or_none(font_res):
952 font_res = cast(DictionaryObject, font_res.get_object())
953 font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
954 200, font_res
955 )
956 try: # remove width stored in -1 key
957 del font_map[-1]
958 except KeyError:
959 pass
960 font_full_rev: Dict[str, bytes]
961 if isinstance(font_encoding, str):
962 font_full_rev = {
963 v: k.encode(font_encoding) for k, v in font_map.items()
964 }
965 else:
966 font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
967 font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
968 for key, value in font_map.items():
969 font_full_rev[value] = font_encoding_rev.get(key, key)
970 else:
971 logger_warning(f"Font dictionary for {font_name} not found.", __name__)
972 font_full_rev = {}
974 # Retrieve field text and selected values
975 field_flags = field.get(FA.Ff, 0)
976 if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
977 txt = "\n".join(annotation.get_inherited(FA.Opt, []))
978 sel = field.get("/V", [])
979 if not isinstance(sel, list):
980 sel = [sel]
981 else: # /Tx
982 txt = field.get("/V", "")
983 sel = []
984 # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
985 txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
986 # Generate appearance stream
987 ap_stream = generate_appearance_stream(
988 txt, sel, da, font_full_rev, rct, font_height, y_offset
989 )
991 # Create appearance dictionary
992 dct = DecodedStreamObject.initialize_from_dictionary(
993 {
994 NameObject("/Type"): NameObject("/XObject"),
995 NameObject("/Subtype"): NameObject("/Form"),
996 NameObject("/BBox"): rct,
997 "__streamdata__": ByteStringObject(ap_stream),
998 "/Length": 0,
999 }
1000 )
1001 if AA.AP in annotation:
1002 for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
1003 if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
1004 dct[k] = v
1006 # Update Resources with font information if necessary
1007 if font_res is not None:
1008 dct[NameObject("/Resources")] = DictionaryObject(
1009 {
1010 NameObject("/Font"): DictionaryObject(
1011 {
1012 NameObject(font_name): getattr(
1013 font_res, "indirect_reference", font_res
1014 )
1015 }
1016 )
1017 }
1018 )
1019 if AA.AP not in annotation:
1020 annotation[NameObject(AA.AP)] = DictionaryObject(
1021 {NameObject("/N"): self._add_object(dct)}
1022 )
1023 elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
1024 cast(DictionaryObject, annotation[NameObject(AA.AP)])[
1025 NameObject("/N")
1026 ] = self._add_object(dct)
1027 else: # [/AP][/N] exists
1028 n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore
1029 self._objects[n - 1] = dct
1030 dct.indirect_reference = IndirectObject(n, 0, self)
1032 if flatten:
1033 field_name = self._get_qualified_field_name(annotation)
1034 self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)
# Flag value with no bit set; used as the default for the ``flags`` parameter
# of update_page_form_field_values, where a falsy value means "leave /Ff alone".
FFBITS_NUL = FA.FfBits(0)
def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document root has no /AcroForm entry, or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    # Only a real bool touches the flag; None leaves it as it is.
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        # Recurse page by page; auto_regenerate=None so the flag (already
        # handled above) is not set again for every page.
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is treated as the field itself;
        # otherwise the field data is looked up on its /Parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            # Match either on the qualified field name or on the bare /T entry.
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # NOTE(review): /I is dropped for choice fields, presumably because
            # it would no longer agree with the new /V - confirm.
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Tuple form: (text, font id, font size); only the text is stored in /V.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # Fall back to "/Off" when the requested state has no normal appearance.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # textbox
                if isinstance(value, tuple):
                    # Forward the font id and font size from the tuple.
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Look for widget annotations carrying ``/FT`` that are not listed in the
    AcroForm ``/Fields`` array and append them to it.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of the annotation dictionaries that were reattached.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    # Make sure /AcroForm and its /Fields array exist, creating them if needed.
    # This happens before the /Annots check, so it applies to every page.
    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        known_fields = cast(
            ArrayObject, acro_form[InteractiveFormDictEntries.Fields]
        )
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = known_fields

    if "/Annots" not in page:
        return reattached
    annots = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annots):
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_listed:
            continue
        if not isinstance(entry, IndirectObject):
            # Direct annotation: register it and store the reference in /Annots.
            annots[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached
def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Clone the reader's document root (and everything reachable from it)
    into this writer. For partial insertion, ``append`` should be
    considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if self.incremental:
        # One slot per object number below the trailer's /Size; every object
        # the reader returns is replicated into its slot.
        slot_count = cast(int, reader.trailer["/Size"]) - 1
        self._objects = [None] * slot_count
        for slot in range(len(self._objects)):
            source_obj = reader.get_object(slot + 1)
            if source_obj is not None:
                self._objects[slot] = source_obj.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if stored is None else stored.hash_bin() for stored in self._objects
        ]
    self._flatten()
    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_ref = cast(IndirectObject, flat_page.indirect_reference)
        self._replace_object(page_ref.idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Replace /Kids with the flat list of page references.
        kids = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )
        page_tree = cast(DictionaryObject, self._pages.get_object())
        page_tree[NameObject("/Kids")] = kids
def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF reader: the document root, the document
    information dictionary and the file identifier.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Optional callback. When callable, it is invoked once for each
            entry of the cloned page tree's ``/Kids`` array, with the
            resolved page object as its single argument.

    """
    self.clone_reader_document_root(reader)
    source_info = reader._info
    if self.incremental:
        if source_info is not None:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the cloned info hash as the baseline for change detection.
            info_hash = self._info.hash_bin()
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = info_hash
    elif source_info is not None:
        info_copy = DictionaryObject(cast(DictionaryObject, source_info.get_object()))
        self._info_obj = self._add_object(info_copy)
    # else: _info_obj = None done in clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        page_tree = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, page_tree["/Kids"]):
            after_page_append(kid.get_object())
def _compute_document_identifier(self) -> ByteStringObject:
    """Return a checksum of the serialized PDF structure as a byte string object."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))
def generate_file_identifiers(self) -> None:
    """
    Set the two-element file identifier (``/ID``) of the PDF to be written.

    When an identifier already exists, its first element is kept and only
    the second one is recomputed; otherwise both elements are set to the
    same freshly computed value.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    See §14.4 "File Identifiers".
    """
    if self._ID:
        first = self._ID[0]
        second = self._compute_document_identifier()
    else:
        first = second = self._compute_document_identifier()
    self._ID = ArrayObject((first, second))
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm`` (after replacing "-" with "_").

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        # No explicit algorithm: choose RC4 by key length.
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        try:
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, owner_password)
    previous_entry = self._encrypt_entry
    if previous_entry:
        # encrypt() was called before: put the new entry in the old entry's slot.
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry
def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to ``stream``.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only if at least one object is new or modified.
    Otherwise the body, the xref table and the trailer are written.

    Args:
        stream: writable object receiving the PDF bytes.

    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        message = (
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly."
        )
        logger_warning(message, __name__)
    # deprecated to be removed in pypdf 6.0.0 :
    # if not self._root:
    #     self._root = self._add_object(self._root_object)
    # self._sweep_indirect_references(self._root)

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    self._reader.stream.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method
        (and therefore already closed), and the stream that was written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        # This method owns the handle, so it must be closed even when
        # serialization fails; otherwise the file descriptor leaks.
        owned = FileIO(stream, "wb")
        try:
            self.write_stream(owned)
        finally:
            owned.close()
        return True, owned

    # Caller-owned stream: flush it but leave it open.
    self.write_stream(stream)
    stream.flush()
    return False, stream
def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.
    List the objects that would be written in the increment: objects stored
    beyond the range of the original hashes, and objects whose current hash
    differs from the recorded one.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline_count = len(self._original_hash)
    changed = []
    for slot, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        if (
            slot >= baseline_count
            or candidate.hash_bin() != self._original_hash[slot]
        ):
            changed.append(cast(IndirectObject, candidate).indirect_reference)
    return changed
def _write_increment(self, stream: StreamType) -> None:
    """
    Append every new or modified object to ``stream``, followed by a
    cross-reference stream object, ``startxref`` and ``%%EOF``.

    Args:
        stream: writable object positioned where the increment starts.

    """
    offsets = {}
    index_ranges = []
    run_start = -1
    run_stop = -2  # one past the last object number of the current run
    baseline_count = len(self._original_hash)
    for slot, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        if (
            slot < baseline_count
            and candidate.hash_bin() == self._original_hash[slot]
        ):
            continue  # unchanged since the baseline hashes were recorded
        obj_num = slot + 1
        assert isinstance(candidate, PdfObject), "mypy"
        # first write new/modified object
        offsets[obj_num] = stream.tell()
        stream.write(f"{obj_num} 0 obj\n".encode())
        # encryption is not operational here; the disabled code was:
        #     if self._encryption and obj != self._encrypt_entry:
        #         obj = self._encryption.encrypt_object(obj, idnum, 0)
        candidate.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: group consecutive object numbers into [start, count]
        if obj_num != run_stop:
            if run_start > 0:
                index_ranges.append([run_start, run_stop - run_start])
            run_start = obj_num
        run_stop = obj_num + 1
    assert run_start > 0, "for pytest only"
    index_ranges.append([run_start, run_stop - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    flat_index = [
        NumberObject(number) for pair in index_ranges for number in pair
    ]
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(flat_index),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is only referenced when it is new or differs from the baseline.
        if (
            info_slot >= baseline_count
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One packed (type=1, offset, generation=0) record per written object.
    xr.set_data(
        b"".join(
            struct.pack(b">BIB", 1, position, 0) for position in offsets.values()
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
    """
    Write the header line, the binary comment line and every stored object.

    Args:
        stream: writable object receiving the bytes.

    Returns:
        A tuple ``(object_positions, free_objects)``: the offset of each
        object (-1 for empty slots), and the numbers of the empty slots
        followed by a terminating 0.

    """
    positions = []
    free_numbers = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for obj_num, item in enumerate(self._objects, start=1):
        if item is None:
            positions.append(-1)
            free_numbers.append(obj_num)
            continue
        positions.append(stream.tell())
        stream.write(f"{obj_num} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and item != self._encrypt_entry:
            item = self._encryption.encrypt_object(item, obj_num, 0)
        item.write_to_stream(stream)
        stream.write(b"\nendobj\n")
    free_numbers.append(0)  # add 0 to loop in accordance with specification
    return positions, free_numbers
def _write_xref_table(
    self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
    """
    Write the classic cross-reference table.

    Args:
        stream: writable object receiving the table.
        object_positions: offset of each object; non-positive values mark
            free entries.
        free_objects: numbers used to chain the free entries, consumed in order.

    Returns:
        The stream position at which the table starts.

    """
    table_start = stream.tell()
    stream.write(b"xref\n")
    stream.write(f"0 {len(self._objects) + 1}\n".encode())
    # Entry 0 is the head of the free list.
    stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
    next_free = 1
    for position in object_positions:
        if position <= 0:
            entry = f"{free_objects[next_free]:0>10} {1:0>5} f \n"
            next_free += 1
        else:
            entry = f"{position:0>10} {0:0>5} n \n"
        stream.write(entry.encode())
    return table_start
def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    The trailer dictionary always holds the size and root entries; the
    info, ID and encrypt entries are added only when available. It is
    followed by ``startxref``, the xref offset and ``%%EOF``.

    Args:
        stream: writable object receiving the trailer.
        xref_location: offset of the cross-reference table.

    """
    stream.write(b"trailer\n")
    required_entries = {
        NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(TK.ROOT): self.root_object.indirect_reference,
    }
    trailer = DictionaryObject(required_entries)
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    When set to ``None`` the /Info entry is removed; when set to a
    dictionary, the existing entries are cleared and replaced by the new ones.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return

    # Start from an empty info dictionary so old entries do not survive.
    current_info = self._info
    if current_info is not None:
        current_info.clear()
    self.add_metadata(value)
def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is converted with ``str()`` and stored as a PDF string
    object; an info dictionary is created if the writer has none yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    # PdfObject values are resolved with get_object() before conversion.
    converted = {
        NameObject(name): create_string_object(
            str(raw.get_object() if isinstance(raw, PdfObject) else raw)
        )
        for name, raw in list(infos.items())
    }
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)
def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects. When False, objects
            that nothing references are left in place.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        """Redirect references found in ``obj`` through ``crossref`` and
        flag every referenced object as not orphaned."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # Honour the caller's choice: previously the flag was ignored and
    # orphans were always removed.
    if not remove_orphans:
        return

    # The root, the info dictionary and the ID are reached from the trailer,
    # not from another object, so protect them explicitly.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # A document without /Info has no info object to protect; dereferencing
    # None here used to raise AttributeError.
    info = self._info
    if info is not None and info.indirect_reference is not None:
        orphans[info.indirect_reference.idnum - 1] = False

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass
    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None
def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated stub kept for compatibility.

    The method no longer sweeps anything: it ignores ``root`` and only
    reports, through ``deprecate``, that the feature has been removed.

    Args:
        root: Unused.

    """
    removal_notice = (
        "_sweep_indirect_references has been removed, please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)
def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated stub kept for compatibility.

    The method no longer resolves anything: it ignores ``data``, reports
    through ``deprecate`` that the feature has been removed, and returns a
    placeholder reference.

    Args:
        data: Unused.

    Returns:
        ``IndirectObject(0, 0, self)``.

    """
    removal_notice = (
        "_resolve_indirect_object has been removed, please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)
    placeholder = IndirectObject(0, 0, self)
    return placeholder
def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to ``obj``, located by searching the
    writer's object list.

    Args:
        obj: object stored in this writer.

    Returns:
        Reference with generation 0 pointing at ``obj``.

    Raises:
        ValueError: If ``obj`` is not in the object list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference
def get_outline_root(self) -> TreeObject:
    """
    Return the root of the document outline, creating and registering an
    empty one in the catalog when none exists.

    An existing outline that is not a ``TreeObject`` is converted to one
    and swapped in under the same object number.

    Returns:
        The outline root.

    """
    if CO.OUTLINES not in self._root_object:
        fresh_outline = TreeObject()
        fresh_outline.update({})
        fresh_ref = self._add_object(fresh_outline)
        self._root_object[NameObject(CO.OUTLINES)] = fresh_ref
        return fresh_outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    # Sanity check: the outline must be one of the writer's own objects.
    position = self._objects.index(outline)
    outline_ref = IndirectObject(position + 1, 0, self)
    assert outline_ref.get_object() == outline
    return outline
def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    If the catalog has no threads entry yet, an empty array is created and
    stored there.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    catalog = self._root_object
    if CO.THREADS not in catalog:
        empty_threads = ArrayObject()
        catalog[NameObject(CO.THREADS)] = empty_threads
        return empty_threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, catalog[CO.THREADS])
@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads; the value comes from
    :meth:`get_threads_root`.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the document outline.

    Args:
        page_destination: the outline item to insert. When it resolves to a
            ``PageObject``, a fit-page ``Destination`` titled
            ``"page #<page number>"`` is built for it and inserted instead.
        parent: outline item under which to insert; the outline root when None.
        before: sibling before which to insert; None is passed through to
            ``insert_child`` unchanged.
        is_open: stored on the item under ``/%is_open%``. When False, a
            no-op counter callback is passed to ``insert_child``.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open: they used to be dropped here, so a
        # page was always added to the outline root, open.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Copy ``outline_item`` into a new ``TreeObject`` and insert it with
    :meth:`add_outline_item_destination`.

    Args:
        outline_item: entries of the outline item to create.
        parent: forwarded unchanged.
        before: forwarded unchanged.
        is_open: forwarded unchanged.

    Returns:
        The indirect reference of the inserted outline item.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # A disabled snippet (marked "code currently unreachable") used to sit
    # here; it copied the item's /A dictionary into a separate indirect
    # object:
    #     if "/A" in outline_item:
    #         action = DictionaryObject()
    #         a_dict = cast(DictionaryObject, outline_item["/A"])
    #         for k, v in list(a_dict.items()):
    #             action[NameObject(str(k))] = v
    #         action_ref = self._add_object(action)
    #         outline_item_object[NameObject("/A")] = action_ref
    return self.add_outline_item_destination(tree_item, parent, before, is_open)
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Create an outline item (a "bookmark") and insert it into the outline.

    Args:
        title: Text shown for the outline item.
        page_number: Target of the item: a page index, a page object, an
            indirect reference, or ``None`` for an item without an action.
        parent: Outline item under which the new one is nested; the outline
            root is used when omitted.
        before: Sibling in front of which the new item is inserted.
        color: Font color, either an ``(r, g, b)`` tuple with components
            from 0.0 to 1.0 or a hex string (``#RRGGBB``).
        bold: Render the title in bold.
        italic: Render the title in italic.
        fit: How the destination page is fitted.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):
        # A Fit in the ``italic`` slot means the caller used the former
        # positional parameter order; shift the arguments and re-enter.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Index outside this writer: keep the plain number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()

        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )
        goto_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): dest.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(goto_action)

    item_object = _create_outline_item(action_ref, title, color, italic, bold)
    outline_item = self._add_object(item_object)

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_outline(self) -> None:
    """Placeholder that always raises; use ``add_outline_item`` instead."""
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title`` / ``destination`` pair into the named destinations.

    The root array holds alternating names and destinations; the new pair
    is placed in front of the first name that compares greater than
    ``title``, or appended when there is none.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    # Names sit at the even positions of the array.
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Insert the value first so that the name ends up in front of it.
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])
def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    Args:
        page_destination: Object providing ``dest_array`` and a ``/Title``
            entry.

    Returns:
        The indirect reference created for the destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_array_ref)

    return dest_array_ref
def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named ``/GoTo`` destination pointing at a page.

    Args:
        title: Name of the destination.
        page_number: Index into the ``/Kids`` array of the page tree root.

    Returns:
        The indirect reference of the created action dictionary.

    """
    page_ref = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    entries = {
        NameObject(GoToActionArguments.D): target,
        NameObject(GoToActionArguments.S): NameObject("/GoTo"),
    }
    goto_action = DictionaryObject()
    goto_action.update(entries)

    action_ref = self._add_object(goto_action)
    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))

    self.add_named_destination_array(title, action_ref)
    return action_ref
def remove_links(self) -> None:
    """Strip every annotation (links included) from all pages."""
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page, ObjectDeletionFlag.ALL_ANNOTATIONS
        )
def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Delete the annotations of the given subtype(s) from every page.

    Args:
        subtypes: A subtype or a collection of subtypes to delete, for
            example "/Link", "/FileAttachment", "/Sound", "/Movie",
            "/Screen", ...
            Pass ``None`` to delete all annotations.

    """
    for current_page in self.pages:
        self._remove_annots_from_page(current_page, subtypes)
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of one page whose subtype is in ``subtypes``.

    Args:
        page: The page (or a reference to it) to process.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    idx = 0
    while idx < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[idx]
        annot = cast(DictionaryObject, entry.get_object())
        if subtypes is not None and cast(str, annot["/Subtype"]) not in subtypes:
            # Not selected: keep it and move on to the next entry.
            idx += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        # The index stays put: the following entry slides into this slot.
        del page[PG.ANNOTS][idx]  # type:ignore
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation flags (links, attachments, 3D objects, all annotations) are
    handled by :meth:`_remove_annots_from_page` and return immediately.
    The other flags are handled by deleting the matching operations from
    the page content stream and from the form XObjects it references.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list/tuple of ``ObjectDeletionFlag``, each processed in turn.
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters; otherwise a TEXT flag given in a sequence
            # would delete all text instead of only the selected fonts.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators whose operations are to be dropped.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    # NOTE(review): when TEXT is set this replaces the drawing operator list
    # rather than extending it - confirm that combining both is not expected.
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        """Delete the selected operations from ``content`` in place."""
        nonlocal jump_operators, to_delete

        # Font selected by the most recent Tf operator seen so far.
        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # For text, honour the optional font filter: delete when no
                # filter is given or when the current font is selected.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    # Index unchanged: the next operation moves into slot i.
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        """
        Process the ``/XObject`` resources of ``elt`` recursively.

        Returns the names of the image XObjects selected for deletion and
        the names of the form XObjects found.
        """
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                # Entries without a usable /Subtype are left untouched.
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Delete images from every page of this output.

    Args:
        to_delete: Kinds of images to delete (all kinds by default).
            A boolean is treated like ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType selection into the same-named deletion flags.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags |= ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)
def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Delete text from every page of the PDF.

    Args:
        font_names: Names of the fonts whose text is deleted, such as
            "Helvetica-Bold". Optional; all text is deleted when omitted
            or empty.
    """
    if not font_names:
        font_names = []

    def collect_fonts(
        node: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        """Walk ``node`` recursively and record the fonts it contains."""
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in node:
                collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                collect_fonts(child, found)
        return found

    for page in self.pages:
        text_filters = {}
        if font_names:
            # Content streams refer to fonts by resource names such as "/F1"
            # or "/T1_0", so the font names are mapped to those names first.
            fonts_on_page = collect_fonts(page.get("/Resources"))
            resource_ids_to_remove = []
            for wanted in font_names:
                if wanted in fonts_on_page:
                    resource_ids_to_remove.extend(fonts_on_page[wanted]["resource_ids"])
            text_filters["font_ids"] = resource_ids_to_remove
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted, the
            border array ``[2, 2, 2]`` is written.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into its four coordinates; a single
        # number object cannot represent the rectangle.
        coordinates = rect.replace(",", " ").strip().strip("[]").split()
        rect = RectangleObject([float(value) for value in coordinates])
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    # Attach the link to the page, creating /Annots when missing.
    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
# Page layout names that ``_set_page_layout`` accepts without logging a warning.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)
def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the root object, or ``None`` when it is absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A plain string that is not one of the valid layouts only triggers a
    warning; it is still written to ``/PageLayout``. A ``NameObject`` is
    stored without any check.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Join with ", " so the message lists the names one by one
            # (the former expression rendered a tuple instead).
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
def set_page_layout(self, layout: LayoutType) -> None:
    """
    Choose how pages are laid out when the document is opened.

    Args:
        layout: Name of the page layout to store.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    self._set_page_layout(layout)
@property
def page_layout(self) -> Optional[LayoutType]:
    """
    The page layout of the document (``None`` when not set).

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Validation and storage are delegated to the private helper.
    self._set_page_layout(layout)
# Page mode names that the ``page_mode`` setter accepts without logging a warning.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)
def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the root object, or ``None`` when it is absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The page mode of the document (``None`` when not set).

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    if not isinstance(mode, NameObject):
        # Unknown names only trigger a warning; they are stored regardless.
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})
def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Attach one new annotation to a page.

    The annotation has to be a fresh one; an annotation that was already
    added cannot be reused.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an int nor a PageObject.

    """
    target_page = page_number
    if isinstance(target_page, int):
        target_page = self.pages[target_page]
    elif not isinstance(target_page, PageObject):
        raise TypeError("page: invalid type")

    new_annot = cast(DictionaryObject, _pdf_objectify(annotation))
    new_annot[NameObject("/P")] = target_page.indirect_reference

    if target_page.annotations is None:
        target_page[NameObject("/Annots")] = ArrayObject()
    assert target_page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = new_annot.get("/Subtype") == "/Link" and "/Dest" in new_annot
    if is_internal_link:
        dest_spec = cast(Dict[Any, Any], new_annot[NameObject("/Dest")])
        link_name = NameObject("/LinkName")
        target_index = dest_spec["target_page_index"]
        # The dict() copy mirrors the original lookup of "fit_args".
        link_fit = Fit(
            fit_type=dest_spec["fit"], fit_args=dict(dest_spec)["fit_args"]
        )
        dest = Destination(link_name, target_index, link_fit)
        new_annot[NameObject("/Dest")] = dest.dest_array

    target_page.annotations.append(self._add_object(new_annot))

    if new_annot.get("/Subtype") == "/Popup" and NameObject("/Parent") in new_annot:
        # Let the parent annotation point back at its popup.
        popup_ref = new_annot.indirect_reference
        popup_parent = cast(DictionaryObject, new_annot["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = popup_ref

    return new_annot
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Tidy up the annotations of a page.

    Currently this converts named destinations given as NameObject into
    TextStringObject (required for names/dests list), both in ``/Dest``
    and in the ``/D`` entry of an ``/A`` action.

    Args:
        page: The page, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        # The action is only inspected when /Dest was not a name.
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or any
            object offering ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and, for a ``PdfReader`` with a truthy
        ``_encryption``, that encryption object (``None`` otherwise).

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        # A string is taken to be a path to the file.
        with FileIO(fileobj, "rb") as source:
            return BytesIO(source.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption or None
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        buffered: IOBase = BytesIO(fileobj.stream.read())

        # reset the stream to its original location
        fileobj.stream.seek(saved_position)
        return buffered, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Concatenate pages of another document onto the end of this one.

    Same as :meth:`merge()<merge>` with no insertion position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, a string used to build an outline item
            (aka 'bookmark') marking the beginning of the included file.
            A page range given here is treated as ``pages``, with the
            following arguments shifted by one position.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: Pass ``False`` to keep the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    if isinstance(outline_item, (tuple, list, PageRange)):
        # The outline title was left out: every argument sits one slot
        # too early, so shift them back before delegating.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_item = None

    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )
def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Besides the pages themselves, the named destinations, the outline,
    the annotations, the AcroForm fields and the article threads that
    relate to the merged pages are brought over as well.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. ``None`` appends the
            pages at the end.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        # NOTE(review): a tuple with more than three items reaches neither
        # branch and is iterated below as is - confirm this is intended.
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the object number of each source page to its copy in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            # Keep the inserted pages in their source order.
            position += 1
        # Remember the source page; its annotations are read from it below.
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    def _process_named_dests(dest: Any) -> None:
        # Register one named destination of the source document in this
        # writer, provided it points at one of the merged pages.
        arr = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            List[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            pass
        elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
            pass
        elif isinstance(dest["/Page"], int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[dest["/Page"]]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                # The page was not part of the merged selection.
                pass
        elif dest["/Page"].indirect_reference.idnum in srcpages:
            # Retarget the destination to the copy of the page.
            arr[NumberObject(0)] = srcpages[
                dest["/Page"].indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Outline node under which the imported outline is inserted.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # NOTE(review): next() raises StopIteration when no page was merged
        # (srcpages empty) - confirm callers never hit this combination.
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # First form merged: clone the AcroForm dictionary without its
            # fields, which are collected separately below.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # Object numbers of the reader mapped to object numbers in this writer.
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        # NOTE(review): the indentation of this assignment is not visible
        # in this view; it is placed after the try/except so that a newly
        # created ``arr`` gets attached - confirm against the file.
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The beads of the source thread are walked through their ``/N`` links,
    starting at ``thread["/F"]``. A new bead is created for every bead whose
    page is resolved by ``_get_cloned_page``; beads on other pages are
    skipped. Each new bead is also appended to the ``/B`` array of its page.

    Args:
        thread: thread entry from the reader's array of threads
        pages: mapping handed to ``_get_cloned_page`` to resolve the cloned
            page of each bead
        reader: reader the thread belongs to

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_first: Optional[DictionaryObject] = None
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first bead kept: it becomes the head of the new thread
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # following beads are chained to the previous one (/V, /N)
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back on the first source bead: close the ring of new beads.
            # When no bead was on a cloned page there is nothing to close.
            if new_article is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference  # type: ignore
                new_first[NameObject("/V")] = new_article.indirect_reference  # type: ignore
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern searched in the title of each thread
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every page of ``pages``, the ``/B`` entries of its ``original_page``
    are inspected. The thread (``/T``) of each entry is added through
    ``_add_articles_thread`` when it is not yet registered in the translation
    table of ``reader`` and its ``/I`` ``/Title`` matches ``fltr``.

    Args:
        fltr: compiled pattern, or string compiled with ``re.compile``;
            any other value is replaced with the empty pattern
        pages: pages whose original ``/B`` entries are inspected
        reader: reader the pages come from

    """
    if isinstance(fltr, str):
        pattern = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        pattern = fltr
    else:
        pattern = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thread = bead.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # looked up on every pass: adding a thread updates the table
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            if pattern.search((thread.get("/I", {})).get("/Title", "")):
                self._add_articles_thread(thread, pages, reader)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Look up the entry of ``pages`` corresponding to ``page``.

    Args:
        page: a ``/Page`` dictionary or an indirect reference; ``None``,
            ``NullObject`` and any other value give ``None``
        pages: mapping searched with the ``idnum`` of the page reference
        reader: not used by this method

    Returns:
        The ``indirect_reference`` of the matching entry of ``pages``,
        or ``None`` when there is no match.

    """
    if page is None or isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        ref = page
    else:
        # neither a page dictionary nor a reference: nothing to look up
        return None
    try:
        return pages[ref.idnum].indirect_reference  # type: ignore
    except (AttributeError, KeyError):
        # page without indirect reference, or page not part of ``pages``
        return None
def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations that can be kept in the writer.

    Annotations without destination are cloned unchanged. For annotations
    carrying a destination (``/Dest``, or ``/D`` of a ``/GoTo`` link action):

    * a named destination is kept only if the name is present in
      ``get_named_dest_root()``;
    * an explicit destination is kept only if its page is resolved by
      ``_get_cloned_page``; the first element of the destination array is
      then replaced with the cloned page.

    Args:
        annots: annotation list, or an indirect reference to it
        page: not used by this method
        pages: mapping handed to ``_get_cloned_page``
        reader: handed to ``_get_cloned_page``

    Returns:
        An ``ArrayObject`` with the retained annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for annot_ref in annots:
        annot = cast("DictionaryObject", annot_ref.get_object())
        # True only for a /Link whose /GoTo action carries the destination
        goto_action_link = not (
            annot["/Subtype"] != "/Link"
            or "/A" not in annot
            or cast("DictionaryObject", annot["/A"])["/S"] != "/GoTo"
            or "/Dest" in annot
        )
        if goto_action_link:
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            dest_key = "/D"
        elif "/Dest" in annot:
            target = annot["/Dest"]
            dest_key = "/Dest"
        else:
            kept.append(self._add_object(annot.clone(self)))
            continue

        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue

        target = cast("ArrayObject", target)
        new_page = self._get_cloned_page(target[0], pages, reader)
        if new_page is None:
            continue
        clone = annot.clone(self, ignore_fields=(dest_key,))
        new_dest = ArrayObject([new_page, *target[1:]])
        if goto_action_link:
            cast("DictionaryObject", clone["/A"])[NameObject("/D")] = new_dest
        else:
            clone[NameObject("/Dest")] = new_dest
        kept.append(self._add_object(clone))
    return kept
def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is kept when its page is resolved by ``_get_cloned_page`` or
    when at least one of its children is kept. The retained children of an
    item are stored in its ``_filtered_children`` attribute.

    Args:
        node: outline root or outline item to start from (``None`` allowed)
        pages: mapping handed to ``_get_cloned_page``
        reader: reader used to build the items (``_build_outline_item``)

    Returns:
        A list of destination objects.

    """
    filtered = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # not an item by itself: continue with its first child, if any
        first_child = node.get("/First", None)
        if first_child is not None:
            first_child = first_child.get_object()
            filtered += self._get_filtered_outline(first_child, pages, reader)
        return filtered

    new_page: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        new_page = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if new_page is None:
            new_page = NullObject()
        item[NameObject("/Page")] = new_page
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            filtered.append(item)
        node = node.get("/Next", None)
    return filtered
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in the writer from ``dest``.

    The title is always copied. When the page of ``dest`` is not a
    ``NullObject``, the ``/A`` entry of the source node is cloned if present,
    otherwise ``dest.dest_array`` is stored as ``/Dest``. ``/F`` and ``/C``
    are taken from the source node, defaulting to 0 and to three 0.0 values.

    Args:
        dest: the destination to clone

    Returns:
        The new outline item, already added to the writer.

    """
    clone = TreeObject()
    self._add_object(clone)
    clone[NameObject("/Title")] = TextStringObject(dest["/Title"])

    has_page = not isinstance(dest["/Page"], NullObject)
    if has_page:
        if dest.node is None or "/A" not in dest.node:
            clone[NameObject("/Dest")] = dest.dest_array
        else:
            clone[NameObject("/A")] = dest.node["/A"].clone(self)
    # TODO: /SE
    if dest.node is not None:
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        clone[NameObject("/F")] = NumberObject(dest.node.get("/F", 0))
        clone[NameObject("/C")] = ArrayObject(dest.node.get("/C", black))
    return clone
def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert the filtered outline items, with their children, under ``parent``.

    Args:
        outlines: items to insert; their ``_filtered_children`` are inserted
            recursively
        parent: outline item (or reference to it) receiving the new items
        before: passed to ``insert_child`` for the items of this level;
            children are always inserted with ``None``

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_item = not (dest.get("/Type", "") == "/Outlines" or "/Title" not in dest)
        if is_item:
            new_parent = self._clone_outline(dest)
            cast(TreeObject, parent.get_object()).insert_child(
                new_parent, before, self
            )
        else:
            # nothing to create for this entry: children go to the same parent
            new_parent = parent
        self._insert_filtered_outline(dest._filtered_children, new_parent, None)
def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None
def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search ``outline_item`` in the outline tree.

    A node matches when its ``indirect_reference`` or its ``/Title`` is equal
    to ``outline_item``. Siblings are followed through ``/Next`` and children
    through ``/First``.

    Args:
        outline_item: value compared with the reference and title of each node
        root: node to start from; the outline root of the writer if omitted

    Returns:
        The list of sibling indexes leading to the item (the index of a node
        without ``/Title`` is left out), or ``None`` if nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        ):
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                own_index = [index] if "/Title" in node else []
                return own_index + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None
def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    Former name of :meth:`find_outline_item`.

    The arguments are ignored; the call is only reported through
    ``deprecation_with_replacement``.

    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    deprecation_with_replacement("find_bookmark", "find_outline_item", "5.0.0")
def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: if ``reader`` is of any other type.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # a reader without table is silently accepted
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        raise Exception(f"invalid parameter {reader}")
def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

            * ``/D`` Decimal Arabic numerals
            * ``/R`` Uppercase Roman numerals
            * ``/r`` Lowercase Roman numerals
            * ``/A`` Uppercase letters (A to Z for the first 26 pages,
              AA to ZZ for the next 26, and so on)
            * ``/a`` Lowercase letters (a to z for the first 26 pages,
              aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
            in the range.
            Subsequent pages are numbered sequentially from this value,
            which must be greater than or equal to 1.
            ``0`` (the default value) and ``None`` are accepted as well
            and are passed on unchanged.

    Raises:
        ValueError: if neither style nor prefix is given, if the range is
            invalid for this document, or if start is negative.

    """
    if style is None and prefix is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")
    explicit_start = start is not None and start != 0
    if explicit_start and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                   /D Decimal Arabic numerals
                   /R Uppercase Roman numerals
                   /r Lowercase Roman numerals
                   /A Uppercase letters (A to Z for the first 26 pages,
                      AA to ZZ for the next 26, and so on)
                   /a Lowercase letters (a to z for the first 26 pages,
                      aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               With ``0`` (the default value) or ``None``, no ``/St`` entry
               is written in the label.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is allowed by the signature: handle it like 0 ("no explicit
    # start") instead of handing it to NumberObject.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # first label of the document: begin with the default label on page 0
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # pages after the range get the default label, unless a label already
    # starts right after the range or the range ends on the last page
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> Dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    This method returns a dictionary that maps a mime-type to its
    representation. The only entry is ``application/pdf``, whose value is
    the ``BytesIO`` object the document has been written to.

    Args:
        include: if given, only the mime-types listed are returned
        exclude: if given, the mime-types listed are removed

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    buffer = BytesIO()
    self.write(buffer)
    bundle = {"application/pdf": buffer}

    if include is not None:
        # Filter representations based on include list
        bundle = {mime: rep for mime, rep in bundle.items() if mime in include}

    if exclude is not None:
        # Remove representations based on exclude list
        bundle = {mime: rep for mime, rep in bundle.items() if mime not in exclude}

    return bundle
def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a Python value into the matching PDF object.

    * a ``PdfObject`` is returned unchanged;
    * a ``dict`` becomes a ``DictionaryObject`` (keys as ``NameObject``,
      values converted recursively);
    * a ``str`` becomes a ``NameObject`` if it starts with ``/``,
      a ``TextStringObject`` otherwise;
    * a ``float`` or an ``int`` becomes a ``FloatObject``;
    * a ``list`` becomes an ``ArrayObject`` of converted items.

    Raises:
        NotImplementedError: for any other type.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(_pdf_objectify(item) for item in obj)
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )
def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: stored as ``/A`` unless it is ``None``
        title: stored as ``/Title``
        color: stored as ``/C`` when truthy; a string is converted
            with ``hex_to_rgb`` first
        italic: adds ``OutlineFontFlag.italic`` to ``/F``
        bold: adds ``OutlineFontFlag.bold`` to ``/F``

    Returns:
        The new outline item; ``/F`` is only set if italic or bold is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        item.update(
            {NameObject("/C"): ArrayObject([FloatObject(c) for c in rgb])}
        )

    if italic or bold:
        font_flags = 0
        if italic:
            font_flags += OutlineFontFlag.italic
        if bold:
            font_flags += OutlineFontFlag.bold
        item.update({NameObject("/F"): NumberObject(font_flags)})
    return item
def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content of the appearance stream showing ``txt``.

    The text is split on ``\\n`` and ``\\r``; each line is shown with a ``Tj``
    operator, the lines being ``font_height * 1.4`` apart. A rectangle is
    drawn around each line that is listed in ``sel``.

    Args:
        txt: text to show
        sel: lines to mark with a box
        da: operators written before the text and again after each box
        font_full_rev: mapping of a character to its encoded bytes;
            characters not in the mapping are encoded in UTF-16-BE
        rct: rectangle whose width and height define the clipping box
        font_height: font size used for the line spacing and the boxes
        y_offset: vertical position of the first line

    Returns:
        The content of the stream.

    """
    parts: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            parts.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            parts.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            parts.append(f"0 {-font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        encoded = b"".join(enc_line)
        if any(len(c) >= 2 for c in enc_line):
            # multi-byte codes: written as a hexadecimal string
            parts.append(b"<" + encoded.hex().encode() + b"> Tj\n")
        else:
            # In a literal string, backslash and parentheses must be escaped,
            # otherwise they end or corrupt the string. The backslash is
            # handled first so that the added escapes are not doubled.
            escaped = (
                encoded.replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            parts.append(b"(" + escaped + b") Tj\n")
    parts.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(parts)