Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import decimal
31import enum
32import hashlib
33import re
34import struct
35import uuid
36from io import BytesIO, FileIO, IOBase
37from itertools import compress
38from pathlib import Path
39from types import TracebackType
40from typing import (
41 IO,
42 Any,
43 Callable,
44 Dict,
45 Iterable,
46 List,
47 Optional,
48 Pattern,
49 Tuple,
50 Type,
51 Union,
52 cast,
53)
55from ._cmap import _default_fonts_space_width, build_char_map_from_dict
56from ._doc_common import DocumentInformation, PdfDocCommon
57from ._encryption import EncryptAlgorithm, Encryption
58from ._page import PageObject, Transformation
59from ._page_labels import nums_clear_range, nums_insert, nums_next
60from ._reader import PdfReader
61from ._utils import (
62 StrByteType,
63 StreamType,
64 _get_max_pdf_version_header,
65 deprecate,
66 deprecate_no_replacement,
67 deprecation_with_replacement,
68 logger_warning,
69)
70from .constants import AnnotationDictionaryAttributes as AA
71from .constants import CatalogAttributes as CA
72from .constants import (
73 CatalogDictionary,
74 FileSpecificationDictionaryEntries,
75 GoToActionArguments,
76 ImageType,
77 InteractiveFormDictEntries,
78 OutlineFontFlag,
79 PageLabelStyle,
80 TypFitArguments,
81 UserAccessPermissions,
82)
83from .constants import Core as CO
84from .constants import FieldDictionaryAttributes as FA
85from .constants import PageAttributes as PG
86from .constants import PagesAttributes as PA
87from .constants import TrailerKeys as TK
88from .errors import PyPdfError
89from .generic import (
90 PAGE_FIT,
91 ArrayObject,
92 BooleanObject,
93 ByteStringObject,
94 ContentStream,
95 DecodedStreamObject,
96 Destination,
97 DictionaryObject,
98 Fit,
99 FloatObject,
100 IndirectObject,
101 NameObject,
102 NullObject,
103 NumberObject,
104 PdfObject,
105 RectangleObject,
106 StreamObject,
107 TextStringObject,
108 TreeObject,
109 ViewerPreferences,
110 create_string_object,
111 hex_to_rgb,
112 is_null_or_none,
113)
114from .pagerange import PageRange, PageRangeSpec
115from .types import (
116 AnnotationSubtype,
117 BorderArrayType,
118 LayoutType,
119 OutlineItemType,
120 OutlineType,
121 PagemodeType,
122)
123from .xmp import XmpInformation
# Value returned by UserAccessPermissions.all()
# (presumably every user-access permission flag combined -- confirm in constants).
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

# Font height substituted for multiline text fields whose default appearance
# declares a font size of 0.
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12
class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of PDF content.

    Each member except ``NONE`` and ``IMAGES`` occupies its own bit, so
    members can be combined with ``|`` and tested with ``&``.
    """

    # No category selected.
    NONE = 0

    # Single-bit categories; ``enum.auto()`` assigns successive powers of two.
    TEXT = enum.auto()
    LINKS = enum.auto()
    ATTACHMENTS = enum.auto()
    OBJECTS_3D = enum.auto()
    ALL_ANNOTATIONS = enum.auto()
    XOBJECT_IMAGES = enum.auto()
    INLINE_IMAGES = enum.auto()
    DRAWING_IMAGES = enum.auto()

    # Convenience mask: the union of the three image categories above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Return the MD5 hex digest of the remaining content of ``stream``.

    The stream is consumed from its current position in chunks, so the whole
    payload never has to be held in memory at once.

    Args:
        stream: Binary stream to read until it is exhausted.
        blocksize: Number of bytes requested per read.

    Returns:
        The hexadecimal MD5 digest of everything that was read.
    """
    digest = hashlib.md5()
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            # An empty read marks the end of the stream.
            break
        digest.update(chunk)
    return digest.hexdigest()
149class PdfWriter(PdfDocCommon):
150 """
151 Write a PDF file out, given pages produced by another class or through
152 cloning a PDF file during initialization.
154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
156 Args:
157 clone_from: identical to fileobj (for compatibility)
159 incremental: If true, loads the document and set the PdfWriter in incremental mode.
161 When writing incrementally, the original document is written first and new/modified
162 content is appended. To be used for signed document/forms to keep signature valid.
164 full: If true, loads all the objects (always full if incremental = True).
165 This parameter may allow loading large PDFs.
167 """
def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Initialise the writer, optionally cloning an existing document.

    Args:
        fileobj: Path, stream or reader. In the loading branch it is the
            document to load; otherwise it is cloned only when it designates
            non-empty content, and it is always remembered in
            ``temp_fileobj``.
        clone_from: Explicit document to clone.
        incremental: Start the writer in incremental mode.
        full: Take the same loading branch as ``incremental``; when only
            ``full`` is set, ``self.incremental`` is switched back to False
            once loading is done.

    Raises:
        PyPdfError: In the loading branch, when ``fileobj`` cannot be turned
            into a PdfReader.
    """
    # True while the loading branch below has to be taken (see ``full``).
    self.incremental = incremental or full

    # The indirect objects in the PDF.
    # For the incremental case, it will be filled with None
    # in clone_reader_document_root.
    self._objects: List[Optional[PdfObject]] = []

    # List of hashes after import; used to identify changes.
    self._original_hash: List[int] = []

    # Maps hash values of indirect objects to the list of IndirectObjects.
    # This is used for compression.
    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}

    # Already translated IDs: dict[id(pdf)][(idnum, generation)]
    self._id_translated: Dict[int, Dict[int, int]] = {}

    # The PDF file's document information dictionary,
    # the Info entry in the PDF file's trailer dictionary.
    self._info_obj: Optional[PdfObject]

    # The PDF file identifier,
    # defined by the ID in the PDF file's trailer dictionary.
    self._ID: Union[ArrayObject, None] = None

    if self.incremental:
        # Normalise path -> in-memory stream -> reader, one step at a time.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        producer_info = DictionaryObject(
            {NameObject("/Producer"): create_string_object("pypdf")}
        )
        self._info_obj = self._add_object(producer_info)

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which of the two arguments (if any) is the clone source.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        has_content = True
        if isinstance(fileobj, (str, Path)):
            candidate = Path(str(fileobj))
            # A missing or empty file is an output target, not a source.
            if not candidate.exists() or candidate.stat().st_size == 0:
                has_content = False
        if isinstance(fileobj, (IOBase, BytesIO)):
            position = fileobj.tell()
            # Seeking to the end yields the stream size; 0 means empty.
            if fileobj.seek(0, 2) == 0:
                has_content = False
            fileobj.seek(position, 0)
        return fileobj if has_content else clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is None:
        # Fresh document: register the empty page tree and a catalog for it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    else:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True

    if full and not incremental:
        self.incremental = False

    if isinstance(self._ID, list):
        # Store text-typed identifier entries as byte strings.
        for position in (0, 1):
            if isinstance(self._ID[position], TextStringObject):
                self._ID[position] = ByteStringObject(
                    self._ID[position].get_original_bytes()
                )
# Present so the writer offers the same attribute as the reader.
@property
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    This implementation always returns ``False``.
    """
    return False
@property
def root_object(self) -> DictionaryObject:
    """
    Direct access to the PDF structure held in ``_root_object``.

    Note:
        Recommended only for read access.
    """
    catalog = self._root_object
    return catalog
@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfReader.

    Returns:
        The resolved /Info dictionary, or None when no info object is set.
    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace the content of "/Info", or drop it when ``value`` is None.

    Args:
        value: New dictionary (or a reference to one); its entries are copied
            into the writer's own info dictionary. None removes the entry.
    """
    if value is None:
        try:
            # Free the slot held by the current info object, if there is one.
            self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
        except (KeyError, AttributeError):
            pass
        self._info_obj = None
        return

    if self._info_obj is None:
        self._info_obj = self._add_object(DictionaryObject())
    target = cast(DictionaryObject, self._info_obj.get_object())
    # Keep the existing object and swap only its content.
    target.clear()
    target.update(cast(DictionaryObject, value.get_object()))
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Set or remove the XMP (Extensible Metadata Platform) data.

    Args:
        value: Metadata stored under "/Metadata" in the root object;
            None deletes the entry when it exists.
    """
    root = self.root_object
    if value is not None:
        root[NameObject("/Metadata")] = value
    elif "/Metadata" in root:
        del root["/Metadata"]

    # The property is still evaluated afterwards, as before; a value
    # returned from a setter is discarded by Python.
    return self.root_object.xmp_metadata  # type: ignore
@property
def with_as_usage(self) -> bool:
    """Deprecated accessor for the flag set when the writer is used via ``with``."""
    deprecate_no_replacement("with_as_usage", "6.0")
    flag = self._with_as_usage
    return flag

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated mutator for the ``with``-usage flag."""
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value
def __enter__(self) -> "PdfWriter":
    """
    Store how writer is initialized by 'with'.

    The writer is re-initialised with default arguments; the cloned flag is
    carried over and the file object remembered at construction becomes the
    target written on exit.
    """
    was_cloned: bool = self._cloned
    remembered_target = self.temp_fileobj
    # Reset the complete state before restoring what must survive.
    self.__init__()  # type: ignore
    self._cloned = was_cloned
    self._with_as_usage = True
    self.fileobj = remembered_target  # type: ignore
    return self
def __exit__(
    self,
    exc_type: Optional[Type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Write data to the fileobj.

    Nothing is written when no target is set or when the writer was created
    by cloning. The exception arguments are not inspected.
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)
@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    This should be something like ``'%PDF-1.5'``. It is recommended to set
    the lowest version that supports all features which are used within the
    PDF file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    header_text = self._header.decode()
    return header_text

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding ``str`` input first."""
    self._header = (
        new_header.encode() if isinstance(new_header, str) else new_header
    )
def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Args:
        obj: Object to store. An object already owned by this writer is not
            stored a second time.

    Returns:
        The indirect reference that designates ``obj`` in this writer.
    """
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf == self:  # type: ignore
        return current_ref  # type: ignore

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        contents = obj.get(PG.CONTENTS, None)
        if isinstance(contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new object's number equals the list size.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference
def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored under an object number or indirect reference.

    Args:
        indirect_reference: 1-based object number, or a reference that must
            belong to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the reference belongs to another document.
    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj
def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot designated by ``indirect_reference``.

    Args:
        indirect_reference: 1-based object number, or a reference that must
            belong to this writer.
        obj: Replacement object; it is cloned into this writer first when it
            is owned by another document.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: If the reference belongs to another document.
    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    # The replacement keeps the generation number of the object it replaces.
    gen = self._objects[slot].indirect_reference.generation  # type: ignore

    owner_ref = getattr(obj, "indirect_reference", None)
    if owner_ref is not None and owner_ref.pdf != self:  # type: ignore
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj
def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link the clone into the page tree.

    Args:
        page: Page to add; it must be a PageObject whose type is a page.
        index: Position in the flattened list of pages.
        excluded_keys: Keys left out of the clone, in addition to the parent
            link and "/StructParents".

    Returns:
        The clone that now belongs to this writer.

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If more than 1000 ancestors are walked while updating
            the page counts.
    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    skipped_keys = [*excluded_keys, PA.PARENT, "/StructParents"]

    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(source_page.indirect_reference.pdf)][  # type: ignore
            source_page.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        pass
    new_page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )

    if source_page.pdf is not None:
        # Raise the header version when the source document needs a newer one.
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, source_header)

    node, idx = self._get_page_in_node(index)
    new_page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx >= 0:
        kids.insert(idx, new_page.indirect_reference)
        self.flattened_pages.insert(index, new_page)
    else:
        # A negative position means "append".
        kids.append(new_page.indirect_reference)
        self.flattened_pages.append(new_page)

    # Every ancestor of the new page now holds one more page.
    depth = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        depth += 1
        if depth > 1000:
            raise PyPdfError("Too many recursive calls!")
    return new_page
def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Sets the "NeedAppearances" flag in the PDF writer.

    The "NeedAppearances" flag indicates whether the appearance dictionary
    for form fields should be automatically generated by the PDF viewer or
    if the embedded appearance should be used.

    An AcroForm dictionary is created in the root object when none exists.
    Failures are logged as warnings instead of being raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None
    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        root = self._root_object
        if CatalogDictionary.ACRO_FORM not in root:
            root[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )
def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create an empty viewer-preferences object and attach it to the root object.

    Returns:
        The new ViewerPreferences instance, stored in the root object through
        an indirect reference.
    """
    preferences = ViewerPreferences()
    key = NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    self._root_object[key] = self._add_object(preferences)
    return preferences
def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Add a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys passed on to ``_add_page`` to be left out of the
            copy.

    Returns:
        The added PageObject.
    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative values
            count from the end; values at or past the end append the page.
        excluded_keys: Keys passed on to be left out of the copy.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative ``index`` reaches before the first page.
    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        # Translate "from the end" into an absolute position.
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index < page_count:
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Resolve a reference and report the number of the page it points to.

    Offers the same function as the one in PdfReader.

    Args:
        indirect_reference: Object number or reference to resolve; a plain
            number is treated as generation 0 of this writer.

    Returns:
        The page number, or None when the reference is null/None or does not
        resolve to a page.
    """
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    target = indirect_reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None
def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    The page is built by ``PageObject.create_blank_page``, which receives
    this writer together with the requested size; that helper decides what
    to do when a dimension is missing (documented as: use the size of the
    last page).

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
    """
    blank = PageObject.create_blank_page(self, width, height)
    return self.add_page(blank)
def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    When a dimension is missing and a page already exists at ``index``, the
    size of that page is used for both dimensions. Otherwise the size is
    resolved by ``PageObject.create_blank_page``.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the object returned by
        ``insert_page``.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
    """
    # Borrow the size from the page at ``index`` only when such a page
    # exists. The previous condition,
    # ``width is None or (height is None and index < n)``,
    # indexed ``self.pages`` whenever ``width`` was None, which raised
    # IndexError on an empty document instead of the documented error.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # Return what insert_page hands back (the page registered in this
    # writer), consistent with add_blank_page.
    return self.insert_page(page, index)
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Destination shown when the document is opened (parent implementation)."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    Args:
        dest: None removes the entry; a string is stored as text; a
            Destination contributes its destination array; a page is wrapped
            in a page-fit destination named "Opening". Any other type leaves
            the entry untouched.
    """
    key = NameObject("/OpenAction")
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        self._root_object[key] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[key] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        # A page without a reference is represented by a null object.
        if dest.indirect_reference is not None:
            page_ref = dest.indirect_reference
        else:
            page_ref = NullObject()
        self._root_object[key] = Destination("Opening", page_ref, PAGE_FIT).dest_array
def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is registered under a freshly generated UUID name in the
    JavaScript name list of the root object, so several scripts can be added.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.
    """
    # Names / JavaScript preferred to be able to add multiple scripts
    root = self._root_object
    if "/Names" not in root:
        root[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, root[CA.NAMES])

    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, javascript_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    js_list.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    js_list.append(self._add_object(action))
def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Three pieces are produced: a stream holding the file's data, a
    /Filespec dictionary pointing to that stream, and a (name, filespec)
    pair appended to the embedded-files name list of the root object.

    Args:
        filename: The filename to display.
        data: The data in the file. Text is encoded as latin-1.
    """
    # 1. The stream carrying the payload (type /EmbeddedFile).
    if isinstance(data, str):
        data = data.encode("latin-1")
    file_entry = DecodedStreamObject()
    file_entry.set_data(data)
    file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})

    # 2. The /Filespec entry: /F holds the displayed name, /EF holds a
    #    dictionary whose /F refers to the stream registered above.
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): self._add_object(file_entry)})

    filespec = DictionaryObject()
    filespec.update(
        {
            NameObject(PA.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
                filename
            ),  # Perhaps also try TextStringObject
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # 3. The entry in the root object: /Names -> /EmbeddedFiles -> /Names,
    #    a flat array of alternating names and file specifications.
    if CA.NAMES not in self._root_object:
        self._root_object[NameObject(CA.NAMES)] = self._add_object(
            DictionaryObject()
        )
    names_dictionary = cast(DictionaryObject, self._root_object[CA.NAMES])

    if "/EmbeddedFiles" in names_dictionary:
        embedded_files_names_dictionary = cast(
            DictionaryObject, names_dictionary["/EmbeddedFiles"]
        )
    else:
        embedded_files_names_dictionary = DictionaryObject(
            {NameObject(CA.NAMES): ArrayObject()}
        )
        names_dictionary[NameObject("/EmbeddedFiles")] = self._add_object(
            embedded_files_names_dictionary
        )

    name_list = cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES])
    name_list.extend([create_string_object(filename), filespec])
def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to the end of this writer.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are added, in order.
        after_page_append: Optional callback invoked after each page has
            been added. Its single argument is the page object returned by
            ``add_page`` for the page just appended. Non-callable values
            are ignored.
    """
    page_total = len(reader.pages)
    position = 0
    while position < page_total:
        appended_page = self.add_page(reader.pages[position])
        # Report the writer-side page, not the reader-side one.
        if callable(after_page_append):
            after_page_append(appended_page)
        position += 1
def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) after the existing content of ``page``.

    The page's "/Contents" entry is updated in place; nothing is returned.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    contents_key = NameObject("/Contents")

    def _stream_with(payload: bytes) -> StreamObject:
        # Wrap raw bytes into a fresh stream object.
        stream = StreamObject()
        stream.set_data(payload)
        return stream

    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        page[NameObject("/Contents")] = self._add_object(
            _stream_with(new_content_data)
        )
        return

    # Resolve the existing page content first (PDF Explained by John
    # Whitington states that it always is an indirect object):
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    existing_content = page[contents_key].get_object()

    if isinstance(existing_content, ArrayObject):
        # Array of streams: register the new stream and add it to the array.
        existing_content.append(self._add_object(_stream_with(new_content_data)))
        page[NameObject("/Contents")] = self._add_object(existing_content)
    elif isinstance(existing_content, StreamObject):
        # Single stream: build one stream holding old data, newline, new data.
        combined = existing_content.get_data() + b"\n" + new_content_data
        page[NameObject("/Contents")] = self._add_object(_stream_with(combined))
def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Adds an appearance stream to the page content in the form of
    an XObject.

    The stream is registered in the page's XObject resources under a name
    derived from ``object_name`` and drawn by commands merged into the page
    content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    page_resources = cast(DictionaryObject, page[PG.RESOURCES])

    # Make the font of the appearance stream available to the page.
    if font_res is not None:
        font_name = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in page_resources:
            page_resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, page_resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res

    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in page_resources:
        page_resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, page_resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    # Draw the XObject at the requested offset, inside a saved graphics state.
    placement = Transformation().translate(x_offset, y_offset)
    drawing_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, drawing_commands)
def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Regenerate the normal appearance stream (``/AP /N``) of a text or choice widget.

    The default appearance string (``/DA``) is parsed to find the font and
    size, the font resource is looked up in ``/DR``, the field value is
    rendered through ``generate_appearance_stream`` and the resulting form
    XObject is stored as the annotation's ``/AP /N`` entry.

    Args:
        page: Page owning the annotation; only used when ``flatten`` is True.
        field: Field dictionary from which ``/Ff``, ``/FT`` and ``/V`` are read.
        annotation: Widget annotation whose ``/Rect``, ``/DA``, ``/DR``,
            ``/Opt`` and ``/AP`` entries are read and whose ``/AP`` is updated.
        font_name: Font resource name overriding the one found in ``/DA``;
            an empty string keeps the font named in ``/DA``.
        font_size: Font size to use; a negative value takes the size from
            ``/DA``. A resulting size of 0 triggers automatic sizing.
        flatten: If True, the generated appearance is additionally added to
            the page through ``_add_apstream_object``.

    Raises:
        ValueError: If the ``/DA`` string contains no ``Tf`` operator
            (raised by ``list.index``).

    """
    # Calculate rectangle dimensions
    _rct = cast(RectangleObject, annotation[AA.Rect])
    # Same size as the annotation rectangle, but anchored at the origin.
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    # Extract font information
    # The annotation's (possibly inherited) /DA wins; the AcroForm /DA is the fallback.
    da = annotation.get_inherited(
        AA.DA,
        cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
            AA.DA, None
        ),
    )
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    # Tokenize the DA string on spaces/newlines, dropping empty tokens.
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    # The two tokens preceding the "Tf" operator are the font name and the font size.
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    font_height = (
        font_size
        if font_size >= 0
        else float(font_properties[font_properties.index("Tf") - 1])
    )
    if font_height == 0:
        # Size 0 means "auto": fixed default for multiline fields, otherwise fit the box height.
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    # Rebuild the DA string with the effective font name and size.
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                cast(
                    DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
                ).get("/DR", DictionaryObject()),
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary
        dr = cast(
            Dict[Any, Any],
            cast(
                DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
            ).get("/DR", {}),
        )
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    if not is_null_or_none(font_res):
        font_res = cast(DictionaryObject, font_res.get_object())
        font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        try:  # remove width stored in -1 key
            del font_map[-1]
        except KeyError:
            pass
        # font_full_rev maps a character to the bytes written in the content stream.
        font_full_rev: Dict[str, bytes]
        if isinstance(font_encoding, str):
            # Named codec: encode each mapped key with that codec.
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            # Table encoding: invert it, then let font_map entries override.
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        # Choice field without the Combo flag: show every option, one per line.
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over entries of the previous /N appearance, except those regenerated above.
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary
    # NOTE(review): this tests `is not None` while the lookup above used
    # is_null_or_none(); a null-object font entry would pass this check — confirm intended.
    if font_res is not None:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        # Overwrite the existing object slot so the existing reference stays valid.
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        # The original /Rect lower-left corner is passed as the x/y offset.
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)
# Empty flag set; used as the default value of the ``flags`` parameter of
# ``update_page_form_field_values``.
FFBITS_NUL = FA.FfBits(0)
def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the writer has no /AcroForm dictionary, or the
            /AcroForm dictionary has no fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        # Recurse page by page; auto_regenerate is passed as None so the flag is not set again.
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is treated as its own field;
        # otherwise the field data is looked up on its parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            # Match either the fully qualified field name or the partial /T name.
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # For choice fields, drop /I before /V is rewritten below.
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Tuple form is (text, font id, font size); only the text is stored in /V.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # Fall back to /Off when the requested state has no normal appearance.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # textbox
                if isinstance(value, tuple):
                    # value[1] is the font id and value[2] the font size.
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Find orphan widget fields among page annotations and register them
    in the AcroForm fields array.

    The AcroForm dictionary and its fields array are created when missing.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        # No page given: process every page and accumulate the results.
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        known_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = known_fields

    if "/Annots" not in page:
        return reattached

    annots = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annots):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        # Only widgets that carry a field type are candidates.
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_registered = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_registered:
            continue
        if not was_indirect:
            # Inline annotation: register it as an object and reference it from the page.
            annots[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached
def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    The writer's info object reference is reset to None and its object list
    is replaced. In incremental mode every reader object is replicated at
    the same position and a hash of each object is recorded.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if self.incremental:
        # One slot per object number 1 .. /Size - 1, so reader object numbers are kept.
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for i in range(len(self._objects)):
            o = reader.get_object(i + 1)
            if o is not None:
                self._objects[i] = o.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        # Per-object hash snapshot (0 for empty slots), compared later to find modified objects.
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]
    self._flatten()
    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        # Store the flattened page object under its own object number.
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Non-incremental: every page becomes a direct kid of the root /Pages node.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])
def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: the '/Root', '/Info' and '/ID'
    sections of the pdf are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function invoked once for each page of the cloned
            document. Its single parameter is a reference to the page.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info so it counts as unmodified.
            info_hash = self._info.hash_bin()
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = info_hash
        else:
            self._info_obj = self._add_object(
                DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            )
    # otherwise _info_obj was already reset to None by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())
def _compute_document_identifier(self) -> ByteStringObject:
    """Return a checksum of the serialized PDF structure as a byte string object."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))
def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    Uniqueness is the only goal; reproducibility is not required.
    When a file is first written, both identifiers shall be set to the same
    value. When an identifier already exists, the first one is kept and only
    the second one is recomputed. If both identifiers match when a file
    reference is resolved, it is very likely that the correct and unchanged
    file has been found. If only the first identifier matches, a different
    version of the correct file has been found.
    see §14.4 "File Identifiers".
    """
    existing = self._ID
    if existing:
        identifiers = (existing[0], self._compute_document_identifier())
    else:
        fresh = self._compute_document_identifier()
        identifiers = (fresh, fresh)
    self._ID = ArrayObject(identifiers)
def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        # No explicit algorithm: choose the RC4 key length from use_128bit.
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        member_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, member_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])

    new_entry = self._encryption.write_entry(user_password, owner_password)
    previous_entry = self._encrypt_entry
    if previous_entry:
        # `encrypt` was called before: reuse the object slot of the old entry.
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry
def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when objects were added or modified.
    Otherwise the full structure, xref table and trailer are written.

    Args:
        stream: Destination stream; a warning is logged when it exposes a
            ``mode`` attribute that does not contain ``"b"``.

    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    # deprecated to be removed in pypdf 6.0.0 :
    # if not self._root:
    #     self._root = self._add_object(self._root_object)
    # self._sweep_indirect_references(self._root)

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(self._reader.stream.read(-1))
    if self.list_objects_in_increment():
        self._write_increment(stream)  # writes objs, xref stream and startxref
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened (and closed) by this
        method, and the stream that was written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must be closed even when writing fails,
        # otherwise the handle leaks.
        if my_file:
            stream.close()

    if not my_file:
        # Caller-owned streams are only flushed, never closed.
        stream.flush()

    return my_file, stream
def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.
    List the new or modified objects that an incremental write would emit.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    known_count = len(self._original_hash)
    changed = []
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        # Objects within the original range count only when their hash differs.
        if index < known_count and obj.hash_bin() == self._original_hash[index]:
            continue
        changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed
def _write_increment(self, stream: StreamType) -> None:
    """
    Append an incremental update to ``stream``.

    Writes every new or modified object, then a cross-reference stream
    object listing them, followed by ``startxref`` and ``%%EOF``.

    Args:
        stream: Output stream; its current position is used for the offsets.

    """
    object_positions = {}
    # Each entry is [first object number, count], later flattened into /Index.
    object_blocks = []
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        # An object is emitted when it is new (beyond the original hashes) or its hash changed.
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            # A gap in the object numbers closes the current block and starts a new one.
            if idnum != current_stop:
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # The xref stream takes the object number following the last writer object.
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is only referenced when the info object is new or modified.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One record per written object, matching /W [1 4 1]:
    # 1-byte value 1, 4-byte big-endian offset, 1-byte value 0.
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
    """
    Write the header and every object of the document to ``stream``.

    Args:
        stream: Output stream.

    Returns:
        A tuple of the byte offset of each object slot (-1 for empty slots)
        and the object numbers of the empty slots, terminated by 0.

    """
    offsets = []
    unused_ids = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            offsets.append(-1)
            unused_ids.append(idnum)
            continue
        offsets.append(stream.tell())
        stream.write(f"{idnum} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and obj != self._encrypt_entry:
            obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

    unused_ids.append(0)  # add 0 to loop in accordance with specification
    return offsets, unused_ids
def _write_xref_table(
    self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
    """
    Write the classic cross-reference table to ``stream``.

    Args:
        stream: Output stream.
        object_positions: Byte offset of each object slot; values that are
            not positive mark free slots.
        free_objects: Object numbers forming the free list chain.

    Returns:
        The stream position at which the table starts.

    """
    xref_location = stream.tell()
    stream.write(b"xref\n")
    entry_count = len(self._objects) + 1
    stream.write(f"0 {entry_count}\n".encode())
    # Entry 0 is the head of the free list.
    stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
    next_free = 1
    for position in object_positions:
        if position <= 0:
            # Free slot: point to the next free object number.
            entry = f"{free_objects[next_free]:0>10} {1:0>5} f \n"
            next_free += 1
        else:
            entry = f"{position:0>10} {0:0>5} n \n"
        stream.write(entry.encode())
    return xref_location
def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Output stream.
        xref_location: Position of the cross-reference table, written after
            ``startxref``.
    """
    stream.write(b"trailer\n")
    object_count = len(self._objects) + 1
    trailer_dict = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(object_count),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    # Optional entries are added only when the writer holds them.
    if self._info is not None:
        trailer_dict[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer_dict[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer_dict[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer_dict.write_to_stream(stream)
    footer = f"\nstartxref\n{xref_location}\n%%EOF\n"
    stream.write(footer.encode())  # eof
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Reading delegates to the parent class implementation of ``metadata``.

    Args:
        value: (when assigning) dict with the entries to be set.
            If None, the /Info entry is removed from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata
@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    # None removes the info dictionary; any other value replaces its content.
    if value is None:
        self._info = None
        return

    current_info = self._info
    if current_info is not None:
        current_info.clear()
    self.add_metadata(value)
def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Each value is converted with ``str`` and stored as a string object; the
    info dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for key, raw_value in list(infos.items()):
        # Resolve PDF objects before stringifying them.
        resolved = raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        converted[NameObject(key)] = create_string_object(str(resolved))

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)
def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk dictionaries/arrays recursively, flag every referenced object
        # as "not orphan" and redirect references to merged duplicates.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            # Duplicate of an earlier object: remember it and free its slot.
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for kept, duplicates in self._idnum_hash.values():
        for duplicate in duplicates:
            cnv_rev[duplicate] = kept

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        # Honour the documented flag: keep unreferenced objects.
        return

    # Objects referenced only from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # The writer may have no /Info dictionary at all.
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    # The encryption dictionary is referenced from the trailer as well.
    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None
def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    The method no longer sweeps anything: ``root`` is ignored and the body
    only calls ``deprecate`` with a removal notice.

    Historically, this method resolved circular references to Page objects
    (e.g. annotations referring to their page) so that the output did not
    contain multiple copies of the same Page object.

    Args:
        root: The root of the PDF object tree to sweep (unused).

    """
    deprecate(
        "_sweep_indirect_references has been removed, please report to dev team if this warning is observed",
    )
def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    The method no longer resolves anything: ``data`` is ignored, ``deprecate``
    is called with a removal notice, and a placeholder reference with object
    number 0 and generation 0 bound to this writer is returned.

    Historically, this method mapped an indirect object of another PDF file
    to an indirect object of this PDF file, adding it when missing.

    Args:
        data: The `IndirectObject` to resolve (unused).

    Returns:
        ``IndirectObject(0, 0, self)``.

    """
    deprecate(
        "_resolve_indirect_object has been removed, please report to dev team if this warning is observed",
    )
    return IndirectObject(0, 0, self)
def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return the indirect reference of an object stored in this writer.

    Args:
        obj: Object to look up in the writer's object list.

    Returns:
        Reference whose object number is the object's list position plus one.

    Raises:
        ValueError: If ``obj`` is not in the object list (from ``list.index``).

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference
def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document, creating it when missing.

    An existing catalog entry that is not a ``TreeObject`` is converted
    to one and replaces the original object.

    Returns:
        The outline root tree.

    """
    if CO.OUTLINES not in self._root_object:
        # No outline yet: create an empty tree and register it in the catalog.
        new_root = TreeObject()
        new_root.update({})
        new_root_ref = self._add_object(new_root)
        self._root_object[NameObject(CO.OUTLINES)] = new_root_ref
        return new_root

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    outline_ref = IndirectObject(self._objects.index(outline) + 1, 0, self)
    assert outline_ref.get_object() == outline
    return outline
def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
    An empty array is stored in the catalog when no thread list exists yet.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        new_threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = new_threads
        return new_threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])
@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.

    This is a thin wrapper around :meth:`get_threads_root`.
    """
    return self.get_threads_root()
def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item object into the outline tree.

    Args:
        page_destination: The outline item to insert. When it resolves to a
            ``PageObject``, a ``Destination`` titled ``page #<n>`` pointing
            to that page is built and inserted instead.
        parent: Outline item that receives the new child; the outline root
            is used when ``None``.
        before: Existing outline item in front of which the new one is
            inserted; passed to ``insert_child`` (``None`` is passed through
            unchanged).
        is_open: Stored on the item under ``/%is_open%``. When ``False`` a
            no-op counter callback is handed to ``insert_child``.

    Returns:
        The indirect reference of the inserted outline item.
    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open as well, so that the caller's
        # placement and open-state choices are honoured for page inputs.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a fresh ``TreeObject``
    which is then handed to :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping with the entries of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The value returned by :meth:`add_outline_item_destination`.
    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # NOTE: code that copied the "/A" action dictionary into its own
    # indirect object used to sit here but was disabled (marked as
    # currently unreachable); "/A" is therefore kept as copied above.
    return self.add_outline_item_destination(tree_item, parent, before, is_open)
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            ``None`` creates an outline item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Existing outline item in front of which the new item is
            inserted; forwarded to :meth:`add_outline_item_destination`.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        # Start from None so that an unsupported page_number type ends up
        # in the warning branch below instead of raising UnboundLocalError.
        page_ref = None
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # keep the raw page number when the page does not exist
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)
def add_outline(self) -> None:
    """
    Placeholder that is not implemented.

    Raises:
        NotImplementedError: Always.
    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register a named destination in the named-destination list.

    The list returned by ``get_named_dest_root()`` is walked two entries at
    a time (name, destination). The new pair is inserted in front of the
    first name that compares greater than ``title``; otherwise it is
    appended at the end.

    Args:
        title: Name of the destination.
        destination: The destination array or a reference to it.
    """
    names = self.get_named_dest_root()
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # two inserts at the same index leave [title, destination, ...]
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])
def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    Args:
        page_destination: Object providing ``dest_array`` and a ``/Title``
            entry; the title is used as the destination name.

    Returns:
        The reference of the newly added destination array.
    """
    dest_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_ref)

    return dest_ref
def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Add a named destination pointing to a page.

    A GoTo action dictionary targeting the page (fit type
    ``TypFitArguments.FIT_H`` with value 826) is added to the document and
    registered under ``title``.

    Args:
        title: Name of the destination.
        page_number: Index into the ``PA.KIDS`` array of the pages object.

    Returns:
        The reference of the created action dictionary.
    """
    target_page = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    goto_action = DictionaryObject()
    goto_action.update(
        {
            NameObject(GoToActionArguments.D): ArrayObject(
                [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
            ),
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )

    action_ref = self._add_object(goto_action)
    # the name list expects TextStringObject names
    dest_name = title if isinstance(title, TextStringObject) else TextStringObject(str(title))

    self.add_named_destination_array(dest_name, action_ref)
    return action_ref
def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page is processed with ``ObjectDeletionFlag.ALL_ANNOTATIONS``.
    """
    all_annotations = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, all_annotations)
def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    for current_page in self.pages:
        self._remove_annots_from_page(current_page, subtypes)
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations of the given subtypes from one page.

    Args:
        page: The page (or a reference to it) to clean.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.
    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return
    idx = 0
    while idx < len(cast(ArrayObject, page[PG.ANNOTS])):
        annot = cast(ArrayObject, page[PG.ANNOTS])[idx]
        annot_dict = cast(DictionaryObject, annot.get_object())
        if subtypes is not None and cast(str, annot_dict["/Subtype"]) not in subtypes:
            # kept: move on to the next entry
            idx += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        # the array shrinks, so idx already points at the next entry
        del page[PG.ANNOTS][idx]  # type:ignore
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list/tuple of ObjectDeletionFlag (each entry is processed
            in turn with the same ``text_filters``).
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # forward the filters; otherwise a filtered text removal would
            # silently turn into removing all text
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-based deletions are handled on the /Annots array and return early.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators whose operations are candidates for removal.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        # Drop the matching operations from one content stream, in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # remember the font selected for the following text operators
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        # Walk the /XObject resources of ``elt``; returns the image and form keys found.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` value is treated
            as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # translate the selected ImageType members into same-named deletion flags
    selected = [
        kind
        for kind in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES")
        if to_delete & ImageType[kind]
    ]
    flags = ObjectDeletionFlag.NONE
    for kind in selected:
        flags |= ObjectDeletionFlag[kind]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, flags)
def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    font_names = font_names or []

    # Recursively walk page objects to gather font info, keyed by the
    # normalized font name.
    def collect_font_info(
        obj: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                short_name = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    short_name,
                    {
                        "normalized_font_name": short_name,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                found = collect_font_info(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_font_info(child, found)
        return found

    for page in self.pages:
        text_filters = {}
        if font_names:
            # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
            # Font names need to be converted to resource names/IDs for easier removal
            fonts_on_page = collect_font_info(page.get("/Resources"))
            resource_ids = []
            for wanted in font_names:
                if wanted in fonts_on_page:
                    resource_ids.extend(fonts_on_page[wanted]["resource_ids"])
            text_filters["font_ids"] = resource_ids
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. No border will be
            drawn if this argument is omitted.

    Raises:
        ValueError: If ``rect`` is a string containing a token that is not
            a number.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # optional fourth element: the dash pattern array
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # "[ xLL yLL xUR yUR ]": a single NumberObject cannot hold four
        # coordinates, so parse the text into a real rectangle.
        coordinates = [float(token) for token in rect.strip().strip("[]").split()]
        rect = RectangleObject(coordinates)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
# Page layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout", "/SinglePage", "/OneColumn",
    "/TwoColumnLeft", "/TwoColumnRight",
    "/TwoPageLeft", "/TwoPageRight",
)
def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the catalog, or ``None`` if absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is
    stored anyway.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # join with ", " so the message lists the names instead of
            # printing the repr of a tuple
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (delegates to ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    requested_layout = layout
    self._set_page_layout(requested_layout)
@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property; ``None`` when the catalog has no ``/PageLayout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout
@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    """Store ``layout`` as the page layout via ``_set_page_layout``."""
    requested_layout = layout
    self._set_page_layout(requested_layout)
# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone", "/UseOutlines", "/UseThumbs",
    "/FullScreen", "/UseOC", "/UseAttachments",
)
def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the catalog, or ``None`` if absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property; ``None`` when the catalog has no ``/PageMode``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode
@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    """
    Store ``mode`` as ``/PageMode`` in the catalog.

    A value that is not already a ``NameObject`` is checked against
    ``_valid_modes``; an unknown value only triggers a warning and is
    stored anyway.
    """
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})
def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        raw_dest = cast(Dict[Any, Any], to_add[NameObject("/Dest")])
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            Fit(
                fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
            ),  # I have no clue why this dict-hack is necessary
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        # link the parent annotation back to its popup
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add
def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    Args:
        page: The page, or a reference to it, whose annotations are cleaned.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        # no direct named destination: look at the action's /D entry instead
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page
def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory ``BytesIO`` stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or an
            object exposing ``seek`` and ``read``.

    Returns:
        A tuple of the ``BytesIO`` copy and the reader's encryption object
        (``None`` unless ``fileobj`` is a ``PdfReader`` with a truthy
        ``_encryption``).

    Raises:
        NotImplementedError: If ``fileobj`` is none of the supported kinds.
    """
    if isinstance(fileobj, (str, Path)):
        # a string or Path is treated as the location of a file to read
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = None
        if fileobj._encryption:
            encryption_obj = fileobj._encryption
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        reader_copy = BytesIO(fileobj.stream.read())

        # reset the stream to its original location
        fileobj.stream.seek(saved_position)
        return reader_copy, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection was passed in the outline_item slot: shift the
        # following arguments one position to the right.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages, outline_item = outline_item, None
    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )
2809 def merge(
2810 self,
2811 position: Optional[int],
2812 fileobj: Union[Path, StrByteType, PdfReader],
2813 outline_item: Optional[str] = None,
2814 pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,
2815 import_outline: bool = True,
2816 excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
2817 ) -> None:
2818 """
2819 Merge the pages from the given file into the output file at the
2820 specified page number.
2822 Args:
2823 position: The *page number* to insert this file. File will
2824 be inserted after the given number.
2825 fileobj: A File Object or an object that supports the standard
2826 read and seek methods similar to a File Object. Could also be a
2827 string representing a path to a PDF file.
2828 outline_item: Optionally, you may specify a string to build an outline
2829 (aka 'bookmark') to identify the
2830 beginning of the included file.
2831 pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
2832 or a ``(start, stop[, step])`` tuple
2833 or a list of pages to be processed
2834 to merge only the specified range of pages from the source
2835 document into the output document.
2836 import_outline: You may prevent the source document's
2837 outline (collection of outline items, previously referred to as
2838 'bookmarks') from being imported by specifying this as ``False``.
2839 excluded_fields: provide the list of fields/keys to be ignored
2840 if ``/Annots`` is part of the list, the annotation will be ignored
2841 if ``/B`` is part of the list, the articles will be ignored
2843 Raises:
2844 TypeError: The pages attribute is not configured properly
2846 """
2847 if isinstance(fileobj, PdfDocCommon):
2848 reader = fileobj
2849 else:
2850 stream, encryption_obj = self._create_stream(fileobj)
2851 # Create a new PdfReader instance using the stream
2852 # (either file or BytesIO or StringIO) created above
2853 reader = PdfReader(stream, strict=False) # type: ignore[arg-type]
2855 if excluded_fields is None:
2856 excluded_fields = ()
2857 # Find the range of pages to merge.
2858 if pages is None:
2859 pages = list(range(len(reader.pages)))
2860 elif isinstance(pages, PageRange):
2861 pages = list(range(*pages.indices(len(reader.pages))))
2862 elif isinstance(pages, list):
2863 pass # keep unchanged
2864 elif isinstance(pages, tuple) and len(pages) <= 3:
2865 pages = list(range(*pages))
2866 elif not isinstance(pages, tuple):
2867 raise TypeError(
2868 '"pages" must be a tuple of (start, stop[, step]) or a list'
2869 )
2871 srcpages = {}
2872 for page in pages:
2873 if isinstance(page, PageObject):
2874 pg = page
2875 else:
2876 pg = reader.pages[page]
2877 assert pg.indirect_reference is not None
2878 if position is None:
2879 # numbers in the exclude list identifies that the exclusion is
2880 # only applicable to 1st level of cloning
2881 srcpages[pg.indirect_reference.idnum] = self.add_page(
2882 pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore
2883 )
2884 else:
2885 srcpages[pg.indirect_reference.idnum] = self.insert_page(
2886 pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore
2887 )
2888 position += 1
2889 srcpages[pg.indirect_reference.idnum].original_page = pg
2891 reader._named_destinations = (
2892 reader.named_destinations
2893 ) # need for the outline processing below
2895 arr: Any
2897 def _process_named_dests(dest: Any) -> None:
2898 arr = dest.dest_array
2899 if "/Names" in self._root_object and dest["/Title"] in cast(
2900 List[Any],
2901 cast(
2902 DictionaryObject,
2903 cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
2904 ).get("/Names", DictionaryObject()),
2905 ):
2906 # already exists: should not duplicate it
2907 pass
2908 elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
2909 pass
2910 elif isinstance(dest["/Page"], int):
2911 # the page reference is a page number normally not a PDF Reference
2912 # page numbers as int are normally accepted only in external goto
2913 try:
2914 p = reader.pages[dest["/Page"]]
2915 except IndexError:
2916 return
2917 assert p.indirect_reference is not None
2918 try:
2919 arr[NumberObject(0)] = NumberObject(
2920 srcpages[p.indirect_reference.idnum].page_number
2921 )
2922 self.add_named_destination_array(dest["/Title"], arr)
2923 except KeyError:
2924 pass
2925 elif dest["/Page"].indirect_reference.idnum in srcpages:
2926 arr[NumberObject(0)] = srcpages[
2927 dest["/Page"].indirect_reference.idnum
2928 ].indirect_reference
2929 self.add_named_destination_array(dest["/Title"], arr)
2931 for dest in reader._named_destinations.values():
2932 _process_named_dests(dest)
2934 outline_item_typ: TreeObject
2935 if outline_item is not None:
2936 outline_item_typ = cast(
2937 "TreeObject",
2938 self.add_outline_item(
2939 TextStringObject(outline_item),
2940 next(iter(srcpages.values())).indirect_reference,
2941 fit=PAGE_FIT,
2942 ).get_object(),
2943 )
2944 else:
2945 outline_item_typ = self.get_outline_root()
2947 _ro = reader.root_object
2948 if import_outline and CO.OUTLINES in _ro:
2949 outline = self._get_filtered_outline(
2950 _ro.get(CO.OUTLINES, None), srcpages, reader
2951 )
2952 self._insert_filtered_outline(
2953 outline, outline_item_typ, None
2954 ) # TODO: use before parameter
2956 if "/Annots" not in excluded_fields:
2957 for pag in srcpages.values():
2958 lst = self._insert_filtered_annotations(
2959 pag.original_page.get("/Annots", []), pag, srcpages, reader
2960 )
2961 if len(lst) > 0:
2962 pag[NameObject("/Annots")] = lst
2963 self.clean_page(pag)
2965 if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
2966 if "/AcroForm" not in self._root_object:
2967 self._root_object[NameObject("/AcroForm")] = self._add_object(
2968 cast(
2969 DictionaryObject,
2970 reader.root_object["/AcroForm"],
2971 ).clone(self, False, ("/Fields",))
2972 )
2973 arr = ArrayObject()
2974 else:
2975 arr = cast(
2976 ArrayObject,
2977 cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
2978 )
2979 trslat = self._id_translated[id(reader)]
2980 try:
2981 for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore
2982 try:
2983 ind = IndirectObject(trslat[f.idnum], 0, self)
2984 if ind not in arr:
2985 arr.append(ind)
2986 except KeyError:
2987 # for trslat[] which mean the field has not be copied
2988 # through the page
2989 pass
2990 except KeyError: # for /Acroform or /Fields are not existing
2991 arr = self._add_object(ArrayObject())
2992 cast(DictionaryObject, self._root_object["/AcroForm"])[
2993 NameObject("/Fields")
2994 ] = arr
2996 if "/B" not in excluded_fields:
2997 self.add_filtered_articles("", srcpages, reader)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry and registered
    in ``self.threads``. The reader's bead chain is then walked from
    ``/F`` through ``/N``; for every bead whose page resolves through
    :meth:`_get_cloned_page`, a new bead is created, chained to the
    previous one via ``/N``/``/V`` and appended to the ``/B`` array of
    the cloned page.

    Args:
        thread: The thread dictionary taken from the reader.
        pages: Mapping from the reader's page object number to the
            matching page in this writer.
        reader: The reader the thread comes from.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first bead kept: it becomes the entry point of the new thread
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # following beads are linked back (/V) to the previous one
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source chain: close the new ring.
            # When no bead was located on an imported page there is nothing
            # to close, so the links are skipped instead of failing.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (or its source) searched in the thread titles
    pages: Dict[int, "PageObject"],
    reader: "PdfReader",
) -> None:
    """
    Add articles matching the defined criteria.

    Every bead listed in the ``/B`` entry of the original pages is
    inspected; its thread (``/T``) is added through
    :meth:`_add_articles_thread` when it has not been translated for this
    reader yet and its ``/I`` → ``/Title`` value matches ``fltr``.

    Args:
        fltr: Compiled pattern or pattern source searched in the thread
            title. Any other value is replaced by an empty pattern,
            which matches every title.
        pages: Mapping of the imported pages; each value must expose the
            source page as ``original_page``.
        reader: The reader the pages come from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")
    for p in pages.values():
        pp = p.original_page
        for a in pp.get("/B", ()):
            thr = a.get_object().get("/T")
            if thr is None:
                continue
            thr = thr.get_object()
            # Looked up on every iteration on purpose: adding a thread
            # registers it in the translation table, and a reader without
            # a table yet simply means that nothing has been cloned so far.
            translated = self._id_translated.get(id(reader), {})
            if thr.indirect_reference.idnum not in translated and fltr.search(
                (thr.get("/I", {})).get("/Title", "")
            ):
                self._add_articles_thread(thr, pages, reader)
def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the writer page matching a reader page.

    Args:
        page: Page of the reader, given as a page dictionary
            (``/Type`` equal to ``/Page``) or as an indirect reference.
        pages: Mapping from the reader's page object number to the
            matching page in this writer.
        reader: Not used by this method.

    Returns:
        The indirect reference of the matching page, or ``None`` when
        ``page`` is null, of an unsupported kind, without an indirect
        reference, or not part of ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        ref = page
    else:
        # None or any other object: nothing to look up
        return None
    if ref is None:
        return None
    try:
        cloned = pages[ref.idnum]
    except KeyError:
        # the page has not been imported
        return None
    return cloned.indirect_reference
def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations whose target is still available in this writer.

    Annotations without any destination are cloned as they are. Named
    destinations are kept only when the name is present in
    ``self.get_named_dest_root()``. Explicit destinations are kept only
    when their first element resolves through :meth:`_get_cloned_page`,
    in which case that element is replaced by the cloned page.

    Args:
        annots: Annotations of the source page (list or reference to it).
        page: Not used by this method.
        pages: Mapping from the reader's page object number to the
            matching page in this writer.
        reader: The reader the annotations come from.

    Returns:
        An array with the references of the retained annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for entry in annots:
        annotation = cast("DictionaryObject", entry.get_object())
        is_goto_link_action = (
            annotation["/Subtype"] == "/Link"
            and "/A" in annotation
            and cast("DictionaryObject", annotation["/A"])["/S"] == "/GoTo"
            and "/Dest" not in annotation
        )

        if is_goto_link_action:
            # destination carried by the /GoTo action
            target = cast("DictionaryObject", annotation["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annotation.clone(self).indirect_reference)
            else:
                target = cast("ArrayObject", target)
                new_page = self._get_cloned_page(target[0], pages, reader)
                if new_page is not None:
                    copy = annotation.clone(self, ignore_fields=("/D",))
                    cast("DictionaryObject", copy["/A"])[
                        NameObject("/D")
                    ] = ArrayObject([new_page, *target[1:]])
                    kept.append(self._add_object(copy))
        elif "/Dest" not in annotation:
            # nothing pointing to a page: plain copy
            kept.append(self._add_object(annotation.clone(self)))
        else:
            # destination stored directly in the annotation
            target = annotation["/Dest"]
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annotation.clone(self).indirect_reference)
            else:
                target = cast("ArrayObject", target)
                new_page = self._get_cloned_page(target[0], pages, reader)
                if new_page is not None:
                    copy = annotation.clone(self, ignore_fields=("/Dest",))
                    copy[NameObject("/Dest")] = ArrayObject([new_page, *target[1:]])
                    kept.append(self._add_object(copy))
    return kept
def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is retained when its page resolves through
    :meth:`_get_cloned_page` or when at least one of its descendants is
    retained. The retained descendants are stored on the item as
    ``_filtered_children``.

    Args:
        node: Outline root or first outline item of a level
            (``None`` and null objects are accepted).
        pages: Mapping from the reader's page object number to the
            matching page in this writer.
        reader: The reader owning the outline.

    Returns:
        A list of destination objects.

    """
    kept: List[Any] = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # container without title: continue with its first child
        first_child = node.get("/First", None)
        if first_child is not None:
            first_child = first_child.get_object()
            kept.extend(self._get_filtered_outline(first_child, pages, reader))
        return kept

    cloned_page: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_page = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if cloned_page is None:
            cloned_page = NullObject()
        item[NameObject("/Page")] = cloned_page
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Build a new outline item of this writer from a destination.

    The title is always copied. When the destination page is not null,
    the ``/A`` entry of the source node is cloned if there is one,
    otherwise ``/Dest`` is set from ``dest.dest_array``. ``/F`` and
    ``/C`` are taken from the source node when available (defaulting to
    ``0`` and to three ``0.0`` components).

    Args:
        dest: The destination to convert.

    Returns:
        The new outline item, already added to this writer.

    """
    item = TreeObject()
    self._add_object(item)
    item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            item[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        flags = source_node.get("/F", 0)
        color = source_node.get(
            "/C", [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        )
        item[NameObject("/F")] = NumberObject(flags)
        item[NameObject("/C")] = ArrayObject(color)
    return item
def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, and their retained children, below a parent.

    Args:
        outlines: Items to insert; each one must carry its retained
            children in ``_filtered_children``.
        parent: Outline item (or reference to it) receiving the items.
        before: Passed to ``insert_child`` for the items of this level;
            nested levels are always inserted with ``None``.

    """
    for entry in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = entry.get("/Type", "") == "/Outlines" or "/Title" not in entry
        if is_container:
            # no item of its own: the children are attached to the parent
            attach_to = parent
        else:
            attach_to = self._clone_outline(entry)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(attach_to, before, self)
        self._insert_filtered_outline(entry._filtered_children, attach_to, None)
def close(self) -> None:
    """Do nothing; the method only exists for API harmonization."""
    return None
def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search an outline item and return its position in the outline tree.

    Args:
        outline_item: Value compared with both the indirect reference
            and the ``/Title`` of every visited item.
        root: Item where the search starts; the outline root of this
            writer when omitted.

    Returns:
        The list of indexes leading to the item (one per titled level),
        or ``None`` if no item matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        is_match = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if is_match:
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                # levels without a title do not contribute an index
                own_index = [index] if "/Title" in node else []
                return own_index + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    Deprecated alias kept for backward compatibility.

    The arguments are not used; the call is only forwarded to the
    deprecation helper.

    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    deprecation_with_replacement(
        "find_bookmark",
        "find_outline_item",
        "5.0.0",
    )
def reset_translation(
    self, reader: Union[None, "PdfReader", "IndirectObject"] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: If ``reader`` is neither ``None``, a PdfReader nor an
            IndirectObject.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # a reader without translation table is silently accepted
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # f-string: the offending value is now really part of the message
        raise Exception(f"invalid parameter {reader!r}")
def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional["PageLabelStyle"] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (inclusive) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) and ``None`` both mean "not given":
               no explicit start value is written.

    Raises:
        ValueError: If neither style nor prefix is given, if the range is
            invalid or exceeds the number of pages, or if ``start`` is
            negative.

    """
    if style is None and prefix is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")
    if start is None:
        # None is allowed by the signature: handle it like the default
        # instead of forwarding a value that cannot be converted to a number
        start = 0
    elif start != 0 and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)
def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation of the arguments is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (inclusive) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) or ``None``: no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None is accepted by the signature and means "not given", like 0
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no page labels yet: start with a decimal label from the first page
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # pages following the range get the default label, unless a label
    # already starts right after the range or the range ends the document
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels
3446 def _repr_mimebundle_(
3447 self,
3448 include: Union[None, Iterable[str]] = None,
3449 exclude: Union[None, Iterable[str]] = None,
3450 ) -> Dict[str, Any]:
3451 """
3452 Integration into Jupyter Notebooks.
3454 This method returns a dictionary that maps a mime-type to its
3455 representation.
3457 .. seealso::
3459 https://ipython.readthedocs.io/en/stable/config/integrating.html
3460 """
3461 pdf_data = BytesIO()
3462 self.write(pdf_data)
3463 data = {
3464 "application/pdf": pdf_data,
3465 }
3467 if include is not None:
3468 # Filter representations based on include list
3469 data = {k: v for k, v in data.items() if k in include}
3471 if exclude is not None:
3472 # Remove representations based on exclude list
3473 data = {k: v for k, v in data.items() if k not in exclude}
3475 return data
def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    PDF objects are returned unchanged. Dictionaries and lists are
    converted recursively, strings starting with ``/`` become names and
    other strings become text strings; ``float`` and ``int`` values both
    become ``FloatObject``.

    Raises:
        NotImplementedError: If the type of ``obj`` is not handled.

    """
    if isinstance(obj, PdfObject):
        return obj

    converted: Any
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
    elif isinstance(obj, str):
        converted = NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    elif isinstance(obj, (float, int)):
        converted = FloatObject(obj)
    elif isinstance(obj, list):
        converted = ArrayObject(_pdf_objectify(item) for item in obj)
    else:
        raise NotImplementedError(
            f"{type(obj)=} could not be cast to a PdfObject"
        )
    return converted
def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item dictionary.

    Args:
        action_ref: Stored as ``/A`` when not ``None``.
        title: Stored as ``/Title``.
        color: Color components, or a string converted with
            ``hex_to_rgb``; stored as ``/C`` when truthy.
        italic: Adds the italic flag to ``/F``.
        bold: Adds the bold flag to ``/F``.

    Returns:
        The new outline item (not yet added to any document).

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        components = hex_to_rgb(color) if isinstance(color, str) else color
        color_array = ArrayObject([FloatObject(c) for c in components])
        item.update({NameObject("/C"): color_array})

    if italic or bold:
        # /F is only written when at least one flag is set
        flags = 0
        for enabled, flag in (
            (italic, OutlineFontFlag.italic),
            (bold, OutlineFontFlag.bold),
        ):
            if enabled:
                flags += flag
        item.update({NameObject("/F"): NumberObject(flags)})
    return item
def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: "RectangleObject",
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing the text of a form field.

    Args:
        txt: Text to show; ``\\n`` and ``\\r`` both separate lines.
        sel: Lines to highlight; a box is drawn around each of them.
        da: Default appearance string, inserted before the text and
            again after every highlight box.
        font_full_rev: Mapping from character to its encoded bytes;
            characters not in the mapping are encoded as UTF-16-BE.
        rct: Rectangle of the field; only ``width`` and ``height`` are read.
        font_height: Font height; the line pitch is ``1.4 * font_height``.
        y_offset: Vertical position of the first line.

    Returns:
        The content stream as bytes.

    """
    parts: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            parts.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            parts.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            parts.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        if any(len(c) >= 2 for c in enc_line):
            # multi-byte codes: hexadecimal string, no escaping required
            parts.append(b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n")
        else:
            # Literal string: backslash and parentheses must be escaped,
            # otherwise they end the string early or start an escape
            # sequence. The backslash has to be replaced first.
            literal = (
                b"".join(enc_line)
                .replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            parts.append(b"(" + literal + b") Tj\n")
    parts.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(parts)