Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3# Copyright (c) 2024, Pubpub-ZZ
4#
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions are
9# met:
10#
11# * Redistributions of source code must retain the above copyright notice,
12# this list of conditions and the following disclaimer.
13# * Redistributions in binary form must reproduce the above copyright notice,
14# this list of conditions and the following disclaimer in the documentation
15# and/or other materials provided with the distribution.
16# * The name of the author may not be used to endorse or promote products
17# derived from this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29# POSSIBILITY OF SUCH DAMAGE.
31import struct
32from abc import ABC, abstractmethod
33from collections.abc import Generator, Iterable, Iterator, Mapping
34from datetime import datetime
35from typing import (
36 Any,
37 NoReturn,
38 Optional,
39 Union,
40 cast,
41)
43from ._encryption import Encryption
44from ._page import PageObject, _VirtualList
45from ._page_labels import index2label as page_index2page_label
46from ._utils import (
47 deprecation_with_replacement,
48 logger_warning,
49 parse_iso8824_date,
50)
51from .constants import CatalogAttributes as CA
52from .constants import CatalogDictionary as CD
53from .constants import (
54 CheckboxRadioButtonAttributes,
55 GoToActionArguments,
56 PagesAttributes,
57 UserAccessPermissions,
58)
59from .constants import Core as CO
60from .constants import DocumentInformationAttributes as DI
61from .constants import FieldDictionaryAttributes as FA
62from .constants import PageAttributes as PG
63from .errors import PdfReadError, PyPdfError
64from .filters import _decompress_with_limit
65from .generic import (
66 ArrayObject,
67 BooleanObject,
68 ByteStringObject,
69 Destination,
70 DictionaryObject,
71 EncodedStreamObject,
72 Field,
73 Fit,
74 FloatObject,
75 IndirectObject,
76 NameObject,
77 NullObject,
78 NumberObject,
79 PdfObject,
80 TextStringObject,
81 TreeObject,
82 ViewerPreferences,
83 create_string_object,
84 is_null_or_none,
85)
86from .generic._files import EmbeddedFile
87from .types import OutlineType, PagemodeType
88from .xmp import XmpInformation
def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Interpret up to eight bytes as an unsigned big-endian integer.

    Args:
        d: The bytes to decode. Only the last eight bytes are considered;
            shorter input is treated as if it were left-padded with zeros.
        size: The declared width of the value in bytes.

    Returns:
        The decoded non-negative integer (``0`` for empty input).

    Raises:
        PdfReadError: If ``size`` is greater than 8.
    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Same result as zero-padding to 8 bytes and unpacking with ">Q",
    # without building the intermediate padded buffer.
    return int.from_bytes(d[-8:], byteorder="big")
class DocumentInformation(DictionaryObject):
    """
    Basic document metadata stored in a PDF file.

    Instances are obtained through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every text entry is exposed twice, e.g. ``author`` and ``author_raw``.
    The plain property is meant for display and yields text (or ``None``),
    while the ``*_raw`` property hands back the stored value untouched, which
    can be a ``ByteStringObject`` and therefore needs more care in the caller.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """Return the entry for *key* as text, or ``None`` if it is absent or not a string object."""
        entry = self.get(key, None)
        if isinstance(entry, TextStringObject):
            return entry
        return str(entry) if isinstance(entry, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns ``None`` when no (truthy) title entry is stored. Otherwise the
        text form is preferred, falling back to the resolved stored object.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore[union-attr]

    @property
    def title_raw(self) -> Optional[str]:
        """Stored title entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """Read-only access to the document's author as text, or ``None`` if not specified."""
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Stored author entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """Read-only access to the document's subject as text, or ``None`` if not specified."""
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Stored subject entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For a document converted to PDF from another format, this is the name
        of the application (e.g. OpenOffice) that created the original
        document. ``None`` if the creator is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Stored creator entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For a document converted to PDF from another format, this is the name
        of the application (for example, macOS Quartz) that converted it.
        ``None`` if the producer is not specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Stored producer entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the document's creation date, parsed from its text form."""
        text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Stored creation date entry as-is; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date.

        This is the date and time the document was most recently modified.
        """
        text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Stored modification date entry as-is; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """Read-only access to the document's keywords as text, or ``None`` if not specified."""
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Stored keywords entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)
259class PdfDocCommon(ABC):
260 """
261 Common functions from PdfWriter and PdfReader objects.
263 This root class is strongly abstracted.
264 """
266 strict: bool = False # default
268 flattened_pages: Optional[list[PageObject]] = None
270 _encryption: Optional[Encryption] = None
272 _readonly: bool = False
@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """Root dictionary of the document (used as the catalog by this class); supplied by subclasses."""
    ...  # pragma: no cover
@property
@abstractmethod
def pdf_header(self) -> str:
    """PDF header of the document as a string; supplied by subclasses."""
    ...  # pragma: no cover
@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """Return the object designated by *indirect_reference*, if any; supplied by subclasses."""
    ...  # pragma: no cover
@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """Substitute *obj* for the object behind *indirect*; supplied by subclasses."""
    ...  # pragma: no cover
@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """Document information dictionary, or ``None`` when there is none; supplied by subclasses."""
    ...  # pragma: no cover
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new :class:`DocumentInformation` filled with the entries of
        ``self._info``, or ``None`` if ``self._info`` is ``None``.
    """
    # Read the property once and bail out before allocating anything.
    info = self._info
    if info is None:
        return None
    retval = DocumentInformation()
    retval.update(info)
    return retval
@property
@abstractmethod
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP metadata of the document, or ``None`` when unavailable; supplied by subclasses."""
    ...  # pragma: no cover
@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the existing ViewerPreferences as an overloaded dictionary.

    A stored value that is not yet a ``ViewerPreferences`` instance is
    wrapped in one, and the wrapper is written back: through
    ``_replace_object`` when it carries an indirect reference, otherwise
    directly into the root object. ``None`` is returned when the entry is
    missing.
    """
    entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if entry is None:
        return None
    prefs = entry.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs
def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    if not self.is_encrypted:
        # Regular case: count the (lazily built) flattened page list.
        if self.flattened_pages is None:
            self._flatten(self._readonly)
        page_list = self.flattened_pages
        assert page_list is not None
        return len(page_list)
    # Flattening does not work on an encrypted PDF, so rely on the
    # page count recorded in the page tree root instead.
    return self.root_object["/Pages"]["/Count"]  # type: ignore[no-any-return, index]
def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    # Build the flattened page list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    page_list = self.flattened_pages
    assert page_list is not None, "hint for mypy"
    return page_list[page_number]
def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Locate the page tree node whose /Kids holds the requested page.

    Returns the node together with the position of the page inside its
    /Kids. If page_number is greater than the number of pages, the top node
    and -1 are returned.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def search(
        current: DictionaryObject, first_index: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``first_index`` is the number of the first page at or below ``current``.
        count = cast(int, current.get("/Count", 1))  # default 1 for /Page types
        if current["/Type"] == "/Page":  # type: ignore[comparison-overlap]
            if page_number != first_index:
                return None, first_index + 1
            return current, -1
        if (page_number - first_index) >= count:  # not in nodes below
            if current == top:
                return top, -1
            return None, first_index + count
        for position, child in enumerate(cast(ArrayObject, current["/Kids"])):
            child = cast(DictionaryObject, child.get_object())
            found, value = search(child, first_index)
            if found is None:
                # Not under this kid: ``value`` is the next page number to try.
                first_index = value
                continue
            if value < 0:
                # The page is a direct kid of ``current``.
                return current, position
            # The page was located at a lower level.
            return found, value
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = search(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx
@property
def named_destinations(self) -> dict[str, Destination]:
    """Read-only mapping of names to destinations, as computed by ``_get_named_destinations``."""
    destinations = self._get_named_destinations()
    return destinations
def get_named_dest_root(self) -> ArrayObject:
    """
    Return the array of the named destinations name tree root.

    When the root object has a /Names dictionary holding a /Dests
    dictionary, that dictionary's /Names array is returned (an empty one is
    attached first if it is missing). Missing dictionaries are only created
    when the instance offers ``_add_object``; otherwise a fresh, detached
    empty array is returned.
    """
    named_dest = ArrayObject()
    root = self.root_object
    has_names_dict = CA.NAMES in root and isinstance(root[CA.NAMES], DictionaryObject)

    if has_names_dict:
        names = cast(DictionaryObject, root[CA.NAMES])
        if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
            # §3.6.3 Name Dictionary (PDF spec 1.7)
            dests = cast(DictionaryObject, names[CA.DESTS])
            dests_ref = dests.indirect_reference
            if CA.NAMES not in dests:
                named_dest = ArrayObject()
                dests[NameObject(CA.NAMES)] = named_dest
            else:
                # §7.9.6, entries in a name tree node dictionary
                named_dest = cast(ArrayObject, dests[CA.NAMES])
        elif hasattr(self, "_add_object"):
            # /Names exists but has no usable /Dests: add one.
            dests = DictionaryObject()
            dests_ref = self._add_object(dests)
            names[NameObject(CA.DESTS)] = dests_ref
            dests[NameObject(CA.NAMES)] = named_dest
    elif hasattr(self, "_add_object"):
        # Neither dictionary exists: create /Names and /Dests.
        names = DictionaryObject()
        names_ref = self._add_object(names)
        root[NameObject(CA.NAMES)] = names_ref
        dests = DictionaryObject()
        dests_ref = self._add_object(dests)
        names[NameObject(CA.DESTS)] = dests_ref
        dests[NameObject(CA.NAMES)] = named_dest

    return named_dest
448 ## common
def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        # Outermost call: start fresh and look up the tree in the catalog.
        retval = {}
        catalog = self.root_object

        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            name_dict = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in name_dict:
                tree = cast(TreeObject, name_dict[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # Intermediate node: descend into every kid.
        for child in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(child.get_object(), retval)
    # §7.9.6, entries in a name tree node dictionary
    elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        entries = cast(DictionaryObject, tree[CA.NAMES])
        pos = 0
        while pos < len(entries):
            key = entries[pos].get_object()
            pos += 1
            if not isinstance(key, (bytes, str)):
                # Not a name: skip this single item and retry on the next.
                continue
            try:
                value = entries[pos].get_object()
            except IndexError:
                # Trailing name without a value.
                break
            pos += 1
            if isinstance(value, DictionaryObject):
                if "/D" not in value:
                    continue
                value = value["/D"]
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[cast(str, dest["/Title"])] = dest
                # Remain backwards-compatible.
                retval[str(key)] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
        for name, ref in tree.items():
            target = ref.get_object()
            if isinstance(target, DictionaryObject):
                if "/D" not in target:
                    continue
                target = target["/D"].get_object()
            dest = self._build_destination(name, target)
            if dest is not None:
                retval[name] = dest
    return retval
522 # A select group of relevant field attributes. For the complete list,
523 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.
def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())

    if retval is None:
        # Outermost call: reset the bookkeeping and fetch the AcroForm tree.
        retval = {}
        stack = []
        catalog = self.root_object
        if CD.ACRO_FORM not in catalog:
            return None
        tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])

    if tree is None:
        return retval
    assert stack is not None

    if "/Fields" in tree:
        for reference in cast(ArrayObject, tree["/Fields"]):
            self._build_field(
                reference.get_object(), retval, fileobj, field_attributes, stack
            )
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval
def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
    """
    Build the qualified name of a field dictionary.

    A ``/TM`` entry is returned as-is. Otherwise the ``/T`` entries (empty
    string when missing) are joined with ``.`` while walking up through
    ``/Parent``.
    """
    if "/TM" in parent:
        return cast("str", parent["/TM"])
    own_name = cast("str", parent.get("/T", ""))
    if "/Parent" not in parent:
        return own_name
    ancestor = cast("DictionaryObject", parent["/Parent"])
    return self._get_qualified_field_name(ancestor) + "." + own_name
def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register *field* in *retval* under its qualified name, then visit its kids.

    Objects carrying neither ``/T`` nor ``/TM`` are ignored. When *fileobj*
    is truthy, a report of the field is written to it. For choice fields
    and buttons, the selectable states are stored under ``/_States_``.
    """
    if "/T" not in field and "/TM" not in field:
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")

    entry = Field(field)
    retval[key] = entry
    obj = entry.indirect_reference.get_object()  # to get the full object
    states_key = NameObject("/_States_")
    field_type = obj.get(FA.FT, "")

    if field_type == "/Ch" and obj.get(NameObject(FA.Opt)):
        entry[states_key] = obj[NameObject(FA.Opt)]

    if field_type == "/Btn" and "/AP" in obj:
        # Checkbox
        entry[states_key] = ArrayObject(list(obj["/AP"]["/N"].keys()))
        if "/Off" not in entry["/_States_"]:
            entry[states_key].append(NameObject("/Off"))
    elif field_type == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # Radio button group: gather the appearance states of all kids.
        states: list[str] = []
        entry[states_key] = ArrayObject(states)
        for kid in obj.get(FA.Kids, {}):
            kid = kid.get_object()
            for state in list(kid["/AP"]["/N"].keys()):
                if state not in states:
                    states.append(state)
            entry[states_key] = ArrayObject(states)
        no_toggle_to_off = obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
        if no_toggle_to_off and "/Off" in entry["/_States_"]:
            current_states = entry["/_States_"]
            del current_states[current_states.index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)
def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Recurse into the kids of *tree*, at most once per object.

    *stack* records the objects already handled; meeting one of them again
    only logs a warning, which prevents endless recursion.
    """
    already_seen = tree in stack
    if already_seen:
        logger_warning(
            "%(field_name)s already parsed",
            source=__name__,
            field_name=self._get_qualified_field_name(tree),
        )
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for child in tree[PagesAttributes.KIDS]:  # type: ignore[attr-defined]
        self.get_fields(child.get_object(), retval, fileobj, stack)
def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``label: value`` line to *fileobj* for each known attribute of *field*.

    Kids and additional actions are skipped, the field type is spelled out,
    and a parent is reported by name only. Attributes missing from the
    field are silently left out.
    """
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (FA.Kids, FA.AA)
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Make the field type value clearer
                field_type = field[attr]
                if field_type in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field_type]}\n")
            elif attr == FA.Parent:
                # Let's just write the name of the parent
                parent = field[attr]
                try:
                    name = parent[FA.TM]
                except KeyError:
                    name = parent[FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass
def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: to get full name

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field.

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(name: str, taken: dict[Any, Any]) -> str:
        # First occurrence keeps the plain name; later ones get ".2", ".3", ...
        if name not in taken:
            return name
        prefix = name + "."
        suffixed = sum(1 for existing in taken if existing.startswith(prefix))
        return prefix + str(suffixed + 2)

    all_fields = self.get_fields()
    if all_fields is None:
        return {}

    text_fields = {}
    for qualified_name, data in all_fields.items():
        if data.get("/FT") != "/Tx":
            continue
        content = data.get("/V")
        if full_qualified_name:
            text_fields[qualified_name] = content
        else:
            text_fields[indexed_key(cast(str, data["/T"]), text_fields)] = content
    return text_fields
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If the field cannot be resolved through its indirect
            reference, or if it has no (inherited) ``/FT`` entry.

    """
    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore[union-attr]
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(field.get_inherited(key="/FT", default=None)):
        raise ValueError("Field is not valid")

    found = []
    if field.get("/Subtype", "") == "/Widget":
        # The field is its own widget.
        if "/P" in field:
            found = [field["/P"].get_object()]
        else:
            found = [
                page
                for page in self.pages
                if field.indirect_reference in page.get("/Annots", "")
            ]
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            is_plain_widget = (kid.get("/Subtype", "") == "/Widget") and ("/T" not in kid)
            if not is_plain_widget:
                continue
            # Kid that is just a widget, not a field:
            if "/P" in kid:
                found.append(kid["/P"].get_object())
            else:
                found.extend(
                    page
                    for page in self.pages
                    if kid.indirect_reference in page.get("/Annots", "")
                )

    # Entries that are not PageObject instances are mapped back to the
    # matching item of ``self.pages``.
    pages_showing = []
    for item in found:
        if isinstance(item, PageObject):
            pages_showing.append(item)
        else:
            pages_showing.append(
                self.pages[self._get_page_number_by_indirect(item.indirect_reference)]  # type: ignore[index, union-attr]
            )
    return pages_showing
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry as a
    :class:`Destination` titled ``"OpenAction"``; any other kind of entry
    yields ``None``.

    Raises:
        Exception: If a destination is invalid. The original error is
            attached as the cause.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if isinstance(oa, ArrayObject):
        try:
            page, typ, *array = oa
            fit = Fit(typ, tuple(array))
            return Destination("OpenAction", page, fit)
        except Exception as exc:
            # Chain explicitly so the root cause is reported as such.
            raise Exception(f"Invalid Destination {oa}: {exc}") from exc
    return None
@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Reject assignment: this class provides no setter for the opening destination."""
    message = "No setter for open_destination"
    raise NotImplementedError(message)
@property
def outline(self) -> OutlineType:
    """
    Read-only property for the outline present in the document
    (i.e., a collection of 'outline items' which are also known as
    'bookmarks').
    """
    document_outline = self._get_outline()
    return document_outline
def _get_outline(
    self,
    node: Optional[DictionaryObject] = None,
    outline: Optional[Any] = None,
    visited: Optional[set[int]] = None,
) -> OutlineType:
    """
    Collect the outline items reachable from *node* into *outline*.

    Called without arguments, the walk starts at the ``/First`` item of the
    catalog's outline dictionary and the named destinations are refreshed.
    Children are appended as nested lists right after their parent item.

    Args:
        node: The outline item to start from (recursive use).
        outline: The list being filled (recursive use).
        visited: ``id()`` values of the nodes already handled, used to
            stop on cyclic structures.

    Returns:
        The (possibly nested) list of outline items.
    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            outline_dict = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(outline_dict, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(outline_dict) and "/First" in outline_dict:
                node = cast(DictionaryObject, outline_dict["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    if visited is None:
        visited = set()

    # Walk the siblings through /Next until the chain ends or loops back.
    while True:
        marker = id(node)
        if marker in visited:
            logger_warning("Detected cycle in outline structure for %(node)s", source=__name__, node=node)
            break
        visited.add(marker)

        item = self._build_outline_item(node)
        if item:
            outline.append(item)

        # check for sub-outline
        if "/First" in node:
            children: list[Any] = []
            # Pass a copy to allow multiple outer entries to reference the same inner one.
            self._get_outline(
                node=cast(DictionaryObject, node["/First"]),
                outline=children,
                visited=visited.copy(),
            )
            if children:
                outline.append(children)

        if "/Next" not in node:
            break
        node = cast(DictionaryObject, node["/Next"])

    return outline
@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    It is an array of dictionaries with "/F" (the first bead in the thread)
    and "/I" (a thread information dictionary containing information about
    the thread, such as its title, author, and creation date) properties or
    None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    if CO.THREADS not in catalog:
        return None
    return cast("ArrayObject", catalog[CO.THREADS])
@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """Return the page number matching *indirect_reference*, if any; supplied by subclasses."""
    ...  # pragma: no cover
def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    reference = page.indirect_reference
    return self._get_page_number_by_indirect(reference)
def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    target_page = destination.page
    return self._get_page_number_by_indirect(target_page)
def _build_destination(
    self,
    title: Union[str, bytes],
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a :class:`Destination` named *title* from a destination array.

    A missing, null, string or empty *array* gives a destination on a null
    page with the default fit. Otherwise the array is read as page, fit
    type and fit arguments.

    Raises:
        PdfReadError: In strict mode, if the destination cannot be built
            from *array*. Outside strict mode a warning is logged and a
            destination on the first page is returned instead.
    """
    # handle outline items with missing or invalid destination
    unusable = (
        isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
        or array is None
    )
    if unusable:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *array = array  # type: ignore[assignment]
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=array))  # type: ignore[arg-type]
    except PdfReadError:
        logger_warning("Unknown destination: %(title)r %(array)s", source=__name__, title=title, array=array)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        if first_page_ref is None:
            first_page_ref = NullObject()
        return Destination(title, first_page_ref, Fit.fit())
def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Convert an outline item dictionary into a :class:`Destination`.

    The target comes from a ``/GoTo`` action (``/A``) or from ``/Dest``;
    names are looked up in ``self._named_destinations``. Color (``/C``),
    style (``/F``) and child count (``/Count``) are copied when present,
    and ``/%is_open%`` tells whether the item is unfolded.

    Raises:
        PdfReadError: In strict mode, for a missing ``/Title``, a ``/GoTo``
            action without ``/D``, or an unexpected destination type.
    """
    dest = None

    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if isinstance(dest, ArrayObject):
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[dest].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif dest is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, dest)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            "Removed unexpected destination %(dest)r from destination",
            source=__name__,
            dest=dest,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore[attr-defined]
        if "/F" in node:
            # specifies style characteristics bold and/or italic
            # with 1=italic, 2=bold, 3=both
            outline_item[NameObject("/F")] = node["/F"]
        if "/Count" in node:
            # absolute value = num. visible children
            # with positive = open/unfolded, negative = closed/folded
            outline_item[NameObject("/Count")] = node["/Count"]
        # if count is 0 we will consider it as open (to have available is_open)
        is_open = node.get("/Count", 0) >= 0
        outline_item[NameObject("/%is_open%")] = BooleanObject(is_open)
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        # The node carries no indirect reference; leave the item without one.
        pass
    return outline_item
@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>` of the document.

    A single page or a range of pages can be retrieved through this property.

    Note:
        For PdfWriter only: pages (or page ranges) can also be removed from
        the list with the ``del`` operator. Only the page entry itself is
        removed, because the objects beneath it may be used elsewhere. To
        get rid of them completely - provided nothing else uses them -
        write to a buffer/temporary file and load the result into a new
        PdfWriter.

    """
    count_getter = self.get_num_pages
    page_getter = self.get_page
    return _VirtualList(count_getter, page_getter)  # type: ignore[return-value]
@property
def page_labels(self) -> list[str]:
    """
    The label of every page of this document, in document order.

    This property is read-only.
    """
    total = len(self.pages)
    return [page_index2page_label(self, index) for index in range(total)]
@property
def page_layout(self) -> Optional[str]:
    """
    The page layout stored in the document catalog, or ``None`` if unset.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # the catalog has no layout entry
        return None
    return cast(NameObject, layout)
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The ``/PageMode`` entry of the document catalog, or ``None`` if unset.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # the catalog has no page mode entry
        return None
    return mode  # type: ignore[return-value]
def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: Will only list the pages within _flatten_pages.
        pages: Page tree node to process. When null/``None``, processing
            starts at the catalog's ``/Pages`` entry and
            ``self.flattened_pages`` is reset to an empty list.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is never modified.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog's ``/Pages`` entry is not a
            dictionary, a node lists itself as its own kid, or the
            recursion limit is hit while descending the tree.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if is_null_or_none(pages):
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore[union-attr]
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []
    assert pages is not None, "mypy"

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Extend a private copy: the caller keeps iterating over its own
        # kids with the mapping it handed in, so writing this node's
        # attributes into the shared dict would leak them into sibling
        # subtrees. Only ancestors may contribute inherited values.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        # object() is a unique sentinel, so nodes without a reference never compare equal
        pages_reference = getattr(pages, "indirect_reference", object())
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            if getattr(page, "indirect_reference", object()) == pages_reference:
                raise PdfReadError("Detected cyclic page references.")

            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore[union-attr]
def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove page from pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved (reference not pointing to a page, page not found, or index
    out of range).

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", source=__name__)
            return
        page = resolved

    if isinstance(page, int):
        index = page
    else:
        # locate the first occurrence of the page object
        try:
            index = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", source=__name__)
            return

    if index < 0 or index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", source=__name__)
        return

    # remember the reference before the entry disappears from the list
    reference = self.pages[index].indirect_reference
    del self.pages[index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())
def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Resolve an indirect object of this document by number and generation.

    Development helper; shorthand for
    ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()
def decode_permissions(
    self, permissions_code: int
) -> NoReturn:  # pragma: no cover
    """
    Deprecated; use :attr:`user_access_permissions` instead.

    The call is handed straight to ``deprecation_with_replacement``;
    ``permissions_code`` itself is not used.
    """
    deprecation_with_replacement(
        removed_in="5.0.0",
        new_name="user_access_permissions",
        old_name="decode_permissions",
    )
@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """
    The user access permissions of an encrypted document.

    ``None`` is returned for documents that are not encrypted.

    .. warning::

        For AES-256 encrypted documents (R=5/R=6), the returned
        permissions are derived from the ``/P`` field, which is
        only trustworthy if the ``/Perms`` integrity check passed.
        Check :attr:`are_permissions_valid` to verify.
    """
    encryption = self._encryption
    return None if encryption is None else UserAccessPermissions(encryption.P)
@property
def are_permissions_valid(self) -> Optional[bool]:
    """
    Outcome of the ``/Perms`` integrity check for this document.

    For AES-256 encrypted documents (R=5/R=6), the ``/Perms`` field
    is an encrypted copy of the permissions that can be verified
    independently. ``False`` means this check failed (the ``/P``
    permissions may have been tampered with).

    ``None`` is returned if the document is not encrypted or has not yet
    been decrypted via :meth:`decrypt()<pypdf.PdfReader.decrypt>`.
    ``True`` is returned for non-AES-256 encryption (no ``/Perms`` to check).
    """
    encryption = self._encryption
    if encryption is None or not encryption.is_decrypted():
        return None
    return encryption._are_permissions_valid
@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.

    This is an abstract property: it has no implementation here and must
    be provided by concrete subclasses.
    """
    ...  # pragma: no cover
@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    The XFA entries of the document's interactive form.

    The ``/XFA`` entry of ``/AcroForm`` is read as a flat sequence of
    alternating tags and values. Every value that is an indirect reference
    to a non-empty object has its ``_data`` passed through
    ``_decompress_with_limit`` and is stored under its tag; other values
    are ignored.

    Returns:
        ``None`` if the catalog has no ``/AcroForm`` entry or the entry is
        empty; otherwise a dictionary mapping each tag to the processed
        data (empty when there is no ``/XFA`` entry).

    """
    retval: dict[str, Any] = {}
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    if "/XFA" not in tree:
        return retval

    entries = iter(cast(ArrayObject, tree["/XFA"]))
    # Pairing the iterator with itself yields (tag, value) tuples and simply
    # stops at a trailing tag without a value. Advancing manually with
    # next() would let StopIteration escape on such damaged files.
    for tag, value in zip(entries, entries):
        if not isinstance(value, IndirectObject):
            continue
        field = cast(Optional[EncodedStreamObject], value.get_object())
        if field:
            retval[tag] = _decompress_with_limit(field._data)
    return retval
@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """
    Mapping of attachment filenames to their content.

    Each name is stored together with the loader method, so the content is
    only fetched when the corresponding key is looked up.
    """
    loaders = {}
    for name in self._list_attachments():
        loaders[name] = (self._get_attachment_list, name)
    return LazyDict(loaders)
@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Generator over the attachment objects loaded from the document catalog."""
    embedded_files = EmbeddedFile._load(self.root_object)
    yield from embedded_files
1372 def _list_attachments(self) -> list[str]:
1373 """
1374 Retrieves the list of filenames of file attachments.
1376 Returns:
1377 list of filenames
1379 """
1380 names = []
1381 for entry in self.attachment_list:
1382 names.append(entry.name)
1383 if (name := entry.alternative_name) != entry.name and name:
1384 names.append(name)
1385 return names
1387 def _get_attachment_list(self, name: str) -> list[bytes]:
1388 out = self._get_attachments(name)[name]
1389 if isinstance(out, list):
1390 return out
1391 return [out]
1393 def _get_attachments(
1394 self, filename: Optional[str] = None
1395 ) -> dict[str, Union[bytes, list[bytes]]]:
1396 """
1397 Retrieves all or selected file attachments of the PDF as a dictionary of file names
1398 and the file data as a bytestring.
1400 Args:
1401 filename: If filename is None, then a dictionary of all attachments
1402 will be returned, where the key is the filename and the value
1403 is the content. Otherwise, a dictionary with just a single key
1404 - the filename - and its content will be returned.
1406 Returns:
1407 dictionary of filename -> Union[bytestring or List[ByteString]]
1408 If the filename exists multiple times a list of the different versions will be provided.
1410 """
1411 attachments: dict[str, Union[bytes, list[bytes]]] = {}
1412 for entry in self.attachment_list:
1413 names = set()
1414 alternative_name = entry.alternative_name
1415 if filename is not None:
1416 if filename in {entry.name, alternative_name}:
1417 name = entry.name if filename == entry.name else alternative_name
1418 names.add(name)
1419 else:
1420 continue
1421 else:
1422 names = {entry.name, alternative_name}
1424 for name in names:
1425 if name is None:
1426 continue
1427 if name in attachments:
1428 if not isinstance(attachments[name], list):
1429 attachments[name] = [attachments[name]] # type:ignore
1430 attachments[name].append(entry.content) # type:ignore
1431 else:
1432 attachments[name] = entry.content
1433 return attachments
@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    This method returns a dictionary that maps a mime-type to its
    representation.

    This is an abstract method: it has no implementation here and must be
    provided by concrete subclasses.

    Args:
        include: Optional iterable of strings; presumably the mime-types
            to include - see the IPython documentation linked below.
        exclude: Optional iterable of strings; presumably the mime-types
            to exclude - see the IPython documentation linked below.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    ...  # pragma: no cover
class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on every lookup.

    The underlying dictionary stores ``(callable, argument)`` pairs; reading
    a key calls ``callable(argument)`` and returns the result.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # accepts the same arguments as dict()
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        # only the keys are shown so that no value has to be computed
        key_list = list(self)
        return f"LazyDict(keys={key_list})"