1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3# Copyright (c) 2024, Pubpub-ZZ
4#
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions are
9# met:
10#
11# * Redistributions of source code must retain the above copyright notice,
12# this list of conditions and the following disclaimer.
13# * Redistributions in binary form must reproduce the above copyright notice,
14# this list of conditions and the following disclaimer in the documentation
15# and/or other materials provided with the distribution.
16# * The name of the author may not be used to endorse or promote products
17# derived from this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29# POSSIBILITY OF SUCH DAMAGE.
30
31import struct
32import zlib
33from abc import abstractmethod
34from datetime import datetime
35from typing import (
36 Any,
37 Dict,
38 Generator,
39 Iterable,
40 Iterator,
41 List,
42 Mapping,
43 Optional,
44 Tuple,
45 Union,
46 cast,
47)
48
49from ._encryption import Encryption
50from ._page import PageObject, _VirtualList
51from ._page_labels import index2label as page_index2page_label
52from ._utils import (
53 deprecate_with_replacement,
54 logger_warning,
55 parse_iso8824_date,
56)
57from .constants import CatalogAttributes as CA
58from .constants import CatalogDictionary as CD
59from .constants import (
60 CheckboxRadioButtonAttributes,
61 GoToActionArguments,
62 UserAccessPermissions,
63)
64from .constants import Core as CO
65from .constants import DocumentInformationAttributes as DI
66from .constants import FieldDictionaryAttributes as FA
67from .constants import PageAttributes as PG
68from .constants import PagesAttributes as PA
69from .errors import PdfReadError, PyPdfError
70from .generic import (
71 ArrayObject,
72 BooleanObject,
73 ByteStringObject,
74 Destination,
75 DictionaryObject,
76 EncodedStreamObject,
77 Field,
78 Fit,
79 FloatObject,
80 IndirectObject,
81 NameObject,
82 NullObject,
83 NumberObject,
84 PdfObject,
85 TextStringObject,
86 TreeObject,
87 ViewerPreferences,
88 create_string_object,
89 is_null_or_none,
90)
91from .generic._files import EmbeddedFile
92from .types import OutlineType, PagemodeType
93from .xmp import XmpInformation
94
95
def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]:
    """
    Decode a big-endian byte string of at most eight bytes into an integer.

    The input is left-padded with zero bytes and only the trailing eight
    bytes are interpreted, as a signed 64-bit big-endian value.

    Args:
        d: The raw bytes to decode.
        size: The declared width in bytes; it is only validated here.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Pad on the left, then keep exactly the last 8 bytes.
    last_eight = (bytes(8) + d)[-8:]
    return int.from_bytes(last_eight, byteorder="big", signed=True)
102
103
class DocumentInformation(DictionaryObject):
    """
    Dictionary holding the basic document metadata of a PDF file.

    Instances are obtained through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every text entry is exposed twice, e.g. ``author`` and ``author_raw``.
    The plain property goes through :meth:`_get_text`, which hands back a
    ``TextStringObject`` unchanged and converts a ``ByteStringObject`` with
    ``str()``; this makes it the convenient choice for display. The ``_raw``
    property returns the stored entry untouched, so it may be a
    ``ByteStringObject`` and needs more care from the caller.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Return the entry stored under ``key`` as text.

        A ``TextStringObject`` is returned as is, a ``ByteStringObject`` is
        converted with ``str()``; anything else (including a missing entry)
        yields ``None``.
        """
        value = self.get(key, None)
        if isinstance(value, TextStringObject):
            return value
        return str(value) if isinstance(value, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        ``None`` when the entry is missing or falsy. Otherwise the text form
        of the entry, falling back to ``get_object()`` of the stored entry
        when no text form is available.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """The stored title entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        ``None`` when the author is not specified as a string entry.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """The stored author entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        ``None`` when the subject is not specified as a string entry.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """The stored subject entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For a document converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        ``None`` when the creator is not specified as a string entry.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """The stored creator entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For a document converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. ``None`` when the producer is not specified as a string
        entry.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """The stored producer entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the creation date, parsed by ``parse_iso8824_date``."""
        date_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(date_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        The stored creation date entry, unconverted (may be a
        ``ByteStringObject``).

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the modification date, parsed by
        ``parse_iso8824_date``.

        This is the date and time the document was most recently modified.
        """
        date_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(date_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        The stored modification date entry, unconverted (may be a
        ``ByteStringObject``).

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        ``None`` when keywords are not specified as a string entry.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """The stored keywords entry, unconverted (may be a ``ByteStringObject``)."""
        return self.get(DI.KEYWORDS)
262
263
264class PdfDocCommon:
265 """
266 Common functions from PdfWriter and PdfReader objects.
267
268 This root class is strongly abstracted.
269 """
270
271 strict: bool = False # default
272
273 flattened_pages: Optional[List[PageObject]] = None
274
275 _encryption: Optional[Encryption] = None
276
277 _readonly: bool = False
278
@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Dictionary in which the other members of this class look up the
    catalog entries (``/Pages``, ``/OpenAction``, ...).

    Declared abstract: no implementation is given in this class.
    """
    ...  # pragma: no cover
283
@property
@abstractmethod
def pdf_header(self) -> str:
    """
    The PDF header, as a string.

    Declared abstract: no implementation is given in this class.
    NOTE(review): presumably the ``%PDF-x.y`` marker - confirm against the
    concrete implementations.
    """
    ...  # pragma: no cover
288
@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Look up a PDF object from an object number or an indirect reference.

    Declared abstract: no implementation is given in this class.
    NOTE(review): the conditions under which ``None`` is returned are
    defined by the implementations - confirm there.
    """
    ...  # pragma: no cover
294
@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Replace the object designated by ``indirect`` with ``obj``.

    Declared abstract: no implementation is given in this class. It is
    called by ``viewer_preferences`` to store a wrapped dictionary under its
    existing indirect reference.
    NOTE(review): the meaning of the returned object is defined by the
    implementations - confirm there.
    """
    ...  # pragma: no cover
298
@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Dictionary that ``metadata`` copies into a ``DocumentInformation``,
    or ``None`` (in which case ``metadata`` is ``None`` as well).

    Declared abstract: no implementation is given in this class.
    """
    ...  # pragma: no cover
303
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Return a copy of the document information dictionary, if there is one.

    The entries of ``self._info`` are copied into a fresh
    :class:`DocumentInformation`; ``None`` is returned when ``self._info``
    is ``None``.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.
    """
    if self._info is None:
        return None
    info_copy = DocumentInformation()
    info_copy.update(self._info)
    return info_copy
318
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    Placeholder for the XMP metadata accessor.

    The body is empty, so this base version evaluates to ``None``.
    NOTE(review): presumably overridden by the concrete classes - confirm.
    """
    ...  # pragma: no cover
322
@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the catalog's viewer preferences as a ``ViewerPreferences``.

    ``None`` is returned when the catalog has no such entry. An entry that
    is not yet a ``ViewerPreferences`` is wrapped in one, and the wrapper is
    stored back: through ``_replace_object`` when it carries an indirect
    reference, directly in the catalog otherwise.
    """
    entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if entry is None:
        return None
    prefs = entry.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs
337
def get_num_pages(self) -> int:
    """
    Return the number of pages in this PDF file.

    For an encrypted document the ``/Count`` entry of the page tree root is
    returned, because flattening the pages does not work in that case.
    Otherwise the pages are flattened on first use and counted.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    if not self.is_encrypted:
        if self.flattened_pages is None:
            self._flatten(self._readonly)
        pages = self.flattened_pages
        assert pages is not None
        return len(pages)
    page_tree_root = self.root_object["/Pages"]
    return page_tree_root["/Count"]  # type: ignore
358
def get_page(self, page_number: int) -> PageObject:
    """
    Return the page stored at index ``page_number``.

    The flattened page list is built on first use. Most of the time
    ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    pages = self.flattened_pages
    if pages is None:
        self._flatten(self._readonly)
        pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]
376
def _get_page_in_node(
    self,
    page_number: int,
) -> Tuple[DictionaryObject, int]:
    """
    Retrieve the node and position within the /Kids containing the page.
    If page_number is greater than the number of pages, it returns the top node, -1.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, index)`` where ``node["/Kids"][index]`` is the
        page, or ``(top node, -1)`` when the page tree root reports fewer
        pages than requested.

    Raises:
        PyPdfError: If a node claims to contain the page but none of its
            kids does.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def recursive_call(
        node: DictionaryObject, mi: int
    ) -> Tuple[Optional[PdfObject], int]:
        # ``mi`` is the index of the first page found at or below ``node``.
        # Result is either (found node, position) or (None, index of the
        # first page located after ``node``).
        ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":
            if page_number == mi:
                # -1 tells the caller that ``node`` itself is the page
                return node, -1
            return None, mi + 1
        if (page_number - mi) >= ma:  # not in nodes below
            if node == top:
                return top, -1
            # skip the whole subtree
            return None, mi + ma
        for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
            kid = cast(DictionaryObject, kid.get_object())
            n, i = recursive_call(kid, mi)
            if n is not None:  # page has just been found ...
                if i < 0:  # ... just below!
                    return node, idx
                # ... at lower levels
                return n, i
            # not found in this kid: continue counting after it
            mi = i
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = recursive_call(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx
413
@property
def named_destinations(self) -> Dict[str, Destination]:
    """
    A read-only dictionary which maps names to destinations.

    The dictionary is rebuilt by ``_get_named_destinations`` on every access.
    """
    return self._get_named_destinations()
418
def get_named_dest_root(self) -> ArrayObject:
    """
    Return the ``/Names`` array of the named destinations tree.

    The array is looked up under ``/Names`` -> ``/Dests`` in the catalog.
    When the destinations dictionary exists without such an array, an empty
    one is attached to it. When the ``/Names`` or ``/Dests`` dictionary
    itself is missing (or is not a dictionary), it is created only if this
    object provides ``_add_object``; otherwise the returned empty array is
    not attached to the document.
    """
    catalog = self.root_object
    result = ArrayObject()

    names_entry = catalog[CA.NAMES] if CA.NAMES in catalog else None
    if isinstance(names_entry, DictionaryObject):
        dests_entry = names_entry[CA.DESTS] if CA.DESTS in names_entry else None
        if isinstance(dests_entry, DictionaryObject):
            # §3.6.3 Name Dictionary (PDF spec 1.7)
            if CA.NAMES in dests_entry:
                # §7.9.6, entries in a name tree node dictionary
                return cast(ArrayObject, dests_entry[CA.NAMES])
            dests_entry[NameObject(CA.NAMES)] = result
            return result
        if hasattr(self, "_add_object"):
            # /Names exists but has no usable /Dests: register a new one
            new_dests = DictionaryObject()
            names_entry[NameObject(CA.DESTS)] = self._add_object(new_dests)
            new_dests[NameObject(CA.NAMES)] = result
        return result

    if hasattr(self, "_add_object"):
        # no usable /Names in the catalog: create the whole chain
        new_names = DictionaryObject()
        catalog[NameObject(CA.NAMES)] = self._add_object(new_names)
        new_dests = DictionaryObject()
        new_names[NameObject(CA.DESTS)] = self._add_object(new_dests)
        new_dests[NameObject(CA.NAMES)] = result
    return result
451
452 ## common
def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[Dict[str, Destination]] = None,
) -> Dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    A call without ``retval`` is the top-level call: the tree is then taken
    from the catalog, and any ``tree`` argument is only kept when the
    catalog provides none.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        retval = {}
        catalog = self.root_object

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names:
                tree = cast(TreeObject, names[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PA.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PA.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
    # §7.9.6, entries in a name tree node dictionary
    elif CA.NAMES in tree:  # /Kids and /Names are mutually exclusive (§7.9.6)
        # the entries are read as alternating key, value items
        names = cast(DictionaryObject, tree[CA.NAMES])
        i = 0
        while i < len(names):
            original_key = names[i].get_object()
            i += 1
            if not isinstance(original_key, (bytes, str)):
                # not a usable key: advance by one item only and retry
                continue
            key = str(original_key)
            try:
                value = names[i].get_object()
            except IndexError:
                # trailing key without a value
                break
            i += 1
            if isinstance(value, DictionaryObject):
                # a dictionary value carries the destination under /D
                if "/D" in value:
                    value = value["/D"]
                else:
                    continue
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[key] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
        for k__, v__ in tree.items():
            val = v__.get_object()
            if isinstance(val, DictionaryObject):
                if "/D" in val:
                    val = val["/D"].get_object()
                else:
                    continue
            dest = self._build_destination(k__, val)
            if dest is not None:
                retval[k__] = dest
    return retval
524
525 # A select group of relevant field attributes. For the complete list,
526 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.
527
def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[Dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[List[PdfObject]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Collect the interactive form fields of this PDF, if it has any.

    The *tree*, *retval*, *stack* parameters are for recursive use: a call
    without *retval* is the top-level call and starts from the catalog's
    AcroForm entry with a fresh result and a fresh stack.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())

    if retval is None:
        # top-level call: locate the AcroForm tree
        catalog = self.root_object
        if CD.ACRO_FORM not in catalog:
            return None
        tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
        retval, stack = {}, []

    if tree is None:
        return retval
    assert stack is not None

    if "/Fields" in tree:
        for reference in cast(ArrayObject, tree["/Fields"]):
            self._build_field(
                reference.get_object(), retval, fileobj, field_attributes, stack
            )
    elif any(attr in tree for attr in field_attributes):
        # the tree itself is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval
577
578 def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
579 if "/TM" in parent:
580 return cast(str, parent["/TM"])
581 if "/Parent" in parent:
582 return (
583 self._get_qualified_field_name(
584 cast(DictionaryObject, parent["/Parent"])
585 )
586 + "."
587 + cast(str, parent.get("/T", ""))
588 )
589 return cast(str, parent.get("/T", ""))
590
def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: Dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: List[PdfObject],
) -> None:
    """
    Register ``field`` in ``retval`` and then process its kids.

    Dictionaries without ``/T`` and ``/TM`` are ignored. For choice fields
    and buttons a ``/_States_`` entry listing the possible states is added
    to the stored :class:`Field`.

    Args:
        field: The field dictionary to register.
        retval: Result dictionary, updated in place; keyed by the name
            returned by ``_get_qualified_field_name``.
        fileobj: Optional report target; when truthy the field is written
            to it followed by a newline.
        field_attributes: Mapping of attribute key to label, passed on to
            ``_write_field``.
        stack: Already parsed objects, passed on to ``_check_kids``.
    """
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    obj = retval[key].indirect_reference.get_object()  # to get the full object
    if obj.get(FA.FT, "") == "/Ch":
        # choice field: the states are its options
        retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
    if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
        # Checkbox
        retval[key][NameObject("/_States_")] = ArrayObject(
            list(obj["/AP"]["/N"].keys())
        )
        if "/Off" not in retval[key]["/_States_"]:
            retval[key][NameObject("/_States_")].append(NameObject("/Off"))
    elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # radio button group: union of the kids' appearance states,
        # in order of first occurrence
        states: List[str] = []
        retval[key][NameObject("/_States_")] = ArrayObject(states)
        for k in obj.get(FA.Kids, {}):
            k = k.get_object()
            for s in list(k["/AP"]["/N"].keys()):
                if s not in states:
                    states.append(s)
            retval[key][NameObject("/_States_")] = ArrayObject(states)
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in retval[key]["/_States_"]
        ):
            # with the NoToggleToOff flag set, "/Off" is not offered as a state
            del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)
632
def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: List[PdfObject],
) -> None:
    """
    Run ``get_fields`` on every kid of ``tree``, at most once per tree.

    ``stack`` records the trees already handled; a tree found in it is
    reported with a warning and not processed again.
    """
    already_parsed = tree in stack
    if already_parsed:
        logger_warning(
            f"{self._get_qualified_field_name(tree)} already parsed", __name__
        )
        return
    stack.append(tree)
    if PA.KIDS not in tree:
        return
    # recurse down the tree
    for kid_reference in tree[PA.KIDS]:  # type: ignore
        self.get_fields(kid_reference.get_object(), retval, fileobj, stack)
651
def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``label: value`` report line to ``fileobj`` for each attribute of ``field``.

    The kids and additional-actions attributes are left out. The field type
    is written with a readable name (and skipped when the type is not one
    of the four known ones); for the parent only its name is written.
    Attributes missing from ``field`` are silently skipped.

    Args:
        fileobj: Object with a ``write`` method receiving the report.
        field: The field dictionary to report.
        field_attributes: Mapping of attribute key to the label to print.
    """
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    reported = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in reported:
        if attr in (FA.Kids, FA.AA):
            continue
        label = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Make the field type value clearer
                field_type = field[attr]
                if field_type in readable_types:
                    fileobj.write(f"{label}: {readable_types[field_type]}\n")
            elif attr == FA.Parent:
                # Only the parent's name is written
                try:
                    parent_name = field[attr][FA.TM]
                except KeyError:
                    parent_name = field[attr][FA.T]
                fileobj.write(f"{label}: {parent_name}\n")
            else:
                fileobj.write(f"{label}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass
688
def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]:
    """
    Return the values of the text form fields (``/FT`` equal to ``/Tx``).

    Args:
        full_qualified_name: If true, use the keys returned by
            ``get_fields``; otherwise use each field's ``/T`` entry.

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field (its ``/V`` entry).

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """
    all_fields = self.get_fields()
    if all_fields is None:
        return {}

    text_values: Dict[Any, Any] = {}
    for qualified_name, field in all_fields.items():
        if field.get("/FT") != "/Tx":
            continue
        if full_qualified_name:
            name = qualified_name
        else:
            name = cast(str, field["/T"])
            if name in text_values:
                # name already taken: number it after the suffixed ones
                prefix = name + "."
                suffixed = sum(
                    1 for existing in text_values if existing.startswith(prefix)
                )
                name = prefix + str(suffixed + 2)
        text_values[name] = field.get("/V")
    return text_values
726
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If ``field`` cannot be resolved through its indirect
            reference, or if neither it nor one of its ancestors has a
            ``/FT`` entry.

    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look up ``key`` on ``obj`` or, failing that, along its /Parent chain.
        if key in obj:
            return obj[key]
        if "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Pages showing one widget annotation: its /P entry when present,
        # otherwise every page listing the widget in its /Annots.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        # The default must be an empty sequence, not "": ``ref in ""``
        # raises TypeError for a non-str left operand, which made the scan
        # fail on any page without /Annots.
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_get_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)
    # Entries that are not PageObject instances are mapped to the page
    # found through their indirect reference.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]
796
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned through ``create_string_object``, an array
    entry as a ``Destination`` titled ``"OpenAction"``; any other kind of
    entry yields ``None``.

    Raises:
        Exception: If a destination is invalid. The original error is
            attached as the cause.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if isinstance(oa, ArrayObject):
        try:
            page, typ, *array = oa
            fit = Fit(typ, tuple(array))
            return Destination("OpenAction", page, fit)
        except Exception as exc:
            # chain the cause so the original traceback is not lost
            raise Exception(f"Invalid Destination {oa}: {exc}") from exc
    return None

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Setting is not supported here: always raises ``NotImplementedError``."""
    raise NotImplementedError("No setter for open_destination")
830
@property
def outline(self) -> OutlineType:
    """
    Read-only property for the outline present in the document
    (i.e., a collection of 'outline items' which are also known as
    'bookmarks').

    The outline is rebuilt by ``_get_outline`` on every access.
    """
    return self._get_outline()
839
def _get_outline(
    self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
) -> OutlineType:
    """
    Build the outline as a nested list, starting at ``node``.

    A call without ``outline`` is the top-level call: the first item is
    then taken from the catalog's outline dictionary and
    ``self._named_destinations`` is refreshed for ``_build_outline_item``.

    Args:
        node: First outline item of the level to read.
        outline: List to append to (for recursive use).

    Returns:
        The list of outline items; the children of an item follow it as a
        nested list.
    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            lines = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(lines, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(lines) and "/First" in lines:
                node = cast(DictionaryObject, lines["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # walk the siblings of this level through their /Next links
    while True:
        outline_obj = self._build_outline_item(node)
        if outline_obj:
            outline.append(outline_obj)

        # check for sub-outline
        if "/First" in node:
            sub_outline: List[Any] = []
            self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline)
            if sub_outline:
                outline.append(sub_outline)

        if "/Next" not in node:
            break
        node = cast(DictionaryObject, node["/Next"])

    return outline
880
@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    The value is the threads entry of the catalog: an array of dictionaries
    with "/F" (the first bead in the thread) and "/I" (a thread information
    dictionary containing information about the thread, such as its title,
    author, and creation date) properties. ``None`` is returned when the
    catalog has no such entry, i.e. there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    return cast("ArrayObject", catalog[CO.THREADS]) if CO.THREADS in catalog else None
901
@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number matching ``indirect_reference``.

    Declared abstract: no implementation is given in this class. The
    callers ``get_page_number`` and ``get_destination_page_number`` document
    a ``None`` result as "page is not found".
    """
    ...  # pragma: no cover
907
def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    The lookup is delegated to ``_get_page_number_by_indirect`` using the
    page's ``indirect_reference``.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    return self._get_page_number_by_indirect(page.indirect_reference)
921
def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    The lookup is delegated to ``_get_page_number_by_indirect`` using the
    destination's ``page`` attribute.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    return self._get_page_number_by_indirect(destination.page)
934
def _build_destination(
    self,
    title: str,
    array: Optional[
        List[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` named ``title`` from a destination array.

    Args:
        title: Title given to the destination.
        array: ``[page, fit type, *fit arguments]``. ``None``, a
            ``NullObject``, a string or an empty array stand for a missing
            or invalid destination.

    Returns:
        The destination. A missing destination points to a ``NullObject``
        page with the default fit. When the fit cannot be built and the
        document is not strict, a destination to the first page is returned
        instead.

    Raises:
        PdfReadError: If the fit cannot be built and ``self.strict`` is set.
    """
    # handle outline items with missing or invalid destination
    is_missing = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if is_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *array = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=array))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {array}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_reference = self.pages[0].indirect_reference
        if first_page_reference is None:
            fallback_page = NullObject()
        else:
            fallback_page = first_page_reference
        return Destination(title, fallback_page, Fit.fit())
964
def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Convert one outline item dictionary into a ``Destination``.

    The target is read from a ``/GoTo`` action (``/A``) or from ``/Dest``.
    Named destinations are resolved through ``self._named_destinations``,
    which ``_get_outline`` assigns before items are built.

    Args:
        node: The outline item dictionary.

    Returns:
        The destination built for the item, carrying the item's color,
        style and child-count entries plus an ``/%is_open%`` flag.

    Raises:
        PdfReadError: In strict mode, when ``/Title`` is missing, when a
            ``/GoTo`` action has no ``/D``, or when the destination has an
            unexpected type.
    """
    dest, title, outline_item = None, None, None

    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        # non-strict mode: continue with an empty title
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if isinstance(dest, ArrayObject):
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[dest].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif dest is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, dest)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            f"Removed unexpected destination {dest!r} from destination",
            __name__,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
        if "/F" in node:
            # specifies style characteristics bold and/or italic
            # with 1=italic, 2=bold, 3=both
            outline_item[NameObject("/F")] = node["/F"]
        if "/Count" in node:
            # absolute value = num. visible children
            # with positive = open/unfolded, negative = closed/folded
            outline_item[NameObject("/Count")] = node["/Count"]
        # if count is 0 we will consider it as open (to have available is_open)
        outline_item[NameObject("/%is_open%")] = BooleanObject(
            node.get("/Count", 0) >= 0
        )
    # keep a link back to the source dictionary
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        # node has no indirect_reference attribute: nothing to copy
        pass
    return outline_item
1041
1042 @property
1043 def pages(self) -> List[PageObject]:
1044 """
1045 Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`.
1046 This property allows to get a page or a range of pages.
1047
1048 Note:
1049 For PdfWriter only: Provides the capability to remove a page/range of
1050 page from the list (using the del operator). Remember: Only the page
1051 entry is removed, as the objects beneath can be used elsewhere. A
1052 solution to completely remove them - if they are not used anywhere - is
1053 to write to a buffer/temporary file and then load it into a new
1054 PdfWriter.
1055
1056 """
1057 return _VirtualList(self.get_num_pages, self.get_page) # type: ignore
1058
1059 @property
1060 def page_labels(self) -> List[str]:
1061 """
1062 A list of labels for the pages in this document.
1063
1064 This property is read-only. The labels are in the order that the pages
1065 appear in the document.
1066 """
1067 return [page_index2page_label(self, i) for i in range(len(self.pages))]
1068
1069 @property
1070 def page_layout(self) -> Optional[str]:
1071 """
1072 Get the page layout currently being used.
1073
1074 .. list-table:: Valid ``layout`` values
1075 :widths: 50 200
1076
1077 * - /NoLayout
1078 - Layout explicitly not specified
1079 * - /SinglePage
1080 - Show one page at a time
1081 * - /OneColumn
1082 - Show one column at a time
1083 * - /TwoColumnLeft
1084 - Show pages in two columns, odd-numbered pages on the left
1085 * - /TwoColumnRight
1086 - Show pages in two columns, odd-numbered pages on the right
1087 * - /TwoPageLeft
1088 - Show two pages at a time, odd-numbered pages on the left
1089 * - /TwoPageRight
1090 - Show two pages at a time, odd-numbered pages on the right
1091 """
1092 try:
1093 return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
1094 except KeyError:
1095 return None
1096
1097 @property
1098 def page_mode(self) -> Optional[PagemodeType]:
1099 """
1100 Get the page mode currently being used.
1101
1102 .. list-table:: Valid ``mode`` values
1103 :widths: 50 200
1104
1105 * - /UseNone
1106 - Do not show outline or thumbnails panels
1107 * - /UseOutlines
1108 - Show outline (aka bookmarks) panel
1109 * - /UseThumbs
1110 - Show page thumbnails panel
1111 * - /FullScreen
1112 - Fullscreen view
1113 * - /UseOC
1114 - Show Optional Content Group (OCG) panel
1115 * - /UseAttachments
1116 - Show attachments panel
1117 """
1118 try:
1119 return self.root_object["/PageMode"] # type: ignore
1120 except KeyError:
1121 return None
1122
1123 def _flatten(
1124 self,
1125 list_only: bool = False,
1126 pages: Union[None, DictionaryObject, PageObject] = None,
1127 inherit: Optional[Dict[str, Any]] = None,
1128 indirect_reference: Optional[IndirectObject] = None,
1129 ) -> None:
1130 """
1131 Process the document pages to ease searching.
1132
1133 Attributes of a page may inherit from ancestor nodes
1134 in the page tree. Flattening means moving
1135 any inheritance data into descendant nodes,
1136 effectively removing the inheritance dependency.
1137
1138 Note: It is distinct from another use of "flattening" applied to PDFs.
1139 Flattening a PDF also means combining all the contents into one single layer
1140 and making the file less editable.
1141
1142 Args:
1143 list_only: Will only list the pages within _flatten_pages.
1144 pages:
1145 inherit:
1146 indirect_reference: Used recursively to flatten the /Pages object.
1147
1148 """
1149 inheritable_page_attributes = (
1150 NameObject(PG.RESOURCES),
1151 NameObject(PG.MEDIABOX),
1152 NameObject(PG.CROPBOX),
1153 NameObject(PG.ROTATE),
1154 )
1155 if inherit is None:
1156 inherit = {}
1157 if pages is None:
1158 # Fix issue 327: set flattened_pages attribute only for
1159 # decrypted file
1160 catalog = self.root_object
1161 pages = catalog.get("/Pages").get_object() # type: ignore
1162 if not isinstance(pages, DictionaryObject):
1163 raise PdfReadError("Invalid object in /Pages")
1164 self.flattened_pages = []
1165
1166 if PA.TYPE in pages:
1167 t = cast(str, pages[PA.TYPE])
1168 # if the page tree node has no /Type, consider as a page if /Kids is also missing
1169 elif PA.KIDS not in pages:
1170 t = "/Page"
1171 else:
1172 t = "/Pages"
1173
1174 if t == "/Pages":
1175 for attr in inheritable_page_attributes:
1176 if attr in pages:
1177 inherit[attr] = pages[attr]
1178 for page in cast(ArrayObject, pages[PA.KIDS]):
1179 addt = {}
1180 if isinstance(page, IndirectObject):
1181 addt["indirect_reference"] = page
1182 obj = page.get_object()
1183 if obj:
1184 # damaged file may have invalid child in /Pages
1185 try:
1186 self._flatten(list_only, obj, inherit, **addt)
1187 except RecursionError:
1188 raise PdfReadError(
1189 "Maximum recursion depth reached during page flattening."
1190 )
1191 elif t == "/Page":
1192 for attr_in, value in inherit.items():
1193 # if the page has its own value, it does not inherit the
1194 # parent's value
1195 if attr_in not in pages:
1196 pages[attr_in] = value
1197 page_obj = PageObject(self, indirect_reference)
1198 if not list_only:
1199 page_obj.update(pages)
1200
1201 # TODO: Could flattened_pages be None at this point?
1202 self.flattened_pages.append(page_obj) # type: ignore
1203
1204 def remove_page(
1205 self,
1206 page: Union[int, PageObject, IndirectObject],
1207 clean: bool = False,
1208 ) -> None:
1209 """
1210 Remove page from pages list.
1211
1212 Args:
1213 page:
1214 * :class:`int`: Page number to be removed.
1215 * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
1216 only the first one will be removed.
1217 * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.
1218
1219 clean: replace PageObject with NullObject to prevent annotations
1220 or destinations to reference a detached page.
1221
1222 """
1223 if self.flattened_pages is None:
1224 self._flatten(self._readonly)
1225 assert self.flattened_pages is not None
1226 if isinstance(page, IndirectObject):
1227 p = page.get_object()
1228 if not isinstance(p, PageObject):
1229 logger_warning("IndirectObject is not referencing a page", __name__)
1230 return
1231 page = p
1232
1233 if not isinstance(page, int):
1234 try:
1235 page = self.flattened_pages.index(page)
1236 except ValueError:
1237 logger_warning("Cannot find page in pages", __name__)
1238 return
1239 if not (0 <= page < len(self.flattened_pages)):
1240 logger_warning("Page number is out of range", __name__)
1241 return
1242
1243 ind = self.pages[page].indirect_reference
1244 del self.pages[page]
1245 if clean and ind is not None:
1246 self._replace_object(ind, NullObject())
1247
1248 def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
1249 """
1250 Used to ease development.
1251
1252 This is equivalent to generic.IndirectObject(num,gen,self).get_object()
1253
1254 Args:
1255 num: The object number of the indirect object.
1256 gen: The generation number of the indirect object.
1257
1258 Returns:
1259 A PdfObject
1260
1261 """
1262 return IndirectObject(num, gen, self).get_object()
1263
1264 def decode_permissions(
1265 self, permissions_code: int
1266 ) -> Dict[str, bool]: # pragma: no cover
1267 """Take the permissions as an integer, return the allowed access."""
1268 deprecate_with_replacement(
1269 old_name="decode_permissions",
1270 new_name="user_access_permissions",
1271 removed_in="5.0.0",
1272 )
1273
1274 permissions_mapping = {
1275 "print": UserAccessPermissions.PRINT,
1276 "modify": UserAccessPermissions.MODIFY,
1277 "copy": UserAccessPermissions.EXTRACT,
1278 "annotations": UserAccessPermissions.ADD_OR_MODIFY,
1279 "forms": UserAccessPermissions.FILL_FORM_FIELDS,
1280 # Do not fix typo, as part of official, but deprecated API.
1281 "accessability": UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS,
1282 "assemble": UserAccessPermissions.ASSEMBLE_DOC,
1283 "print_high_quality": UserAccessPermissions.PRINT_TO_REPRESENTATION,
1284 }
1285
1286 return {
1287 key: permissions_code & flag != 0
1288 for key, flag in permissions_mapping.items()
1289 }
1290
1291 @property
1292 def user_access_permissions(self) -> Optional[UserAccessPermissions]:
1293 """Get the user access permissions for encrypted documents. Returns None if not encrypted."""
1294 if self._encryption is None:
1295 return None
1296 return UserAccessPermissions(self._encryption.P)
1297
    @property
    @abstractmethod
    def is_encrypted(self) -> bool:
        """
        Read-only boolean property showing whether this PDF file is encrypted.

        This is an abstract property: the body below is only a placeholder
        and concrete subclasses have to provide the implementation.

        Note that this property, if true, will remain true even after the
        :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
        """
        ...  # pragma: no cover
1308
1309 @property
1310 def xfa(self) -> Optional[Dict[str, Any]]:
1311 tree: Optional[TreeObject] = None
1312 retval: Dict[str, Any] = {}
1313 catalog = self.root_object
1314
1315 if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
1316 return None
1317
1318 tree = cast(TreeObject, catalog["/AcroForm"])
1319
1320 if "/XFA" in tree:
1321 fields = cast(ArrayObject, tree["/XFA"])
1322 i = iter(fields)
1323 for f in i:
1324 tag = f
1325 f = next(i)
1326 if isinstance(f, IndirectObject):
1327 field = cast(Optional[EncodedStreamObject], f.get_object())
1328 if field:
1329 es = zlib.decompress(field._data)
1330 retval[tag] = es
1331 return retval
1332
1333 @property
1334 def attachments(self) -> Mapping[str, List[bytes]]:
1335 """Mapping of attachment filenames to their content."""
1336 return LazyDict(
1337 {
1338 name: (self._get_attachment_list, name)
1339 for name in self._list_attachments()
1340 }
1341 )
1342
1343 @property
1344 def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
1345 """Iterable of attachment objects."""
1346 yield from EmbeddedFile._load(self.root_object)
1347
1348 def _list_attachments(self) -> List[str]:
1349 """
1350 Retrieves the list of filenames of file attachments.
1351
1352 Returns:
1353 list of filenames
1354
1355 """
1356 names = []
1357 for entry in self.attachment_list:
1358 names.append(entry.name)
1359 if (name := entry.alternative_name) != entry.name and name:
1360 names.append(name)
1361 return names
1362
1363 def _get_attachment_list(self, name: str) -> List[bytes]:
1364 out = self._get_attachments(name)[name]
1365 if isinstance(out, list):
1366 return out
1367 return [out]
1368
1369 def _get_attachments(
1370 self, filename: Optional[str] = None
1371 ) -> Dict[str, Union[bytes, List[bytes]]]:
1372 """
1373 Retrieves all or selected file attachments of the PDF as a dictionary of file names
1374 and the file data as a bytestring.
1375
1376 Args:
1377 filename: If filename is None, then a dictionary of all attachments
1378 will be returned, where the key is the filename and the value
1379 is the content. Otherwise, a dictionary with just a single key
1380 - the filename - and its content will be returned.
1381
1382 Returns:
1383 dictionary of filename -> Union[bytestring or List[ByteString]]
1384 If the filename exists multiple times a list of the different versions will be provided.
1385
1386 """
1387 attachments: Dict[str, Union[bytes, List[bytes]]] = {}
1388 for entry in self.attachment_list:
1389 names = set()
1390 alternative_name = entry.alternative_name
1391 if filename is not None:
1392 if filename in {entry.name, alternative_name}:
1393 name = entry.name if filename == entry.name else alternative_name
1394 names.add(name)
1395 else:
1396 continue
1397 else:
1398 names = {entry.name, alternative_name}
1399
1400 for name in names:
1401 if name is None:
1402 continue
1403 if name in attachments:
1404 if not isinstance(attachments[name], list):
1405 attachments[name] = [attachments[name]] # type:ignore
1406 attachments[name].append(entry.content) # type:ignore
1407 else:
1408 attachments[name] = entry.content
1409 return attachments
1410
    @abstractmethod
    def _repr_mimebundle_(
        self,
        include: Union[None, Iterable[str]] = None,
        exclude: Union[None, Iterable[str]] = None,
    ) -> Dict[str, Any]:
        """
        Integration into Jupyter Notebooks.

        This method returns a dictionary that maps a mime-type to its
        representation.

        This is an abstract method: the body below is only a placeholder and
        concrete subclasses have to provide the implementation.

        Args:
            include: presumably the mime-types to restrict the result to, as
                described by the IPython protocol linked below - to be
                confirmed against the implementations.
            exclude: presumably the mime-types to leave out of the result -
                to be confirmed against the implementations.

        Returns:
            Dictionary mapping a mime-type to its representation.

        .. seealso::

            https://ipython.readthedocs.io/en/stable/config/integrating.html
        """
        ...  # pragma: no cover
1428
1429
class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed when they are looked up.

    The underlying dictionary stores ``(callable, argument)`` pairs. Accessing
    a key calls ``callable(argument)`` and returns the result; nothing is
    cached, so every lookup triggers a new call.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # accepts the same arguments as the dict constructor
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        # only the keys are shown, so no value gets computed
        known_keys = [*self]
        return f"LazyDict(keys={known_keys})"