1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3# Copyright (c) 2024, Pubpub-ZZ
4#
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions are
9# met:
10#
11# * Redistributions of source code must retain the above copyright notice,
12# this list of conditions and the following disclaimer.
13# * Redistributions in binary form must reproduce the above copyright notice,
14# this list of conditions and the following disclaimer in the documentation
15# and/or other materials provided with the distribution.
16# * The name of the author may not be used to endorse or promote products
17# derived from this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29# POSSIBILITY OF SUCH DAMAGE.
30
31import struct
32import zlib
33from abc import abstractmethod
34from collections.abc import Generator, Iterable, Iterator, Mapping
35from datetime import datetime
36from typing import (
37 Any,
38 Optional,
39 Union,
40 cast,
41)
42
43from ._encryption import Encryption
44from ._page import PageObject, _VirtualList
45from ._page_labels import index2label as page_index2page_label
46from ._utils import (
47 deprecation_with_replacement,
48 logger_warning,
49 parse_iso8824_date,
50)
51from .constants import CatalogAttributes as CA
52from .constants import CatalogDictionary as CD
53from .constants import (
54 CheckboxRadioButtonAttributes,
55 GoToActionArguments,
56 PagesAttributes,
57 UserAccessPermissions,
58)
59from .constants import Core as CO
60from .constants import DocumentInformationAttributes as DI
61from .constants import FieldDictionaryAttributes as FA
62from .constants import PageAttributes as PG
63from .errors import PdfReadError, PyPdfError
64from .generic import (
65 ArrayObject,
66 BooleanObject,
67 ByteStringObject,
68 Destination,
69 DictionaryObject,
70 EncodedStreamObject,
71 Field,
72 Fit,
73 FloatObject,
74 IndirectObject,
75 NameObject,
76 NullObject,
77 NumberObject,
78 PdfObject,
79 TextStringObject,
80 TreeObject,
81 ViewerPreferences,
82 create_string_object,
83 is_null_or_none,
84)
85from .generic._files import EmbeddedFile
86from .types import OutlineType, PagemodeType
87from .xmp import XmpInformation
88
89
def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode the trailing bytes of *d* as a big-endian signed 64-bit integer.

    Args:
        d: Raw bytes. The value is left-padded with zero bytes and only the
            last eight bytes are decoded.
        size: Declared width in bytes; it is only used for validation.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Same result as unpacking ">q" from the zero-padded 8-byte tail.
    last_eight = (bytes(8) + d)[-8:]
    return int.from_bytes(last_eight, byteorder="big", signed=True)
96
97
class DocumentInformation(DictionaryObject):
    """
    Basic document metadata taken from a PDF file's information dictionary.

    Instances are reachable through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Each textual entry is exposed through *two* properties, for example
    ``author`` and ``author_raw``. The non-raw property yields text (a
    ``TextStringObject``, or the ``str()`` conversion of a
    ``ByteStringObject``) or ``None``, which suits display purposes. The raw
    property hands back the stored entry untouched, which may be a
    ``ByteStringObject``; callers then have to deal with the undecoded value
    themselves.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Fetch the entry stored under *key* as text.

        Returns:
            The stored ``TextStringObject``, ``str(value)`` when the value is
            a ``ByteStringObject``, and ``None`` for anything else
            (including a missing key).

        """
        value = self.get(key, None)
        if isinstance(value, TextStringObject):
            return value
        return str(value) if isinstance(value, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only property accessing the document's title.

        Returns ``None`` when no (truthy) title entry is stored. Otherwise the
        textual form is returned; if that is empty or unavailable, the
        resolved stored object is returned instead.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Title entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only property accessing the document's author.

        Yields the author as text, or ``None`` if no textual author entry
        is present.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Author entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only property accessing the document's subject.

        Yields the subject as text, or ``None`` if no textual subject entry
        is present.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Subject entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only property accessing the document's creator.

        For documents converted to PDF from another format, this names the
        application (e.g. OpenOffice) that produced the original document.
        Yields the creator as text, or ``None`` if no textual creator entry
        is present.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Creator entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only property accessing the document's producer.

        For documents converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Yields the producer as text, or ``None`` if no textual
        producer entry is present.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Producer entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only property accessing the document's creation date."""
        creation_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(creation_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Creation date entry exactly as stored; may be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only property accessing the document's modification date.

        This is the date and time the document was most recently modified.
        """
        modification_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(modification_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Modification date entry exactly as stored; may be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only property accessing the document's keywords.

        Yields the keywords as text, or ``None`` if no textual keywords entry
        is present.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Keywords entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)
256
257
258class PdfDocCommon:
259 """
260 Common functions from PdfWriter and PdfReader objects.
261
262 This root class is strongly abstracted.
263 """
264
265 strict: bool = False # default
266
267 flattened_pages: Optional[list[PageObject]] = None
268
269 _encryption: Optional[Encryption] = None
270
271 _readonly: bool = False
272
@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Abstract property that concrete subclasses must implement.

    Within this class the returned dictionary is used as the document
    catalog: it is indexed with keys such as ``/Pages`` and ``/OpenAction``.
    """
    ...  # pragma: no cover
277
@property
@abstractmethod
def pdf_header(self) -> str:
    """
    Abstract property that concrete subclasses must implement.

    Presumably the PDF file header string; its exact format is not
    visible from this class.
    """
    ...  # pragma: no cover
282
@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Abstract method that concrete subclasses must implement.

    Args:
        indirect_reference: An integer or an ``IndirectObject``
            (presumably identifying the object to look up).

    Returns:
        A ``PdfObject`` or ``None``.

    """
    ...  # pragma: no cover
288
@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Abstract method that concrete subclasses must implement.

    Within this class it is called with an object's own
    ``indirect_reference`` and a replacement object (see
    ``viewer_preferences``); presumably the object stored under *indirect*
    is replaced by *obj*.
    """
    ...  # pragma: no cover
292
@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Abstract property that concrete subclasses must implement.

    The ``metadata`` property copies the returned dictionary into a
    ``DocumentInformation`` instance, and returns ``None`` when this
    property is ``None``.
    """
    ...  # pragma: no cover
297
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    The entries of ``_info`` are copied into a new
    ``DocumentInformation`` instance; ``None`` is returned when ``_info``
    is ``None``.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.
    """
    info = self._info
    if info is None:
        return None
    document_information = DocumentInformation()
    document_information.update(info)
    return document_information
312
@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    Placeholder getter: the body is empty, so this base implementation
    returns ``None``.

    NOTE(review): not marked ``@abstractmethod``; presumably overridden
    by concrete subclasses -- confirm.
    """
    ...  # pragma: no cover
316
@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the catalog's viewer preferences as a ``ViewerPreferences`` object.

    Returns ``None`` when the catalog has no viewer preferences entry. When
    the stored object is not yet a ``ViewerPreferences`` instance it is
    wrapped in one, and the wrapper is written back: through
    ``_replace_object`` if it carries an indirect reference, otherwise
    directly into the catalog.
    """
    stored = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if stored is None:
        return None
    preferences = stored.get_object()
    if isinstance(preferences, ViewerPreferences):
        return preferences
    preferences = ViewerPreferences(preferences)
    if getattr(preferences, "indirect_reference", None) is not None:
        self._replace_object(preferences.indirect_reference, preferences)
    else:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = preferences
    return preferences
331
def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    # An encrypted PDF cannot be flattened, so the count recorded in the
    # page tree root is reported instead.
    if self.is_encrypted:
        page_tree_root = self.root_object["/Pages"]
        return page_tree_root["/Count"]  # type: ignore
    # Otherwise count the flattened pages, building the list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None
    return len(self.flattened_pages)
352
def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    # The flattened page list is built on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    flattened = self.flattened_pages
    assert flattened is not None, "hint for mypy"
    return flattened[page_number]
370
def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Retrieve the node and position within the /Kids containing the page.
    If page_number is greater than the number of pages, it returns the top node, -1.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def locate(
        current: DictionaryObject, first_index: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``first_index`` is the page index of the first page under ``current``.
        page_count = cast(int, current.get("/Count", 1))  # default 1 for /Page types
        if current["/Type"] == "/Page":
            if page_number == first_index:
                return current, -1
            return None, first_index + 1
        if (page_number - first_index) >= page_count:  # not in nodes below
            if current == top:
                return top, -1
            return None, first_index + page_count
        for position, child in enumerate(cast(ArrayObject, current["/Kids"])):
            child = cast(DictionaryObject, child.get_object())
            found, value = locate(child, first_index)
            if found is None:
                # Not below this kid; continue from the next page index.
                first_index = value
                continue
            # A negative value means the page is this very kid.
            return (current, position) if value < 0 else (found, value)
        raise PyPdfError("Unexpectedly cannot find the node.")

    found_node, found_position = locate(top, 0)
    assert isinstance(found_node, DictionaryObject), "mypy"
    return found_node, found_position
407
@property
def named_destinations(self) -> dict[str, Destination]:
    """
    A read-only dictionary which maps names to destinations.

    The mapping is rebuilt by ``_get_named_destinations`` on every access.
    """
    destinations = self._get_named_destinations()
    return destinations
412
def get_named_dest_root(self) -> ArrayObject:
    """
    Return the names array of the catalog's named-destination tree.

    The array is looked up under the catalog's names dictionary and, inside
    it, the destinations dictionary. Missing pieces are created and linked
    in: an existing destinations dictionary simply receives an empty names
    array, while missing dictionaries are only created when this object
    provides ``_add_object``. Without that capability a fresh, unattached
    empty array is returned.
    """
    named_dest = ArrayObject()
    catalog = self.root_object
    has_names_dict = CA.NAMES in catalog and isinstance(
        catalog[CA.NAMES], DictionaryObject
    )

    if has_names_dict:
        names = cast(DictionaryObject, catalog[CA.NAMES])
        if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
            # §3.6.3 Name Dictionary (PDF spec 1.7)
            dests = cast(DictionaryObject, names[CA.DESTS])
            if CA.NAMES in dests:
                # §7.9.6, entries in a name tree node dictionary
                return cast(ArrayObject, dests[CA.NAMES])
            dests[NameObject(CA.NAMES)] = named_dest
            return named_dest
        if hasattr(self, "_add_object"):
            dests = DictionaryObject()
            names[NameObject(CA.DESTS)] = self._add_object(dests)
            dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    if hasattr(self, "_add_object"):
        names = DictionaryObject()
        self.root_object[NameObject(CA.NAMES)] = self._add_object(names)
        dests = DictionaryObject()
        names[NameObject(CA.DESTS)] = self._add_object(dests)
        dests[NameObject(CA.NAMES)] = named_dest
    return named_dest
445
446 ## common
def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    When *retval* is ``None`` (the outermost call), the tree is looked up
    in the catalog, overriding any *tree* argument.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        retval = {}
        catalog = self.root_object

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names_dict = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names_dict:
                tree = cast(TreeObject, names_dict[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
        return retval

    # §7.9.6, entries in a name tree node dictionary
    if CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        entries = cast(DictionaryObject, tree[CA.NAMES])
        # The entries alternate between a key and its value.
        position = 0
        while position < len(entries):
            raw_key = entries[position].get_object()
            position += 1
            if not isinstance(raw_key, (bytes, str)):
                continue
            key = str(raw_key)
            try:
                target = entries[position].get_object()
            except IndexError:
                # Trailing key without a value: stop here.
                break
            position += 1
            if isinstance(target, DictionaryObject):
                if "/D" not in target:
                    continue
                target = target["/D"]
            dest = self._build_destination(key, target)
            if dest is not None:
                retval[key] = dest
        return retval

    # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
    for name, reference in tree.items():
        target = reference.get_object()
        if isinstance(target, DictionaryObject):
            if "/D" not in target:
                continue
            target = target["/D"].get_object()
        dest = self._build_destination(name, target)
        if dest is not None:
            retval[name] = dest
    return retval
518
519 # A select group of relevant field attributes. For the complete list,
520 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.
521
def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.
    When *retval* is ``None`` (the outermost call), the AcroForm entry of
    the catalog is used as the tree and a fresh *stack* is started.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects. A missing stack is treated
            as empty.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
    if retval is None:
        retval = {}
        catalog = self.root_object
        stack = []
        # get the AcroForm tree
        if CD.ACRO_FORM in catalog:
            tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
        else:
            return None
    if stack is None:
        # A caller supplied ``retval`` without ``stack``: start with an empty
        # stack instead of failing on an assertion.
        stack = []
    if is_null_or_none(tree):
        # Covers both a missing tree and a null object stored as the tree.
        return retval
    assert tree is not None, "mypy"
    if "/Fields" in tree:
        fields = cast(ArrayObject, tree["/Fields"])
        for f in fields:
            field = f.get_object()
            self._build_field(field, retval, fileobj, field_attributes, stack)
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval
571
def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
    """
    Build the qualified name of a field dictionary.

    A ``/TM`` entry, when present, is returned as is. Otherwise the ``/T``
    entry (empty string if missing) is used, prefixed with the qualified
    name of ``/Parent`` and a dot when the dictionary has a parent.
    """
    if "/TM" in parent:
        return cast(str, parent["/TM"])
    own_name = cast(str, parent.get("/T", ""))
    if "/Parent" not in parent:
        return own_name
    parent_name = self._get_qualified_field_name(
        cast(DictionaryObject, parent["/Parent"])
    )
    return parent_name + "." + own_name
584
def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register *field* in *retval* and descend into its kids.

    Dictionaries carrying neither ``/T`` nor ``/TM`` are ignored. For
    choice fields and buttons a synthetic ``/_States_`` entry listing the
    available states is added to the stored ``Field``.

    Args:
        field: The field dictionary to register.
        retval: Mapping of qualified field names to ``Field`` objects;
            updated in place.
        fileobj: Optional report file; when truthy, the field's
            attributes are written to it.
        field_attributes: Mapping of attribute keys to display names, passed
            on to ``_write_field``.
        stack: Already parsed objects, passed on to ``_check_kids``.

    """
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    obj = retval[key].indirect_reference.get_object()  # to get the full object
    # /Opt is optional for choice fields; only expose states when present.
    if obj.get(FA.FT, "") == "/Ch" and FA.Opt in obj:
        retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
    if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
        # Checkbox
        retval[key][NameObject("/_States_")] = ArrayObject(
            list(obj["/AP"]["/N"].keys())
        )
        if "/Off" not in retval[key]["/_States_"]:
            retval[key][NameObject("/_States_")].append(NameObject("/Off"))
    elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        states: list[str] = []
        for k in obj.get(FA.Kids, {}):
            k = k.get_object()
            # A kid without a normal appearance contributes no state.
            if "/AP" not in k or "/N" not in k["/AP"]:
                continue
            for s in list(k["/AP"]["/N"].keys()):
                if s not in states:
                    states.append(s)
        retval[key][NameObject("/_States_")] = ArrayObject(states)
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in retval[key]["/_States_"]
        ):
            del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)
626
def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Parse the kids of *tree* through ``get_fields``.

    *tree* is appended to *stack* first; a tree already contained in
    *stack* only triggers a warning and is not parsed a second time.
    """
    if tree in stack:
        qualified_name = self._get_qualified_field_name(tree)
        logger_warning(f"{qualified_name} already parsed", __name__)
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for kid_reference in tree[PagesAttributes.KIDS]:  # type: ignore
        self.get_fields(kid_reference.get_object(), retval, fileobj, stack)
645
def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``name: value`` report line per attribute of *field* to *fileobj*.

    The kids and additional-actions attributes are skipped, the field type
    is written with a readable label (unknown types are not written), and
    for the parent only its name is written. Attributes missing from
    *field* are silently left out.
    """
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped_attributes = (
        FA.Kids,
        FA.AA,
    )
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped_attributes:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Make the field type value clearer
                field_type = field[attr]
                if field_type in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field_type]}\n")
            elif attr == FA.Parent:
                # Let's just write the name of the parent
                try:
                    name = field[attr][FA.TM]
                except KeyError:
                    name = field[attr][FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass
682
def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: Use the keys returned by ``get_fields`` instead
            of each field's ``/T`` entry.

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field (``None`` if it has none).

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(k: str, fields: dict[Any, Any]) -> str:
        # First occurrence keeps the bare name; later ones get ".2", ".3", ...
        if k not in fields:
            return k
        prefix = k + "."
        already_suffixed = sum(1 for existing in fields if existing.startswith(prefix))
        return f"{prefix}{already_suffixed + 2}"

    # Retrieve document form fields
    formfields = self.get_fields()
    if formfields is None:
        return {}
    text_fields: dict[str, Any] = {}
    for qualified_name, field in formfields.items():
        if field.get("/FT") != "/Tx":
            continue
        if full_qualified_name:
            target_key = qualified_name
        else:
            target_key = indexed_key(cast(str, field["/T"]), text_fields)
        text_fields[target_key] = field.get("/V")
    return text_fields
720
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If *field* cannot be resolved through its indirect
            reference, or no ``/FT`` entry is found on it or its parents.

    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then on its parents.
        if key in obj:
            return obj[key]
        if "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        return None

    def _pages_of_widget(widget: Any) -> list[Any]:
        # Prefer the widget's own /P entry; otherwise scan every page's
        # annotations for the widget's reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_get_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    candidates: list[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        candidates = _pages_of_widget(field)
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            # Kid that is just a widget, not a field:
            if kid.get("/Subtype", "") == "/Widget" and "/T" not in kid:
                candidates += _pages_of_widget(kid)

    pages: list[PageObject] = []
    for candidate in candidates:
        if isinstance(candidate, PageObject):
            pages.append(candidate)
        else:
            # Map anything else back to the page held by this document.
            pages.append(
                self.pages[self._get_page_number_by_indirect(candidate.indirect_reference)]  # type: ignore
            )
    return pages
790
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry as a
    ``Destination`` titled ``"OpenAction"``; any other kind of entry
    yields ``None``.

    Raises:
        Exception: If a destination is invalid. The original error is
            attached as the cause.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if isinstance(oa, ArrayObject):
        try:
            page, typ, *array = oa
            fit = Fit(typ, tuple(array))
            return Destination("OpenAction", page, fit)
        except Exception as exc:
            # Chain explicitly so the root cause stays attached.
            raise Exception(f"Invalid Destination {oa}: {exc}") from exc
    return None

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Setting is not supported here; always raises ``NotImplementedError``."""
    raise NotImplementedError("No setter for open_destination")
824
@property
def outline(self) -> OutlineType:
    """
    Read-only property for the outline present in the document
    (i.e., a collection of 'outline items' which are also known as
    'bookmarks').

    The outline is rebuilt by ``_get_outline`` on every access.
    """
    document_outline = self._get_outline()
    return document_outline
833
def _get_outline(
    self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
) -> OutlineType:
    """
    Collect the outline items starting at *node* into *outline*.

    When *outline* is ``None`` (the outermost call), the first item is
    looked up in the catalog and the named destinations are cached in
    ``self._named_destinations``. Siblings are followed through ``/Next``;
    the children of an item are appended as a nested list right after it.

    Args:
        node: First outline item to process (for nested calls).
        outline: List receiving the items (for nested calls).

    Returns:
        The list of outline items, with nested lists for sub-outlines.

    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            outlines_dict = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(outlines_dict, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(outlines_dict) and "/First" in outlines_dict:
                node = cast(DictionaryObject, outlines_dict["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # walk the chain of sibling outline items
    current = node
    while True:
        item = self._build_outline_item(current)
        if item:
            outline.append(item)

        # check for sub-outline
        if "/First" in current:
            children: list[Any] = []
            self._get_outline(cast(DictionaryObject, current["/First"]), children)
            if children:
                outline.append(children)

        if "/Next" not in current:
            break
        current = cast(DictionaryObject, current["/Next"])

    return outline
874
@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    The result is an array of dictionaries, each with "/F" (the first bead
    in the thread) and "/I" (a thread information dictionary describing the
    thread, such as its title, author, and creation date) properties, or
    None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    return cast("ArrayObject", catalog[CO.THREADS]) if CO.THREADS in catalog else None
895
@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Abstract method that concrete subclasses must implement.

    Within this class it is called with the indirect reference of a page
    (or the page of a destination) by ``get_page_number`` and
    ``get_destination_page_number``, whose docstrings describe the result
    as the page number, or ``None`` if the page is not found.
    """
    ...  # pragma: no cover
901
def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    The lookup is done through the page's indirect reference.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    page_reference = page.indirect_reference
    return self._get_page_number_by_indirect(page_reference)
915
def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    The lookup is done through the destination's ``page`` attribute.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    target_page = destination.page
    return self._get_page_number_by_indirect(target_page)
928
def _build_destination(
    self,
    title: str,
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Build a ``Destination`` named *title* from a destination array.

    Args:
        title: Title given to the destination.
        array: Destination array: the page, the fit type, then the fit
            arguments. ``None``, a null object, a string or an empty array
            produce a destination with a null page and the default fit.

    Returns:
        The destination. If the fit cannot be built and ``strict`` is off,
        a warning is logged and a destination pointing at the first page is
        returned instead.

    Raises:
        PdfReadError: If the fit cannot be built and ``strict`` is on.

    """
    # handle outline items with missing or invalid destination
    is_empty_array = isinstance(array, ArrayObject) and len(array) == 0
    if array is None or isinstance(array, (NullObject, str)) or is_empty_array:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *fit_args = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=fit_args))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {fit_args}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_reference = self.pages[0].indirect_reference
        if first_page_reference is None:
            return Destination(title, NullObject(), Fit.fit())
        return Destination(title, first_page_reference, Fit.fit())
958
def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Convert an outline item dictionary into a ``Destination``.

    The destination is taken from a ``/GoTo`` action (``/A``) or from
    ``/Dest``. Named destinations are looked up in
    ``self._named_destinations``. Color, style flags and child count are
    copied from *node* when present, and ``/%is_open%`` is derived from the
    sign of the count.

    Args:
        node: The outline item dictionary.

    Returns:
        The outline item, with ``node`` set and, when available, the
        node's indirect reference.

    Raises:
        PdfReadError: In strict mode, if the title or the action's
            destination is missing, or the destination has an unexpected
            type.

    """
    dest = None

    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if isinstance(dest, ArrayObject):
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[dest].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif dest is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, dest)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            f"Removed unexpected destination {dest!r} from destination",
            __name__,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            color = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
            outline_item[NameObject("/C")] = color
        if "/F" in node:
            # specifies style characteristics bold and/or italic
            # with 1=italic, 2=bold, 3=both
            outline_item[NameObject("/F")] = node["/F"]
        if "/Count" in node:
            # absolute value = num. visible children
            # with positive = open/unfolded, negative = closed/folded
            outline_item[NameObject("/Count")] = node["/Count"]
        # if count is 0 we will consider it as open (to have available is_open)
        is_open = node.get("/Count", 0) >= 0
        outline_item[NameObject("/%is_open%")] = BooleanObject(is_open)
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        pass
    return outline_item
1035
@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>`
    entries of the document.

    Single pages as well as ranges of pages can be retrieved through it.

    Note:
        For PdfWriter only: a page or a range of pages can be removed from
        the list with the ``del`` operator. Only the page entry itself is
        removed, because the objects beneath it can be used elsewhere. To
        remove them completely - if they are not used anywhere - write to a
        buffer/temporary file and load the result into a new PdfWriter.

    """
    # The view is built from the page-count and page-getter callables.
    virtual_pages = _VirtualList(self.get_num_pages, self.get_page)
    return virtual_pages  # type: ignore
1052
@property
def page_labels(self) -> list[str]:
    """
    Labels of all pages of this document, in page order.

    This property is read-only.
    """
    labels: list[str] = []
    for index in range(len(self.pages)):
        labels.append(page_index2page_label(self, index))
    return labels
1062
@property
def page_layout(self) -> Optional[str]:
    """
    Page layout stored in the document catalog, or ``None`` if the
    catalog has no such entry.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # The catalog has no page layout entry.
        return None
    return cast(NameObject, layout)
1090
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode stored under ``/PageMode`` in the document catalog, or
    ``None`` if the catalog has no such entry.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # The catalog has no /PageMode entry.
        return None
    return mode  # type: ignore
1116
def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: If true, the ``PageObject`` appended to
            ``flattened_pages`` is not populated with the entries of the
            page dictionary.
        pages: Page tree node to process. When ``None``, processing starts
            at the ``/Pages`` entry of the catalog and ``flattened_pages``
            is reset to an empty list.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is not modified.
        indirect_reference: Used recursively to flatten the /Pages object.
            It is handed to the ``PageObject`` created for a leaf node.

    Raises:
        PdfReadError: If the catalog's ``/Pages`` entry is not a dictionary,
            or if the recursion limit is hit while walking the page tree.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Attributes are inherited from ancestors only. Work on a private
        # copy so that values defined on this node (or on any node below
        # it) cannot leak into sibling subtrees through a shared dict.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore
1197
def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove page from pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved: the reference does not point to a page, the page is not in
    the list, or the page number is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    # Resolve a reference to the page object it points to.
    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        page = resolved

    # Turn a page object into its position within the flattened list.
    if isinstance(page, int):
        index = page
    else:
        try:
            index = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return

    if index < 0 or index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # Remember the reference before the entry disappears from the list.
    reference = self.pages[index].indirect_reference
    del self.pages[index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())
1241
def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper to resolve an indirect object by its numbers.

    Shorthand for ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()
1257
def decode_permissions(
    self, permissions_code: int
) -> dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permission code into named access flags.

    Deprecated: a deprecation notice pointing to
    ``user_access_permissions`` is issued on every call.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    named_flags = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    # A permission is granted when its bit is set in the code.
    allowed: dict[str, bool] = {}
    for key, flag in named_flags:
        allowed[key] = (permissions_code & flag) != 0
    return allowed
1284
@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; ``None`` if the document is not encrypted."""
    return (
        None
        if self._encryption is None
        else UserAccessPermissions(self._encryption.P)
    )
1291
@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.

    This is an abstract property: the body here is empty and the value is
    supplied by the concrete subclasses.
    """
    ...  # pragma: no cover
1302
@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    Decompressed XFA packets of the document's interactive form.

    The ``/XFA`` entry of the ``/AcroForm`` dictionary is read as a
    sequence of alternating entries: a packet name followed by the packet
    itself. Only packets given as indirect references that resolve to a
    truthy object are returned; their raw data is decompressed with zlib.

    A trailing name without a packet (odd number of entries) is ignored
    instead of letting ``StopIteration`` escape from the property.

    Returns:
        ``None`` if the catalog has no ``/AcroForm`` entry or the entry is
        falsy; otherwise a dictionary mapping each packet name to its
        decompressed bytes. The dictionary is empty if there is no
        ``/XFA`` entry.
    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: dict[str, Any] = {}

    if "/XFA" in tree:
        fields = cast(ArrayObject, tree["/XFA"])
        # Pulling twice from the same iterator yields (name, packet)
        # pairs; zip stops cleanly when a packet is missing at the end.
        entries = iter(fields)
        for tag, packet in zip(entries, entries):
            if isinstance(packet, IndirectObject):
                field = cast(Optional[EncodedStreamObject], packet.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval
1326
@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """Mapping of attachment filenames to their content."""
    # Store a (loader, filename) pair per name so that the content is
    # only fetched when the corresponding key is looked up.
    lazy_entries = {}
    for name in self._list_attachments():
        lazy_entries[name] = (self._get_attachment_list, name)
    return LazyDict(lazy_entries)
1336
@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Generator over the attachment objects loaded from the document catalog."""
    embedded_files = EmbeddedFile._load(self.root_object)
    yield from embedded_files
1341
def _list_attachments(self) -> list[str]:
    """
    Collect the filenames of all file attachments.

    For every attachment its name is listed, followed by its alternative
    name when that one is non-empty and differs from the name.

    Returns:
        list of filenames

    """
    filenames = []
    for attachment in self.attachment_list:
        primary = attachment.name
        filenames.append(primary)
        alternative = attachment.alternative_name
        if alternative != primary and alternative:
            filenames.append(alternative)
    return filenames
1356
def _get_attachment_list(self, name: str) -> list[bytes]:
    """
    Return the content of the attachment ``name`` as a list.

    A single content value is wrapped into a one-element list; a list of
    versions is returned as is.
    """
    content = self._get_attachments(name)[name]
    return content if isinstance(content, list) else [content]
1362
def _get_attachments(
    self, filename: Optional[str] = None
) -> dict[str, Union[bytes, list[bytes]]]:
    """
    Retrieves all or selected file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    Every attachment is registered under its name and under its
    alternative name (when that is not ``None``). The keys appear in a
    deterministic order: attachments in document order, the name before
    the alternative name.

    Args:
        filename: If filename is None, then a dictionary of all attachments
            will be returned, where the key is the filename and the value
            is the content. Otherwise, a dictionary with just a single key
            - the filename - and its content will be returned. The
            dictionary is empty if no attachment matches.

    Returns:
        dictionary of filename -> Union[bytestring or List[ByteString]]
        If the filename exists multiple times a list of the different versions will be provided.

    """
    attachments: dict[str, Union[bytes, list[bytes]]] = {}
    for entry in self.attachment_list:
        primary_name = entry.name
        alternative_name = entry.alternative_name
        if filename is None:
            candidates = (primary_name, alternative_name)
        elif filename in (primary_name, alternative_name):
            candidates = (
                primary_name if filename == primary_name else alternative_name,
            )
        else:
            continue

        # dict.fromkeys removes a duplicate alternative name while keeping
        # the order stable (a set would make the key order of the result
        # depend on string hashing).
        names = [name for name in dict.fromkeys(candidates) if name is not None]
        if not names:
            continue

        # Read the content once per attachment, not once per name.
        content = entry.content
        for name in names:
            if name not in attachments:
                attachments[name] = content
                continue
            existing = attachments[name]
            if isinstance(existing, list):
                existing.append(content)
            else:
                # Second occurrence of the name: switch to a list of versions.
                attachments[name] = [existing, content]
    return attachments
1404
@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    This method returns a dictionary that maps a mime-type to its
    representation.

    This is an abstract method: the body here is empty and the behavior is
    supplied by the concrete subclasses.

    Args:
        include: Optional iterable of strings; presumably the mime-types to
            include, following the IPython protocol linked below - confirm
            against the concrete implementations.
        exclude: Optional iterable of strings; presumably the mime-types to
            leave out - confirm against the concrete implementations.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    ...  # pragma: no cover
1422
1423
class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed when they are looked up.

    The constructor takes the same arguments as ``dict``. Each stored value
    has to be a ``(callable, argument)`` pair; looking up a key calls
    ``callable(argument)`` and returns the result. Nothing is cached, so
    the callable runs on every lookup.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        # A missing key raises KeyError before anything gets called.
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict.keys())

    def __len__(self) -> int:
        size = len(self._raw_dict)
        return size

    def __str__(self) -> str:
        # Only the keys are shown; printing must not trigger any loader.
        keys = list(self.keys())
        return f"LazyDict(keys={keys})"