1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3# Copyright (c) 2024, Pubpub-ZZ
4#
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions are
9# met:
10#
11# * Redistributions of source code must retain the above copyright notice,
12# this list of conditions and the following disclaimer.
13# * Redistributions in binary form must reproduce the above copyright notice,
14# this list of conditions and the following disclaimer in the documentation
15# and/or other materials provided with the distribution.
16# * The name of the author may not be used to endorse or promote products
17# derived from this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29# POSSIBILITY OF SUCH DAMAGE.
30
31import struct
32import zlib
33from abc import abstractmethod
34from collections.abc import Generator, Iterable, Iterator, Mapping
35from datetime import datetime
36from typing import (
37 Any,
38 Optional,
39 Union,
40 cast,
41)
42
43from ._encryption import Encryption
44from ._page import PageObject, _VirtualList
45from ._page_labels import index2label as page_index2page_label
46from ._utils import (
47 deprecation_with_replacement,
48 logger_warning,
49 parse_iso8824_date,
50)
51from .constants import CatalogAttributes as CA
52from .constants import CatalogDictionary as CD
53from .constants import (
54 CheckboxRadioButtonAttributes,
55 GoToActionArguments,
56 PagesAttributes,
57 UserAccessPermissions,
58)
59from .constants import Core as CO
60from .constants import DocumentInformationAttributes as DI
61from .constants import FieldDictionaryAttributes as FA
62from .constants import PageAttributes as PG
63from .errors import PdfReadError, PyPdfError
64from .generic import (
65 ArrayObject,
66 BooleanObject,
67 ByteStringObject,
68 Destination,
69 DictionaryObject,
70 EncodedStreamObject,
71 Field,
72 Fit,
73 FloatObject,
74 IndirectObject,
75 NameObject,
76 NullObject,
77 NumberObject,
78 PdfObject,
79 TextStringObject,
80 TreeObject,
81 ViewerPreferences,
82 create_string_object,
83 is_null_or_none,
84)
85from .generic._files import EmbeddedFile
86from .types import OutlineType, PagemodeType
87from .xmp import XmpInformation
88
89
def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode the trailing bytes of ``d`` as a big-endian signed 64-bit integer.

    The data is left-padded with eight zero bytes and only the last eight
    bytes of the result are decoded.

    Args:
        d: The raw bytes to decode.
        size: The declared width of the value in bytes; at most 8.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.
    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Pad first, then keep exactly the eight low-order bytes.
    window = (bytes(8) + d)[-8:]
    (value,) = struct.unpack(">q", window)
    return value
96
97
class DocumentInformation(DictionaryObject):
    """
    Dictionary holding the basic document metadata of a PDF file.

    Instances are reachable through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Each entry is exposed through a pair of properties, for example
    ``author`` and ``author_raw``. The plain property goes through
    :meth:`_get_text` and therefore yields text (or ``None``), which suits
    display purposes. The ``*_raw`` property hands back the stored entry
    untouched, which may be a ``ByteStringObject`` when the string could not
    be decoded; callers of the raw variants have to cope with that.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Look up ``key`` and return its value as text.

        A ``TextStringObject`` is returned as is, a ``ByteStringObject`` is
        converted with ``str()``; anything else (including a missing entry)
        gives ``None``.
        """
        value = self.get(key, None)
        if isinstance(value, ByteStringObject):
            return str(value)
        return value if isinstance(value, TextStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns ``None`` when the entry is missing or falsy. Otherwise the
        text form of the entry is returned; if no non-empty text form is
        available, the resolved object stored under the key is returned.
        """
        if not self.get(DI.TITLE):
            return None
        text = self._get_text(DI.TITLE)
        if text:
            return text
        return self.get(DI.TITLE).get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Stored title entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        Returns the author as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Stored author entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        Returns the subject as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Stored subject entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For documents converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        Returns the creator as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Stored creator entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For documents converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the producer as text, or ``None`` if it is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Stored producer entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the document's creation date, parsed."""
        date_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(date_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Stored creation date entry, unprocessed; may be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date, parsed.

        This is the date and time the document was most recently modified.
        """
        date_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(date_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Stored modification date entry, unprocessed; may be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        Returns the keywords as text, or ``None`` if they are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Stored keywords entry, unprocessed; may be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)
256
257
258class PdfDocCommon:
259 """
260 Common functions from PdfWriter and PdfReader objects.
261
262 This root class is strongly abstracted.
263 """
264
265 strict: bool = False # default
266
267 flattened_pages: Optional[list[PageObject]] = None
268
269 _encryption: Optional[Encryption] = None
270
271 _readonly: bool = False
272
    @property
    @abstractmethod
    def root_object(self) -> DictionaryObject:
        """
        Root dictionary of the document.

        Other members of this class read it as the document catalog
        (e.g. ``/Pages``, ``/Names``, ``/OpenAction``). Must be provided by
        concrete subclasses.
        """
        ...  # pragma: no cover
277
    @property
    @abstractmethod
    def pdf_header(self) -> str:
        """
        Header of the PDF document as a string.

        Must be provided by concrete subclasses.
        NOTE(review): the exact content/format is not visible here; confirm
        against the subclass implementations.
        """
        ...  # pragma: no cover
282
    @abstractmethod
    def get_object(
        self, indirect_reference: Union[int, IndirectObject]
    ) -> Optional[PdfObject]:
        """
        Look up an object of the document.

        Must be provided by concrete subclasses.

        Args:
            indirect_reference: Either an integer or an ``IndirectObject``
                designating the object (presumably the integer is an object
                number; confirm in the subclasses).

        Returns:
            The designated object, or ``None``.
        """
        ...  # pragma: no cover
288
    @abstractmethod
    def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
        """
        Replace the object referenced by ``indirect`` with ``obj``.

        Must be provided by concrete subclasses.
        NOTE(review): the meaning of the returned object is not visible
        here; confirm against the subclass implementations.
        """
        ...  # pragma: no cover
292
    @property
    @abstractmethod
    def _info(self) -> Optional[DictionaryObject]:
        """
        Document information dictionary, or ``None`` when there is none.

        Consumed by :attr:`metadata`. Must be provided by concrete
        subclasses.
        """
        ...  # pragma: no cover
297
298 @property
299 def metadata(self) -> Optional[DocumentInformation]:
300 """
301 Retrieve the PDF file's document information dictionary, if it exists.
302
303 Note that some PDF files use metadata streams instead of document
304 information dictionaries, and these metadata streams will not be
305 accessed by this function.
306 """
307 retval = DocumentInformation()
308 if self._info is None:
309 return None
310 retval.update(self._info)
311 return retval
312
    @property
    def xmp_metadata(self) -> Optional[XmpInformation]:
        """
        XMP metadata of the document.

        The body here is empty, so this base implementation evaluates to
        ``None``.
        NOTE(review): presumably overridden by the concrete reader/writer
        classes; confirm.
        """
        ...  # pragma: no cover
316
317 @property
318 def viewer_preferences(self) -> Optional[ViewerPreferences]:
319 """Returns the existing ViewerPreferences as an overloaded dictionary."""
320 o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
321 if o is None:
322 return None
323 o = o.get_object()
324 if not isinstance(o, ViewerPreferences):
325 o = ViewerPreferences(o)
326 if hasattr(o, "indirect_reference") and o.indirect_reference is not None:
327 self._replace_object(o.indirect_reference, o)
328 else:
329 self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
330 return o
331
332 def get_num_pages(self) -> int:
333 """
334 Calculate the number of pages in this PDF file.
335
336 Returns:
337 The number of pages of the parsed PDF file.
338
339 Raises:
340 PdfReadError: If restrictions prevent this action.
341
342 """
343 # Flattened pages will not work on an encrypted PDF;
344 # the PDF file's page count is used in this case. Otherwise,
345 # the original method (flattened page count) is used.
346 if self.is_encrypted:
347 return self.root_object["/Pages"]["/Count"] # type: ignore
348 if self.flattened_pages is None:
349 self._flatten(self._readonly)
350 assert self.flattened_pages is not None
351 return len(self.flattened_pages)
352
353 def get_page(self, page_number: int) -> PageObject:
354 """
355 Retrieve a page by number from this PDF file.
356 Most of the time ``.pages[page_number]`` is preferred.
357
358 Args:
359 page_number: The page number to retrieve
360 (pages begin at zero)
361
362 Returns:
363 A :class:`PageObject<pypdf._page.PageObject>` instance.
364
365 """
366 if self.flattened_pages is None:
367 self._flatten(self._readonly)
368 assert self.flattened_pages is not None, "hint for mypy"
369 return self.flattened_pages[page_number]
370
    def _get_page_in_node(
        self,
        page_number: int,
    ) -> tuple[DictionaryObject, int]:
        """
        Retrieve the node and position within the /Kids containing the page.
        If page_number is greater than the number of pages, it returns the top node, -1.

        Args:
            page_number: Zero-based number of the page to locate.

        Returns:
            A tuple ``(node, index)``: the page-tree node whose ``/Kids``
            holds the page, and the position of the page in that array.

        Raises:
            PyPdfError: If the kids of a node were exhausted without finding
                the page although the counts said it should be there.
        """
        top = cast(DictionaryObject, self.root_object["/Pages"])

        def recursive_call(
            node: DictionaryObject, mi: int
        ) -> tuple[Optional[PdfObject], int]:
            # ``mi`` is the number of the first page covered by ``node``.
            # Result is either (None, number of the first page after this
            # subtree) when the page is not here, (node, -1) when ``node``
            # itself is the page (or the top node for an out-of-range
            # number), or (parent, idx) once the containing /Kids is known.
            ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
            if node["/Type"] == "/Page":
                if page_number == mi:
                    return node, -1
                return None, mi + 1
            if (page_number - mi) >= ma:  # not in nodes below
                if node == top:
                    return top, -1
                return None, mi + ma
            for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
                kid = cast(DictionaryObject, kid.get_object())
                n, i = recursive_call(kid, mi)
                if n is not None:  # page has just been found ...
                    if i < 0:  # ... just below!
                        return node, idx
                    # ... at lower levels
                    return n, i
                # not found in this kid: continue counting after its pages
                mi = i
            raise PyPdfError("Unexpectedly cannot find the node.")

        node, idx = recursive_call(top, 0)
        assert isinstance(node, DictionaryObject), "mypy"
        return node, idx
407
408 @property
409 def named_destinations(self) -> dict[str, Destination]:
410 """A read-only dictionary which maps names to destinations."""
411 return self._get_named_destinations()
412
413 def get_named_dest_root(self) -> ArrayObject:
414 named_dest = ArrayObject()
415 if CA.NAMES in self.root_object and isinstance(
416 self.root_object[CA.NAMES], DictionaryObject
417 ):
418 names = cast(DictionaryObject, self.root_object[CA.NAMES])
419 if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
420 # §3.6.3 Name Dictionary (PDF spec 1.7)
421 dests = cast(DictionaryObject, names[CA.DESTS])
422 dests_ref = dests.indirect_reference
423 if CA.NAMES in dests:
424 # §7.9.6, entries in a name tree node dictionary
425 named_dest = cast(ArrayObject, dests[CA.NAMES])
426 else:
427 named_dest = ArrayObject()
428 dests[NameObject(CA.NAMES)] = named_dest
429 elif hasattr(self, "_add_object"):
430 dests = DictionaryObject()
431 dests_ref = self._add_object(dests)
432 names[NameObject(CA.DESTS)] = dests_ref
433 dests[NameObject(CA.NAMES)] = named_dest
434
435 elif hasattr(self, "_add_object"):
436 names = DictionaryObject()
437 names_ref = self._add_object(names)
438 self.root_object[NameObject(CA.NAMES)] = names_ref
439 dests = DictionaryObject()
440 dests_ref = self._add_object(dests)
441 names[NameObject(CA.DESTS)] = dests_ref
442 dests[NameObject(CA.NAMES)] = named_dest
443
444 return named_dest
445
446 ## common
    def _get_named_destinations(
        self,
        tree: Union[TreeObject, None] = None,
        retval: Optional[dict[str, Destination]] = None,
    ) -> dict[str, Destination]:
        """
        Retrieve the named destinations present in the document.

        On the initial call (``retval`` is ``None``) the tree is taken from
        the catalog: ``/Dests`` directly in the catalog if present, otherwise
        ``/Dests`` inside the catalog's ``/Names`` dictionary. Nested calls
        pass the sub-tree and the shared result dictionary explicitly.

        Args:
            tree: The current tree.
            retval: The previously retrieved destinations for nested calls.

        Returns:
            A dictionary which maps names to destinations.

        """
        if retval is None:
            retval = {}
            catalog = self.root_object

            # get the name tree
            if CA.DESTS in catalog:
                tree = cast(TreeObject, catalog[CA.DESTS])
            elif CA.NAMES in catalog:
                names = cast(DictionaryObject, catalog[CA.NAMES])
                if CA.DESTS in names:
                    tree = cast(TreeObject, names[CA.DESTS])

        if is_null_or_none(tree):
            return retval
        assert tree is not None, "mypy"

        if PagesAttributes.KIDS in tree:
            # recurse down the tree
            for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
                self._get_named_destinations(kid.get_object(), retval)
        # §7.9.6, entries in a name tree node dictionary
        elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
            # Indexed by position below: entries are consumed pairwise as
            # key followed by value.
            names = cast(DictionaryObject, tree[CA.NAMES])
            i = 0
            while i < len(names):
                key = names[i].get_object()
                i += 1
                if not isinstance(key, (bytes, str)):
                    # not a usable key: skip this single entry
                    continue
                try:
                    value = names[i].get_object()
                except IndexError:
                    # dangling key without a value: stop
                    break
                i += 1
                if isinstance(value, DictionaryObject):
                    # a dictionary value carries the destination under /D
                    if "/D" in value:
                        value = value["/D"]
                    else:
                        continue
                dest = self._build_destination(key, value)
                if dest is not None:
                    retval[cast(str, dest["/Title"])] = dest
                    # Remain backwards-compatible.
                    retval[str(key)] = dest
        else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
            for k__, v__ in tree.items():
                val = v__.get_object()
                if isinstance(val, DictionaryObject):
                    if "/D" in val:
                        val = val["/D"].get_object()
                    else:
                        continue
                dest = self._build_destination(k__, val)
                if dest is not None:
                    retval[k__] = dest
        return retval
519
520 # A select group of relevant field attributes. For the complete list,
521 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.
522
523 def get_fields(
524 self,
525 tree: Optional[TreeObject] = None,
526 retval: Optional[dict[Any, Any]] = None,
527 fileobj: Optional[Any] = None,
528 stack: Optional[list[PdfObject]] = None,
529 ) -> Optional[dict[str, Any]]:
530 """
531 Extract field data if this PDF contains interactive form fields.
532
533 The *tree*, *retval*, *stack* parameters are for recursive use.
534
535 Args:
536 tree: Current object to parse.
537 retval: In-progress list of fields.
538 fileobj: A file object (usually a text file) to write
539 a report to on all interactive form fields found.
540 stack: List of already parsed objects.
541
542 Returns:
543 A dictionary where each key is a field name, and each
544 value is a :class:`Field<pypdf.generic.Field>` object. By
545 default, the mapping name is used for keys.
546 ``None`` if form data could not be located.
547
548 """
549 field_attributes = FA.attributes_dict()
550 field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
551 if retval is None:
552 retval = {}
553 catalog = self.root_object
554 stack = []
555 # get the AcroForm tree
556 if CD.ACRO_FORM in catalog:
557 tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
558 else:
559 return None
560 if tree is None:
561 return retval
562 assert stack is not None
563 if "/Fields" in tree:
564 fields = cast(ArrayObject, tree["/Fields"])
565 for f in fields:
566 field = f.get_object()
567 self._build_field(field, retval, fileobj, field_attributes, stack)
568 elif any(attr in tree for attr in field_attributes):
569 # Tree is a field
570 self._build_field(tree, retval, fileobj, field_attributes, stack)
571 return retval
572
573 def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
574 if "/TM" in parent:
575 return cast(str, parent["/TM"])
576 if "/Parent" in parent:
577 return (
578 self._get_qualified_field_name(
579 cast(DictionaryObject, parent["/Parent"])
580 )
581 + "."
582 + cast(str, parent.get("/T", ""))
583 )
584 return cast(str, parent.get("/T", ""))
585
586 def _build_field(
587 self,
588 field: Union[TreeObject, DictionaryObject],
589 retval: dict[Any, Any],
590 fileobj: Any,
591 field_attributes: Any,
592 stack: list[PdfObject],
593 ) -> None:
594 if all(attr not in field for attr in ("/T", "/TM")):
595 return
596 key = self._get_qualified_field_name(field)
597 if fileobj:
598 self._write_field(fileobj, field, field_attributes)
599 fileobj.write("\n")
600 retval[key] = Field(field)
601 obj = retval[key].indirect_reference.get_object() # to get the full object
602 if obj.get(FA.FT, "") == "/Ch":
603 retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
604 if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
605 # Checkbox
606 retval[key][NameObject("/_States_")] = ArrayObject(
607 list(obj["/AP"]["/N"].keys())
608 )
609 if "/Off" not in retval[key]["/_States_"]:
610 retval[key][NameObject("/_States_")].append(NameObject("/Off"))
611 elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
612 states: list[str] = []
613 retval[key][NameObject("/_States_")] = ArrayObject(states)
614 for k in obj.get(FA.Kids, {}):
615 k = k.get_object()
616 for s in list(k["/AP"]["/N"].keys()):
617 if s not in states:
618 states.append(s)
619 retval[key][NameObject("/_States_")] = ArrayObject(states)
620 if (
621 obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
622 and "/Off" in retval[key]["/_States_"]
623 ):
624 del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
625 # at last for order
626 self._check_kids(field, retval, fileobj, stack)
627
628 def _check_kids(
629 self,
630 tree: Union[TreeObject, DictionaryObject],
631 retval: Any,
632 fileobj: Any,
633 stack: list[PdfObject],
634 ) -> None:
635 if tree in stack:
636 logger_warning(
637 f"{self._get_qualified_field_name(tree)} already parsed", __name__
638 )
639 return
640 stack.append(tree)
641 if PagesAttributes.KIDS in tree:
642 # recurse down the tree
643 for kid in tree[PagesAttributes.KIDS]: # type: ignore
644 kid = kid.get_object()
645 self.get_fields(kid, retval, fileobj, stack)
646
647 def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
648 field_attributes_tuple = FA.attributes()
649 field_attributes_tuple = (
650 field_attributes_tuple + CheckboxRadioButtonAttributes.attributes()
651 )
652
653 for attr in field_attributes_tuple:
654 if attr in (
655 FA.Kids,
656 FA.AA,
657 ):
658 continue
659 attr_name = field_attributes[attr]
660 try:
661 if attr == FA.FT:
662 # Make the field type value clearer
663 types = {
664 "/Btn": "Button",
665 "/Tx": "Text",
666 "/Ch": "Choice",
667 "/Sig": "Signature",
668 }
669 if field[attr] in types:
670 fileobj.write(f"{attr_name}: {types[field[attr]]}\n")
671 elif attr == FA.Parent:
672 # Let's just write the name of the parent
673 try:
674 name = field[attr][FA.TM]
675 except KeyError:
676 name = field[attr][FA.T]
677 fileobj.write(f"{attr_name}: {name}\n")
678 else:
679 fileobj.write(f"{attr_name}: {field[attr]}\n")
680 except KeyError:
681 # Field attribute is N/A or unknown, so don't write anything
682 pass
683
684 def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
685 """
686 Retrieve form fields from the document with textual data.
687
688 Args:
689 full_qualified_name: to get full name
690
691 Returns:
692 A dictionary. The key is the name of the form field,
693 the value is the content of the field.
694
695 If the document contains multiple form fields with the same name, the
696 second and following will get the suffix .2, .3, ...
697
698 """
699
700 def indexed_key(k: str, fields: dict[Any, Any]) -> str:
701 if k not in fields:
702 return k
703 return (
704 k
705 + "."
706 + str(sum(1 for kk in fields if kk.startswith(k + ".")) + 2)
707 )
708
709 # Retrieve document form fields
710 formfields = self.get_fields()
711 if formfields is None:
712 return {}
713 ff = {}
714 for field, value in formfields.items():
715 if value.get("/FT") == "/Tx":
716 if full_qualified_name:
717 ff[field] = value.get("/V")
718 else:
719 ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")
720 return ff
721
722 def get_pages_showing_field(
723 self, field: Union[Field, PdfObject, IndirectObject]
724 ) -> list[PageObject]:
725 """
726 Provides list of pages where the field is called.
727
728 Args:
729 field: Field Object, PdfObject or IndirectObject referencing a Field
730
731 Returns:
732 List of pages:
733 - Empty list:
734 The field has no widgets attached
735 (either hidden field or ancestor field).
736 - Single page list:
737 Page where the widget is present
738 (most common).
739 - Multi-page list:
740 Field with multiple kids widgets
741 (example: radio buttons, field repeated on multiple pages).
742
743 """
744
745 def _get_inherited(obj: DictionaryObject, key: str) -> Any:
746 if key in obj:
747 return obj[key]
748 if "/Parent" in obj:
749 return _get_inherited(
750 cast(DictionaryObject, obj["/Parent"].get_object()), key
751 )
752 return None
753
754 try:
755 # to cope with all types
756 field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore
757 except Exception as exc:
758 raise ValueError("Field type is invalid") from exc
759 if is_null_or_none(_get_inherited(field, "/FT")):
760 raise ValueError("Field is not valid")
761 ret = []
762 if field.get("/Subtype", "") == "/Widget":
763 if "/P" in field:
764 ret = [field["/P"].get_object()]
765 else:
766 ret = [
767 p
768 for p in self.pages
769 if field.indirect_reference in p.get("/Annots", "")
770 ]
771 else:
772 kids = field.get("/Kids", ())
773 for k in kids:
774 k = k.get_object()
775 if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
776 # Kid that is just a widget, not a field:
777 if "/P" in k:
778 ret += [k["/P"].get_object()]
779 else:
780 ret += [
781 p
782 for p in self.pages
783 if k.indirect_reference in p.get("/Annots", "")
784 ]
785 return [
786 x
787 if isinstance(x, PageObject)
788 else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)]) # type: ignore
789 for x in ret
790 ]
791
792 @property
793 def open_destination(
794 self,
795 ) -> Union[None, Destination, TextStringObject, ByteStringObject]:
796 """
797 Property to access the opening destination (``/OpenAction`` entry in
798 the PDF catalog). It returns ``None`` if the entry does not exist
799 or is not set.
800
801 Raises:
802 Exception: If a destination is invalid.
803
804 """
805 if "/OpenAction" not in self.root_object:
806 return None
807 oa: Any = self.root_object["/OpenAction"]
808 if isinstance(oa, bytes): # pragma: no cover
809 oa = oa.decode()
810 if isinstance(oa, str):
811 return create_string_object(oa)
812 if isinstance(oa, ArrayObject):
813 try:
814 page, typ, *array = oa
815 fit = Fit(typ, tuple(array))
816 return Destination("OpenAction", page, fit)
817 except Exception as exc:
818 raise Exception(f"Invalid Destination {oa}: {exc}")
819 else:
820 return None
821
    @open_destination.setter
    def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
        """
        Setting the opening destination is not supported by this class.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("No setter for open_destination")
825
826 @property
827 def outline(self) -> OutlineType:
828 """
829 Read-only property for the outline present in the document
830 (i.e., a collection of 'outline items' which are also known as
831 'bookmarks').
832 """
833 return self._get_outline()
834
    def _get_outline(
        self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
    ) -> OutlineType:
        """
        Collect the outline items reachable from ``node`` into a nested list.

        On the initial call (``outline`` is ``None``) the first item is taken
        from ``/First`` of the catalog's outline dictionary and the named
        destinations are cached in ``self._named_destinations``. Siblings are
        followed through ``/Next``; the children of an item (its ``/First``)
        are collected recursively into a sub-list that is appended right
        after the item.

        Args:
            node: First outline item of the level to process.
            outline: List to append to; used by the recursive calls.

        Returns:
            The (possibly nested) list of outline items.
        """
        if outline is None:
            outline = []
            catalog = self.root_object

            # get the outline dictionary and named destinations
            if CO.OUTLINES in catalog:
                lines = cast(DictionaryObject, catalog[CO.OUTLINES])

                if isinstance(lines, NullObject):
                    return outline

                # §12.3.3 Document outline, entries in the outline dictionary
                if not is_null_or_none(lines) and "/First" in lines:
                    node = cast(DictionaryObject, lines["/First"])
            # cached for _build_outline_item, which resolves named
            # destinations through this attribute
            self._named_destinations = self._get_named_destinations()

        if node is None:
            return outline

        # see if there are any more outline items
        while True:
            outline_obj = self._build_outline_item(node)
            if outline_obj:
                outline.append(outline_obj)

            # check for sub-outline
            if "/First" in node:
                sub_outline: list[Any] = []
                self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline)
                if sub_outline:
                    outline.append(sub_outline)

            if "/Next" not in node:
                break
            node = cast(DictionaryObject, node["/Next"])

        return outline
875
876 @property
877 def threads(self) -> Optional[ArrayObject]:
878 """
879 Read-only property for the list of threads.
880
881 See §12.4.3 from the PDF 1.7 or 2.0 specification.
882
883 It is an array of dictionaries with "/F" (the first bead in the thread)
884 and "/I" (a thread information dictionary containing information about
885 the thread, such as its title, author, and creation date) properties or
886 None if there are no articles.
887
888 Since PDF 2.0 it can also contain an indirect reference to a metadata
889 stream containing information about the thread, such as its title,
890 author, and creation date.
891 """
892 catalog = self.root_object
893 if CO.THREADS in catalog:
894 return cast("ArrayObject", catalog[CO.THREADS])
895 return None
896
    @abstractmethod
    def _get_page_number_by_indirect(
        self, indirect_reference: Union[None, int, NullObject, IndirectObject]
    ) -> Optional[int]:
        """
        Return the number of the page designated by ``indirect_reference``.

        Must be provided by concrete subclasses. Callers in this class
        document a ``None`` result as "page not found".
        """
        ...  # pragma: no cover
902
903 def get_page_number(self, page: PageObject) -> Optional[int]:
904 """
905 Retrieve page number of a given PageObject.
906
907 Args:
908 page: The page to get page number. Should be
909 an instance of :class:`PageObject<pypdf._page.PageObject>`
910
911 Returns:
912 The page number or None if page is not found
913
914 """
915 return self._get_page_number_by_indirect(page.indirect_reference)
916
917 def get_destination_page_number(self, destination: Destination) -> Optional[int]:
918 """
919 Retrieve page number of a given Destination object.
920
921 Args:
922 destination: The destination to get page number.
923
924 Returns:
925 The page number or None if page is not found
926
927 """
928 return self._get_page_number_by_indirect(destination.page)
929
930 def _build_destination(
931 self,
932 title: Union[str, bytes],
933 array: Optional[
934 list[
935 Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
936 ]
937 ],
938 ) -> Destination:
939 page, typ = None, None
940 # handle outline items with missing or invalid destination
941 if (
942 isinstance(array, (NullObject, str))
943 or (isinstance(array, ArrayObject) and len(array) == 0)
944 or array is None
945 ):
946 page = NullObject()
947 return Destination(title, page, Fit.fit())
948 page, typ, *array = array # type: ignore
949 try:
950 return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore
951 except PdfReadError:
952 logger_warning(f"Unknown destination: {title!r} {array}", __name__)
953 if self.strict:
954 raise
955 # create a link to first Page
956 tmp = self.pages[0].indirect_reference
957 indirect_reference = NullObject() if tmp is None else tmp
958 return Destination(title, indirect_reference, Fit.fit())
959
    def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
        """
        Convert an outline item dictionary into a ``Destination``.

        The target is taken from a ``/GoTo`` action (``/A``) or from
        ``/Dest``. Named destinations are resolved through
        ``self._named_destinations``. Color (``/C``), style (``/F``) and
        ``/Count`` are copied from the node, and a synthetic ``/%is_open%``
        flag is added.

        Args:
            node: The outline item dictionary.

        Returns:
            The destination built for the outline item.

        Raises:
            PdfReadError: In strict mode, when ``/Title`` is missing, a GoTo
                action has no ``/D``, or the destination has an unexpected
                type.
        """
        dest, title, outline_item = None, None, None

        # title required for valid outline
        # §12.3.3, entries in an outline item dictionary
        try:
            title = cast("str", node["/Title"])
        except KeyError:
            if self.strict:
                raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
            title = ""

        if "/A" in node:
            # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
            action = cast(DictionaryObject, node["/A"])
            action_type = cast(NameObject, action[GoToActionArguments.S])
            if action_type == "/GoTo":
                if GoToActionArguments.D in action:
                    dest = action[GoToActionArguments.D]
                elif self.strict:
                    raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
        elif "/Dest" in node:
            # Destination, PDF 1.7 and PDF 2.0 §12.3.2
            dest = node["/Dest"]
            # if array was referenced in another object, will be a dict w/ key "/D"
            if isinstance(dest, DictionaryObject) and "/D" in dest:
                dest = dest["/D"]

        if isinstance(dest, ArrayObject):
            outline_item = self._build_destination(title, dest)
        elif isinstance(dest, str):
            # named destination, addresses NameObject Issue #193
            # TODO: Keep named destination instead of replacing it?
            try:
                outline_item = self._build_destination(
                    title, self._named_destinations[dest].dest_array
                )
            except KeyError:
                # named destination not found in Name Dict
                outline_item = self._build_destination(title, None)
        elif dest is None:
            # outline item not required to have destination or action
            # PDFv1.7 Table 153
            outline_item = self._build_destination(title, dest)
        else:
            if self.strict:
                raise PdfReadError(f"Unexpected destination {dest!r}")
            logger_warning(
                f"Removed unexpected destination {dest!r} from destination",
                __name__,
            )
            outline_item = self._build_destination(title, None)

        # if outline item created, add color, format, and child count if present
        if outline_item:
            if "/C" in node:
                # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
                outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
            if "/F" in node:
                # specifies style characteristics bold and/or italic
                # with 1=italic, 2=bold, 3=both
                outline_item[NameObject("/F")] = node["/F"]
            if "/Count" in node:
                # absolute value = num. visible children
                # with positive = open/unfolded, negative = closed/folded
                outline_item[NameObject("/Count")] = node["/Count"]
            # if count is 0 we will consider it as open (to have available is_open)
            outline_item[NameObject("/%is_open%")] = BooleanObject(
                node.get("/Count", 0) >= 0
            )
        # keep a link back to the source dictionary and its reference
        outline_item.node = node
        try:
            outline_item.indirect_reference = node.indirect_reference
        except AttributeError:
            pass
        return outline_item
1036
1037 @property
1038 def pages(self) -> list[PageObject]:
1039 """
1040 Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`.
1041 This property allows to get a page or a range of pages.
1042
1043 Note:
1044 For PdfWriter only: Provides the capability to remove a page/range of
1045 page from the list (using the del operator). Remember: Only the page
1046 entry is removed, as the objects beneath can be used elsewhere. A
1047 solution to completely remove them - if they are not used anywhere - is
1048 to write to a buffer/temporary file and then load it into a new
1049 PdfWriter.
1050
1051 """
1052 return _VirtualList(self.get_num_pages, self.get_page) # type: ignore
1053
1054 @property
1055 def page_labels(self) -> list[str]:
1056 """
1057 A list of labels for the pages in this document.
1058
1059 This property is read-only. The labels are in the order that the pages
1060 appear in the document.
1061 """
1062 return [page_index2page_label(self, i) for i in range(len(self.pages))]
1063
1064 @property
1065 def page_layout(self) -> Optional[str]:
1066 """
1067 Get the page layout currently being used.
1068
1069 .. list-table:: Valid ``layout`` values
1070 :widths: 50 200
1071
1072 * - /NoLayout
1073 - Layout explicitly not specified
1074 * - /SinglePage
1075 - Show one page at a time
1076 * - /OneColumn
1077 - Show one column at a time
1078 * - /TwoColumnLeft
1079 - Show pages in two columns, odd-numbered pages on the left
1080 * - /TwoColumnRight
1081 - Show pages in two columns, odd-numbered pages on the right
1082 * - /TwoPageLeft
1083 - Show two pages at a time, odd-numbered pages on the left
1084 * - /TwoPageRight
1085 - Show two pages at a time, odd-numbered pages on the right
1086 """
1087 try:
1088 return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
1089 except KeyError:
1090 return None
1091
1092 @property
1093 def page_mode(self) -> Optional[PagemodeType]:
1094 """
1095 Get the page mode currently being used.
1096
1097 .. list-table:: Valid ``mode`` values
1098 :widths: 50 200
1099
1100 * - /UseNone
1101 - Do not show outline or thumbnails panels
1102 * - /UseOutlines
1103 - Show outline (aka bookmarks) panel
1104 * - /UseThumbs
1105 - Show page thumbnails panel
1106 * - /FullScreen
1107 - Fullscreen view
1108 * - /UseOC
1109 - Show Optional Content Group (OCG) panel
1110 * - /UseAttachments
1111 - Show attachments panel
1112 """
1113 try:
1114 return self.root_object["/PageMode"] # type: ignore
1115 except KeyError:
1116 return None
1117
1118 def _flatten(
1119 self,
1120 list_only: bool = False,
1121 pages: Union[None, DictionaryObject, PageObject] = None,
1122 inherit: Optional[dict[str, Any]] = None,
1123 indirect_reference: Optional[IndirectObject] = None,
1124 ) -> None:
1125 """
1126 Process the document pages to ease searching.
1127
1128 Attributes of a page may inherit from ancestor nodes
1129 in the page tree. Flattening means moving
1130 any inheritance data into descendant nodes,
1131 effectively removing the inheritance dependency.
1132
1133 Note: It is distinct from another use of "flattening" applied to PDFs.
1134 Flattening a PDF also means combining all the contents into one single layer
1135 and making the file less editable.
1136
1137 Args:
1138 list_only: Will only list the pages within _flatten_pages.
1139 pages:
1140 inherit:
1141 indirect_reference: Used recursively to flatten the /Pages object.
1142
1143 """
1144 inheritable_page_attributes = (
1145 NameObject(PG.RESOURCES),
1146 NameObject(PG.MEDIABOX),
1147 NameObject(PG.CROPBOX),
1148 NameObject(PG.ROTATE),
1149 )
1150 if inherit is None:
1151 inherit = {}
1152 if pages is None:
1153 # Fix issue 327: set flattened_pages attribute only for
1154 # decrypted file
1155 catalog = self.root_object
1156 pages = catalog.get("/Pages").get_object() # type: ignore
1157 if not isinstance(pages, DictionaryObject):
1158 raise PdfReadError("Invalid object in /Pages")
1159 self.flattened_pages = []
1160
1161 if PagesAttributes.TYPE in pages:
1162 t = cast(str, pages[PagesAttributes.TYPE])
1163 # if the page tree node has no /Type, consider as a page if /Kids is also missing
1164 elif PagesAttributes.KIDS not in pages:
1165 t = "/Page"
1166 else:
1167 t = "/Pages"
1168
1169 if t == "/Pages":
1170 for attr in inheritable_page_attributes:
1171 if attr in pages:
1172 inherit[attr] = pages[attr]
1173 for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
1174 addt = {}
1175 if isinstance(page, IndirectObject):
1176 addt["indirect_reference"] = page
1177 obj = page.get_object()
1178 if obj:
1179 # damaged file may have invalid child in /Pages
1180 try:
1181 self._flatten(list_only, obj, inherit, **addt)
1182 except RecursionError:
1183 raise PdfReadError(
1184 "Maximum recursion depth reached during page flattening."
1185 )
1186 elif t == "/Page":
1187 for attr_in, value in inherit.items():
1188 # if the page has its own value, it does not inherit the
1189 # parent's value
1190 if attr_in not in pages:
1191 pages[attr_in] = value
1192 page_obj = PageObject(self, indirect_reference)
1193 if not list_only:
1194 page_obj.update(pages)
1195
1196 # TODO: Could flattened_pages be None at this point?
1197 self.flattened_pages.append(page_obj) # type: ignore
1198
1199 def remove_page(
1200 self,
1201 page: Union[int, PageObject, IndirectObject],
1202 clean: bool = False,
1203 ) -> None:
1204 """
1205 Remove page from pages list.
1206
1207 Args:
1208 page:
1209 * :class:`int`: Page number to be removed.
1210 * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
1211 only the first one will be removed.
1212 * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.
1213
1214 clean: replace PageObject with NullObject to prevent annotations
1215 or destinations to reference a detached page.
1216
1217 """
1218 if self.flattened_pages is None:
1219 self._flatten(self._readonly)
1220 assert self.flattened_pages is not None
1221 if isinstance(page, IndirectObject):
1222 p = page.get_object()
1223 if not isinstance(p, PageObject):
1224 logger_warning("IndirectObject is not referencing a page", __name__)
1225 return
1226 page = p
1227
1228 if not isinstance(page, int):
1229 try:
1230 page = self.flattened_pages.index(page)
1231 except ValueError:
1232 logger_warning("Cannot find page in pages", __name__)
1233 return
1234 if not (0 <= page < len(self.flattened_pages)):
1235 logger_warning("Page number is out of range", __name__)
1236 return
1237
1238 ind = self.pages[page].indirect_reference
1239 del self.pages[page]
1240 if clean and ind is not None:
1241 self._replace_object(ind, NullObject())
1242
1243 def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
1244 """
1245 Used to ease development.
1246
1247 This is equivalent to generic.IndirectObject(num,gen,self).get_object()
1248
1249 Args:
1250 num: The object number of the indirect object.
1251 gen: The generation number of the indirect object.
1252
1253 Returns:
1254 A PdfObject
1255
1256 """
1257 return IndirectObject(num, gen, self).get_object()
1258
1259 def decode_permissions(
1260 self, permissions_code: int
1261 ) -> dict[str, bool]: # pragma: no cover
1262 """Take the permissions as an integer, return the allowed access."""
1263 deprecation_with_replacement(
1264 old_name="decode_permissions",
1265 new_name="user_access_permissions",
1266 removed_in="5.0.0",
1267 )
1268
1269 permissions_mapping = {
1270 "print": UserAccessPermissions.PRINT,
1271 "modify": UserAccessPermissions.MODIFY,
1272 "copy": UserAccessPermissions.EXTRACT,
1273 "annotations": UserAccessPermissions.ADD_OR_MODIFY,
1274 "forms": UserAccessPermissions.FILL_FORM_FIELDS,
1275 # Do not fix typo, as part of official, but deprecated API.
1276 "accessability": UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS,
1277 "assemble": UserAccessPermissions.ASSEMBLE_DOC,
1278 "print_high_quality": UserAccessPermissions.PRINT_TO_REPRESENTATION,
1279 }
1280
1281 return {
1282 key: permissions_code & flag != 0
1283 for key, flag in permissions_mapping.items()
1284 }
1285
1286 @property
1287 def user_access_permissions(self) -> Optional[UserAccessPermissions]:
1288 """Get the user access permissions for encrypted documents. Returns None if not encrypted."""
1289 if self._encryption is None:
1290 return None
1291 return UserAccessPermissions(self._encryption.P)
1292
1293 @property
1294 @abstractmethod
1295 def is_encrypted(self) -> bool:
1296 """
1297 Read-only boolean property showing whether this PDF file is encrypted.
1298
1299 Note that this property, if true, will remain true even after the
1300 :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
1301 """
1302 ... # pragma: no cover
1303
1304 @property
1305 def xfa(self) -> Optional[dict[str, Any]]:
1306 tree: Optional[TreeObject] = None
1307 retval: dict[str, Any] = {}
1308 catalog = self.root_object
1309
1310 if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
1311 return None
1312
1313 tree = cast(TreeObject, catalog["/AcroForm"])
1314
1315 if "/XFA" in tree:
1316 fields = cast(ArrayObject, tree["/XFA"])
1317 i = iter(fields)
1318 for f in i:
1319 tag = f
1320 f = next(i)
1321 if isinstance(f, IndirectObject):
1322 field = cast(Optional[EncodedStreamObject], f.get_object())
1323 if field:
1324 es = zlib.decompress(field._data)
1325 retval[tag] = es
1326 return retval
1327
1328 @property
1329 def attachments(self) -> Mapping[str, list[bytes]]:
1330 """Mapping of attachment filenames to their content."""
1331 return LazyDict(
1332 {
1333 name: (self._get_attachment_list, name)
1334 for name in self._list_attachments()
1335 }
1336 )
1337
1338 @property
1339 def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
1340 """Iterable of attachment objects."""
1341 yield from EmbeddedFile._load(self.root_object)
1342
1343 def _list_attachments(self) -> list[str]:
1344 """
1345 Retrieves the list of filenames of file attachments.
1346
1347 Returns:
1348 list of filenames
1349
1350 """
1351 names = []
1352 for entry in self.attachment_list:
1353 names.append(entry.name)
1354 if (name := entry.alternative_name) != entry.name and name:
1355 names.append(name)
1356 return names
1357
1358 def _get_attachment_list(self, name: str) -> list[bytes]:
1359 out = self._get_attachments(name)[name]
1360 if isinstance(out, list):
1361 return out
1362 return [out]
1363
1364 def _get_attachments(
1365 self, filename: Optional[str] = None
1366 ) -> dict[str, Union[bytes, list[bytes]]]:
1367 """
1368 Retrieves all or selected file attachments of the PDF as a dictionary of file names
1369 and the file data as a bytestring.
1370
1371 Args:
1372 filename: If filename is None, then a dictionary of all attachments
1373 will be returned, where the key is the filename and the value
1374 is the content. Otherwise, a dictionary with just a single key
1375 - the filename - and its content will be returned.
1376
1377 Returns:
1378 dictionary of filename -> Union[bytestring or List[ByteString]]
1379 If the filename exists multiple times a list of the different versions will be provided.
1380
1381 """
1382 attachments: dict[str, Union[bytes, list[bytes]]] = {}
1383 for entry in self.attachment_list:
1384 names = set()
1385 alternative_name = entry.alternative_name
1386 if filename is not None:
1387 if filename in {entry.name, alternative_name}:
1388 name = entry.name if filename == entry.name else alternative_name
1389 names.add(name)
1390 else:
1391 continue
1392 else:
1393 names = {entry.name, alternative_name}
1394
1395 for name in names:
1396 if name is None:
1397 continue
1398 if name in attachments:
1399 if not isinstance(attachments[name], list):
1400 attachments[name] = [attachments[name]] # type:ignore
1401 attachments[name].append(entry.content) # type:ignore
1402 else:
1403 attachments[name] = entry.content
1404 return attachments
1405
1406 @abstractmethod
1407 def _repr_mimebundle_(
1408 self,
1409 include: Union[None, Iterable[str]] = None,
1410 exclude: Union[None, Iterable[str]] = None,
1411 ) -> dict[str, Any]:
1412 """
1413 Integration into Jupyter Notebooks.
1414
1415 This method returns a dictionary that maps a mime-type to its
1416 representation.
1417
1418 .. seealso::
1419
1420 https://ipython.readthedocs.io/en/stable/config/integrating.html
1421 """
1422 ... # pragma: no cover
1423
1424
class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed when they are looked up.

    Every stored value is a ``(func, arg)`` pair; subscripting a key returns
    ``func(arg)``. Results are not cached: the call is made on each lookup.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Accepts whatever the dict() constructor accepts.
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        # Only the keys are shown, so printing never triggers a loader.
        key_list = list(self)
        return f"LazyDict(keys={key_list})"