Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from collections.abc import Iterable, Iterator, Sequence
32from copy import deepcopy
33from dataclasses import asdict, dataclass
34from decimal import Decimal
35from io import BytesIO
36from pathlib import Path
37from typing import (
38 Any,
39 Callable,
40 Literal,
41 Optional,
42 Union,
43 cast,
44 overload,
45)
47from ._font import Font
48from ._protocols import PdfCommonDocProtocol
49from ._text_extraction import (
50 _layout_mode,
51)
52from ._text_extraction._text_extractor import TextExtraction
53from ._utils import (
54 CompressedTransformationMatrix,
55 TransformationMatrixType,
56 _human_readable_bytes,
57 deprecate,
58 logger_warning,
59 matrix_multiply,
60)
61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING
62from .constants import AnnotationDictionaryAttributes as ADA
63from .constants import ImageAttributes as IA
64from .constants import PageAttributes as PG
65from .constants import Resources as RES
66from .errors import PageSizeNotDefinedError, PdfReadError
67from .generic import (
68 ArrayObject,
69 ContentStream,
70 DictionaryObject,
71 EncodedStreamObject,
72 FloatObject,
73 IndirectObject,
74 NameObject,
75 NullObject,
76 NumberObject,
77 PdfObject,
78 RectangleObject,
79 StreamObject,
80 is_null_or_none,
81)
83try:
84 from PIL.Image import Image
86 pil_not_imported = False
87except ImportError:
88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10
89 pil_not_imported = True # error will be raised only when using images
91MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, converting and caching it.

    Args:
        self: The dictionary-like object (with a ``pdf`` attribute) to read from.
        name: Key of the requested box.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The box as a ``RectangleObject``. Unless the stored value already was a
        ``RectangleObject``, the converted value is written back under ``name``.

    """
    box: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box

    if is_null_or_none(box):
        # Walk the fallbacks and keep the first lookup that is not ``None``.
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break

    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)

    if isinstance(box, ArrayObject) and len(box) > 4:
        # Over-long arrays are tolerated: warn and keep the first four values.
        logger_warning(f"Expected four values, got {len(box)}: {box}", __name__)
        rectangle = RectangleObject(tuple(box[:4]))
    else:
        rectangle = RectangleObject(box)  # type: ignore

    _set_rectangle(self, name, rectangle)
    return rectangle
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` on ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value
def _delete_rectangle(self: Any, name: str) -> None:
    """
    Remove the entry stored under ``name`` from ``self``.

    Whatever exception ``self`` raises for a missing key is propagated.
    """
    del self[name]
def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
    """
    Build a property that reads, writes and deletes the box called ``name``.

    Args:
        name: Key of the box handled by the property.
        fallback: Keys handed to ``_get_rectangle`` as defaults on read.

    Returns:
        A ``property`` delegating to ``_get_rectangle``, ``_set_rectangle``
        and ``_delete_rectangle``.

    """

    def read_box(self: Any) -> Any:
        return _get_rectangle(self, name, fallback)

    def write_box(self: Any, value: Any) -> None:
        _set_rectangle(self, name, value)

    def remove_box(self: Any) -> None:
        _delete_rectangle(self, name)

    return property(read_box, write_box, remove_box)
class Transformation:
    """
    Represent a 2D transformation.

    A change of coordinate system is described by a 3-by-3 matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, so PDF usually writes it as the
    six-element array [ a b c d e f ]; this class stores it the same way
    in ``ctm``.

    Points are transformed by matrix multiplication::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        m = self.ctm
        first_row = (m[0], m[1], 0)
        second_row = (m[2], m[3], 0)
        third_row = (m[4], m[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns
            of each row.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Build the "cm" operator text for this matrix, four decimals per value.
        m = self.ctm
        head = f"{m[0]:.4f} {m[1]:.4f} {m[2]:.4f} "
        tail = f"{m[3]:.4f} {m[4]:.4f} {m[5]:.4f} cm"
        return head + tail

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` whose matrix is this matrix multiplied
            by the matrix of ``m``.

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        The offsets are added to the ``e`` and ``f`` elements of the matrix.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        When only one factor is given, it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        factor_x = sy if sx is None else sx
        factor_y = sx if sy is None else sy
        assert factor_x is not None
        assert factor_y is not None
        scaling: TransformationMatrixType = (
            (factor_x, 0, 0),
            (0, factor_y, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'): a list when ``pt`` is a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        m = self.ctm
        new_x = convert(float(pt[0]) * m[0] + float(pt[1]) * m[2] + m[4])
        new_y = convert(float(pt[0]) * m[1] + float(pt[1]) * m[3] + m[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image object
        of that page is stored in place of the current one in the owning
        document, and ``name``, ``data`` and ``image`` are refreshed from it.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # The owning document is treated as a writer when it has ``_id_translated``.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Swap the stored object, then point it back at our own reference.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Replace the old extension. A name without any "." is kept whole:
        # slicing with ``rfind`` would return -1 there and drop its last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data added before the closing bracket.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of ids is not stored: it is obtained from ``ids_function``
    whenever it is needed, and each image is produced by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        # Not read anywhere in this class; kept so the attribute stays available.
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the ids of all images, as produced by ``ids_function``."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(id, image)`` pairs for all images."""
        # The ids are listed once; going through ``self[key]`` would list
        # them again for every single element.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image, or a sub-sequence for a slice.

        Args:
            index: An image id (string, list or tuple), an integer position
                (negative values count from the end) or a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer position is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Direct lookup by id: the id list is not needed at all.
            return self.get_function(index)
        ids = self.ids_function()
        if isinstance(index, slice):
            selected = ids[index]
            return type(self)((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # One listing of the ids for the whole iteration instead of one per item.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
483class PageObject(DictionaryObject):
484 """
485 PageObject represents a single page within a PDF file.
487 Typically these objects will be created by accessing the
488 :attr:`pages<pypdf.PdfReader.pages>` property of the
489 :class:`PdfReader<pypdf.PdfReader>` class, but it is
490 also possible to create an empty page with the
491 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
493 Args:
494 pdf: PDF file the page belongs to.
495 indirect_reference: Stores the original indirect reference to
496 this object in its source PDF
498 """
500 original_page: "PageObject" # very local use in writer when appending
def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page and, when a reference is given, copy its entries.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page in its source PDF. Unless
            it is null or ``None``, the dictionary it resolves to is copied
            into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled lazily; ``None`` means "not extracted yet".
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source_page = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source_page)
def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as the type marker,
    so that a page gives the same result as a DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))
def hash_value_data(self) -> bytes:
    """Return the parent class data followed by the encoded ``id()`` of this page."""
    identity = str(id(self)).encode()
    return super().hash_value_data() + identity
@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is a multiple of 1/72 inch: 1 stands for 1/72 inch per user
    space unit, 3 for 3/72 inch. When the page has no such entry, 1 is
    returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)
@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both values are taken from the
    media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page
def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the ids of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are scanned; the page itself when ``None``.
        ancest: Names of the XObject forms leading to ``obj``; empty for the page.
        call_stack: Indirect references already being scanned, used to stop
            on forms that reference each other.

    Returns:
        Image ids: a plain name for an image found directly in ``obj`` when
        ``ancest`` is empty, otherwise the list of names leading to it. The
        ids of the inline images are added once, by the call that has no
        ancestors.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already being visited further up: do not recurse again.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # Inline images are read from the page contents, not from the forms.
    # Calls made for a nested form must not report them again, otherwise
    # their ids would be repeated for each form in the result.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst
def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image identified by ``id``.

    Args:
        id: An image name, an inline image id of the form ``~n~``, or a
            list/tuple of names leading to an image nested in XObject forms.
        obj: Object whose XObject resources are searched; the page itself
            when ``None``.

    Returns:
        The matching ``ImageFile``.

    Raises:
        KeyError: If the XObject resources are missing and ``id`` is not
            an inline image id.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path with a single step is the same as the plain name.
        id = id[0]

    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        x_objects = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObject resources are acceptable only for inline images.
        if id[0] != "~" or id[-1] != "~":
            raise

    if not isinstance(id, str):
        # in a subobject
        remaining_path = id[1:]
        return self._get_image(
            remaining_path, cast(DictionaryObject, x_objects[id[0]])
        )

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
    extension, byte_stream, pil_image = _xobj_to_image(
        cast(DictionaryObject, x_objects[id])
    )
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=x_objects[id].indirect_reference,
    )
@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    The returned object lists all images of the page. An image can be
    selected with:

    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
        replace the image in the pdf with the new image
        applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )
def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The mapped name when ``v`` is in the inline image value mapping.
        Otherwise, for a ``NameObject``, the ``/ColorSpace`` resource of
        that name; any other value is returned unchanged.

    Raises:
        PdfReadError: If ``v`` is an unmapped name that cannot be found
            in the ``/ColorSpace`` resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        # Not one of the predefined values (or not usable as a mapping key).
        pass

    if not isinstance(v, NameObject):
        return v

    # It is a custom name, thus we have to look in resources.
    # The only applicable case is for ColorSpace.
    try:
        color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
        return cast(DictionaryObject, color_spaces)[v]
    except KeyError:  # for the resources and for v
        raise PdfReadError(f"Cannot find resource entry {v} for {k}")
def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A dictionary mapping ``~0~``, ``~1~``, ... to the images, in the
        order of the ``INLINE IMAGE`` operations of the page contents;
        empty when the page has no contents.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is found.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect every inline image, failing on stray operators
    # before any image gets decoded.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append((param["settings"], param["data"]))
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in settings.items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                translated = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                translated = self._translate_value_inline_image(k, v)
            # Entries already present win over later ones with the same name.
            init.setdefault(NameObject(_INLINE_IMAGE_KEY_MAPPING[k]), translated)
        stream_object = EncodedStreamObject.initialize_from_dictionary(init)
        # Imported only once an image really has to be decoded.
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream_object)
        image_id = f"~{num}~"
        files[image_id] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Reading returns 0 when the page has no rotation entry. Writing rounds
    the value to the nearest multiple of 90 and reduces it modulo 360.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)
def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0, the content receives the matching
    transformation and every box present on the page is recomputed.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # the rotation to apply goes the other way
    self.rotation = 0
    media = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box.
    centre_x = float(media.left + media.width / 2)
    centre_y = float(media.bottom + media.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # Move the rotated media box back so that its lowest corner is at the origin.
    corner_a = trsf.apply_on(media.lower_left)
    corner_b = trsf.apply_on(media.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        first = trsf.apply_on(box.lower_left)
        second = trsf.apply_on(box.upper_right)
        # Transformed corners may have swapped roles: order them again.
        self[NameObject(box_name)] = RectangleObject(
            (
                min(first[0], second[0]),
                min(first[1], second[1]),
                max(first[0], second[0]),
                max(first[1], second[1]),
            )
        )
def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self
def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of two resource dictionaries.

    Args:
        res1: Resources the result starts from.
        res2: Resources whose entries are added.
        resource: Key of the resource category to merge.
        new_res1: If True, the result is a new dictionary filled from
            ``res1``; otherwise ``res1[resource]`` itself is extended.

    Returns:
        A tuple ``(merged, renamed)``: the merged entries sorted by key,
        and a mapping from the names of ``res2`` that had to change to
        their new names.

    """
    pdf = None
    is_pdf_writer = False
    own_reference = getattr(self, "indirect_reference", None)
    if isinstance(own_reference, IndirectObject):
        try:
            pdf = own_reference.pdf
        except AttributeError:
            pdf = None
        else:
            # expect isinstance(pdf, PdfWriter)
            is_pdf_writer = hasattr(pdf, "_add_object")

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # Start with the key as it is (e.g. "foo"), then go through
        # "foo-0", "foo-1", ... The merged dictionary holds a finite number
        # of keys, so a free one is always reached.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # this name is already used for exactly the same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # the entry gets another name in the merged dictionary
            rename_res[key] = newname

        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries are ordered by key.
    ordered = sorted(new_res.items())
    new_res.clear()
    for sorted_key, sorted_value in ordered:
        new_res[sorted_key] = sorted_value
    return new_res, rename_res
@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace names used as operands according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping from old names to new names.
        pdf: Document passed to the ``ContentStream`` built from ``stream``.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the renamed operands.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream

    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            # Only names can be renamed; everything else stays as it is.
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return renamed_stream
@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: Contents to wrap in a new ``ContentStream``.
        pdf: Document passed to that ``ContentStream``.
        ctm: Values of the matrix, used as operands of the ``cm`` operator.

    Returns:
        The new ``ContentStream`` starting with the ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    cm_operands = [FloatObject(value) for value in ctm]
    stream.operations.insert(0, [cm_operands, b"cm"])
    return stream
def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist
        or is null. For an array of streams, the data of all streams is
        concatenated.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if is_null_or_none(obj):
        # Same handling as ``get_contents``: a null entry has no data to read.
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()
def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped in a ``ContentStream``, or ``None``
        if it does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None

    # A page without a usable reference has no document to hand over.
    own_reference = getattr(self, "indirect_reference", None)
    pdf = getattr(own_reference, "pdf", None)

    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects

    Args:
        content: new content; if None delete the content field.
    """
    if getattr(self, "indirect_reference", None) is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    writer = self.indirect_reference.pdf

    # Nullify every stream of an existing contents array.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        old_streams = cast(ArrayObject, self[PG.CONTENTS])
        for old_stream in old_streams:
            try:
                writer._replace_object(indirect_reference=old_stream.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # A new array is stored as an array of newly added objects.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(item) for item in content)

    if is_null_or_none(content):
        # Deletion: nothing to do when the page has no contents.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # Reuse the reference of the current contents for the new content.
        # TODO: in the future may require generation management
        content.indirect_reference = self[PG.CONTENTS].indirect_reference
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) from both pages are kept.
    The mediabox, cropbox, etc of this page are not altered.
    By default the content stream of ``page2`` is appended after this
    page's content stream, so it is drawn after, i.e. "on top" of,
    this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation callback and no matrix: a plain merge.
    self._merge_page(page2, expand=expand, over=over)
def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page (resources, annotations and contents).

    When this page's indirect reference belongs to an object exposing
    ``_add_object`` (used here to detect a ``PdfWriter``), the work is
    delegated to :meth:`_merge_page_writer`. Otherwise new resource and
    annotation containers are built and assigned to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix, forwarded to
            :meth:`_expand_mediabox` when ``expand`` is True.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, grow this page's mediabox to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Annotations of both pages are collected; non-array values are skipped.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" built from the MERGE_CROP_BOX rectangle of page2.
        # The operands are stored as a list (not a lazy ``map``): a map
        # object is a one-shot iterator and would yield nothing if the
        # operations were traversed a second time.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Unlike :meth:`_merge_page`, the existing ``/Resources`` dictionary and
    ``/Annots`` array of this page are extended rather than replaced, and
    annotations of ``page2`` are cloned into the object owning this page
    (``self.indirect_reference.pdf``).

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix; applied to the ``/Rect`` and
            ``/QuadPoints`` of cloned annotations and forwarded to
            :meth:`_expand_mediabox` when ``expand`` is True.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, grow this page's mediabox to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # The transformed corner points are re-normalized so that the
            # new /Rect is (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # Only the first four points (eight numbers) are transformed.
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" built from the MERGE_CROP_BOX rectangle of page2.
        # The operands are stored as a list (not a lazy ``map``): a map
        # object is a one-shot iterator and would yield nothing if the
        # operations were traversed a second time.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: The page whose mediabox must fit into this page's mediabox.
        ctm: Optional 6-element matrix applied to the four corners of the
            ``page2`` mediabox before the bounding box is computed; the
            corners are used as they are when it is ``None``.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    left = other_box.left.as_numeric()
    bottom = other_box.bottom.as_numeric()
    right = other_box.right.as_numeric()
    top = other_box.top.as_numeric()
    # Corners of page2's mediabox as (x, y) pairs.
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    if ctm is None:
        xs = tuple(x for x, _ in corner_points)
        ys = tuple(y for _, y in corner_points)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points
        )
        ys = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points
        )

    # Union of this page's box with the bounding box of the corners.
    self.mediabox.lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    self.mediabox.upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))
def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after applying a transformation matrix.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    transformation matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def apply_matrix(page2_content: Any) -> ContentStream:
        # Applied by _merge_page to page2's content stream.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, apply_matrix, matrix, over, expand)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its stream.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )
def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after rotating its stream.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after translating its stream.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four corners after they have been transformed by ``ctm``.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, ctm)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the content, this rescales the page boundaries (bleedbox,
    trimbox, artbox, cropbox, mediabox), the ``Rect`` of array-valued
    annotations and the ``/BBox`` of the viewport (the first one when the
    viewport entry is an array).

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # Boxes are rescaled one after another, in this order; each one is
    # read only after the previous one has been assigned.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    annotations = self[PG.ANNOTS] if PG.ANNOTS in self else None
    if isinstance(annotations, ArrayObject):
        for entry in annotations:
            annotation_obj = entry.get_object()
            if ADA.Rect not in annotation_obj:
                continue
            rectangle = annotation_obj[ADA.Rect]
            if not isinstance(rectangle, ArrayObject):
                continue
            # x coordinates use sx, y coordinates use sy.
            for index, factor in enumerate((sx, sy, sx, sy)):
                rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP not in self:
        return

    viewport = self[PG.VP]
    is_array = isinstance(viewport, ArrayObject)
    bbox = viewport[0]["/BBox"] if is_array else viewport["/BBox"]  # type: ignore
    scaled_bbox = RectangleObject(
        (
            float(bbox[0]) * sx,
            float(bbox[1]) * sy,
            float(bbox[2]) * sx,
            float(bbox[3]) * sy,
        )
    )
    target = self[NameObject(PG.VP)]
    if is_array:
        target = target[NumberObject(0)]  # type: ignore
    target[NameObject("/BBox")] = scaled_bbox  # type: ignore
def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)
def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The scaling factors are the requested dimensions divided by the
    current mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal = width / float(self.mediabox.width)
    vertical = height / float(self.mediabox.height)
    self.scale(horizontal, vertical)
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` of the page's content stream.

    Raises:
        ValueError: If the compressed stream cannot be stored through the
            content's own reference and the page does not belong to an
            object exposing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the object slot referenced by the content stream.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    The number is the index of this page in the ``pages`` of the object
    its indirect reference belongs to.

    Returns:
        Page number; None if the page is not attached to a PDF.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # The page is not listed in the pages of its owner.
        return None
def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page's content operations and fonts.

    One line is produced per operation (operator, the ``str`` items of the
    first operand for ``TJ``, and the repr of the operands), followed by a
    separator and, for every font resource, its name, its repr and, when
    available, its ``/Encoding`` repr and decoded ``/ToUnicode`` data.

    Returns:
        The dump; it ends with ``"No Font\\n"`` if a ``KeyError`` occurs
        while reading the fonts.

    """
    pieces: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        shown = (
            [item for item in operands[0] if isinstance(item, str)]
            if operator == b"TJ"
            else []
        )
        pieces.append(
            operator.decode("utf-8") + " " + "".join(shown) + operands.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        font_table = self[PG.RESOURCES]["/Font"]  # type:ignore
        for font_key in font_table:
            pieces.append(font_key + "\n")
            font_entry = font_table[font_key]
            pieces.append(font_entry.__repr__() + "\n")
            # Both entries below are optional; any failure is ignored.
            try:
                pieces.append(font_entry["/Encoding"].__repr__() + "\n")
            except Exception:
                pass
            try:
                pieces.append(font_entry["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content operations.

    See extract_text for most arguments.

    Args:
        obj: the object to extract text from; its ``/Resources`` are looked
            up on the object itself or on its ``/Parent`` chain.
        pdf: passed to ``ContentStream`` when the content has to be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no ``/Resources`` can be
        found or when the content cannot be obtained.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            # Fonts raising AttributeError/TypeError while loading are skipped.
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as Tw, Tc, T* and Tj with the operands split accordingly.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric item at least as large as the threshold adds
                    # a space, unless the text is empty or already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as TL (with the negated second operand) followed by Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before descending into the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                # Output is still empty: nothing to terminate.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything whose /Subtype is not /Image is extracted as a form.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Pending text and memorized matrices are reset in any case.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each of its ``/Parent`` objects are visited; fonts
    found on a later visited object replace same-named fonts found earlier.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    node: Any = self
    collected: dict[str, Font] = {}
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            font_table = resources["/Font"]
            collected.update(
                {name: Font.from_font_resource(font_table[name]) for name in font_table}
            )
        # Walk up; the loop ends once an object has no /Parent.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected
def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned when the page has no (or a null) ``/Contents``
        or when no text show operations are found.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(fonts, indent=2, default=asdict),
            "utf-8"
        )

    # /Contents is optional: a page without it has nothing to extract, so
    # return the same empty result as for a page without text operations
    # instead of failing on the lookup below.
    if is_null_or_none(self.get("/Contents", None)):
        return ""

    ops = iter(
        ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
    )
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
1917 def extract_text(
1918 self,
1919 *args: Any,
1920 orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
1921 space_width: float = 200.0,
1922 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1923 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1924 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1925 extraction_mode: Literal["plain", "layout"] = "plain",
1926 **kwargs: Any,
1927 ) -> str:
1928 """
1929 Locate all text drawing commands, in the order they are provided in the
1930 content stream, and extract the text.
1932 This works well for some PDF files, but poorly for others, depending on
1933 the generator used. This will be refined in the future.
1935 Do not rely on the order of text coming out of this function, as it
1936 will change if this function is made more sophisticated.
1938 Arabic and Hebrew are extracted in the correct order.
1939 If required a custom RTL range of characters can be defined;
1940 see function set_custom_rtl.
1942 Additionally you can provide visitor methods to get informed on all
1943 operations and all text objects.
1944 For example in some PDF files this can be useful to parse tables.
1946 Args:
1947 orientations: list of orientations extract_text will look for
1948 default = (0, 90, 180, 270)
1949 note: currently only 0 (up),90 (turned left), 180 (upside down),
1950 270 (turned right)
1951 Silently ignored in "layout" mode.
1952 space_width: force default space width
1953 if not extracted from font (default: 200)
1954 Silently ignored in "layout" mode.
1955 visitor_operand_before: function to be called before processing an operation.
1956 It has four arguments: operator, operand-arguments,
1957 current transformation matrix and text matrix.
1958 Ignored with a warning in "layout" mode.
1959 visitor_operand_after: function to be called after processing an operation.
1960 It has four arguments: operator, operand-arguments,
1961 current transformation matrix and text matrix.
1962 Ignored with a warning in "layout" mode.
1963 visitor_text: function to be called when extracting some text at some position.
1964 It has five arguments: text, current transformation matrix,
1965 text matrix, font-dictionary and font-size.
1966 The font-dictionary may be None in case of unknown fonts.
1967 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
1968 Ignored with a warning in "layout" mode.
1969 extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
1970 "layout" for experimental layout mode functionality.
1971 NOTE: orientations, space_width, and visitor_* parameters are NOT respected
1972 in "layout" mode.
1974 kwargs:
1975 layout_mode_space_vertically (bool): include blank lines inferred from
1976 y distance + font height. Defaults to True.
1977 layout_mode_scale_weight (float): multiplier for string length when calculating
1978 weighted average character width. Defaults to 1.25.
1979 layout_mode_strip_rotated (bool): layout mode does not support rotated text.
1980 Set to False to include rotated text anyway. If rotated text is discovered,
1981 layout will be degraded and a warning will result. Defaults to True.
1982 layout_mode_debug_path (Path | None): if supplied, must target a directory.
1983 creates the following files with debug information for layout mode
1984 functions if supplied:
1986 - fonts.json: output of self._layout_mode_fonts
1987 - tjs.json: individual text render ops with corresponding transform matrices
1988 - bts.json: text render ops left justified and grouped by BT/ET operators
1989 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1990 layout_mode_font_height_weight (float): multiplier for font height when calculating
1991 blank lines. Defaults to 1.
1993 Returns:
1994 The extracted text
1996 """
1997 if extraction_mode not in ["plain", "layout"]:
1998 raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
1999 if extraction_mode == "layout":
2000 for visitor in (
2001 "visitor_operand_before",
2002 "visitor_operand_after",
2003 "visitor_text",
2004 ):
2005 if locals()[visitor]:
2006 logger_warning(
2007 f"Argument {visitor} is ignored in layout mode",
2008 __name__,
2009 )
2010 return self._layout_mode_text(
2011 space_vertically=kwargs.get("layout_mode_space_vertically", True),
2012 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
2013 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
2014 debug_path=kwargs.get("layout_mode_debug_path"),
2015 font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
2016 )
2017 if len(args) >= 1:
2018 if isinstance(args[0], str):
2019 if len(args) >= 3:
2020 if isinstance(args[2], (tuple, int)):
2021 orientations = args[2]
2022 else:
2023 raise TypeError(f"Invalid positional parameter {args[2]}")
2024 if len(args) >= 4:
2025 if isinstance(args[3], (float, int)):
2026 space_width = args[3]
2027 else:
2028 raise TypeError(f"Invalid positional parameter {args[3]}")
2029 elif isinstance(args[0], (tuple, int)):
2030 orientations = args[0]
2031 if len(args) >= 2:
2032 if isinstance(args[1], (float, int)):
2033 space_width = args[1]
2034 else:
2035 raise TypeError(f"Invalid positional parameter {args[1]}")
2036 else:
2037 raise TypeError(f"Invalid positional parameter {args[0]}")
2039 if isinstance(orientations, int):
2040 orientations = (orientations,)
2042 return self._extract_text(
2043 self,
2044 self.pdf,
2045 orientations,
2046 space_width,
2047 PG.CONTENTS,
2048 visitor_operand_before,
2049 visitor_operand_after,
2050 visitor_text,
2051 )
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    Args:
        xform: the XObject to extract text from; it is handed to the
            internal extractor in place of the page itself.
        orientations: orientations the extraction will look for,
            default = (0, 90, 180, 270), i.e. the same default as
            ``extract_text``.
            The former default ``(0, 90, 270, 360)`` left out 180 (upside
            down) and listed 360, which is not one of the four orientations
            documented for text extraction.
        space_width: force default space width
            if not extracted from font (default: 200)
        visitor_operand_before: forwarded unchanged to the internal extractor;
            see the argument of the same name of ``extract_text``.
        visitor_operand_after: forwarded unchanged to the internal extractor;
            see the argument of the same name of ``extract_text``.
        visitor_text: forwarded unchanged to the internal extractor;
            see the argument of the same name of ``extract_text``.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        # extract_text passes PG.CONTENTS in this position; no key is passed
        # for an XObject.
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the names of the fonts found for this page, split by embedding.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the second
        set holds every font that was found but is not in the first set.

    """
    page = self.get_object()
    assert isinstance(page, DictionaryObject)
    # The walker fills, and hands back, the two sets it is given.
    all_fonts, embedded_fonts = _get_fonts_walk(page, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts
# NOTE(review): the second argument presumably lists the keys to fall back to,
# in order, when the box itself is missing - confirm against
# _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed.
"""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment.
"""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming.
"""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator.
"""
@property
def annotations(self) -> Optional[ArrayObject]:
    """The ``/Annots`` array of the page, or None if the page has no such entry."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Passing None removes the ``/Annots`` entry; this is a no-op when the
    page has none.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]
class _VirtualList(Sequence[PageObject]):
    """
    List-like view over pages that stores no page itself.

    The length and the items are produced on demand by the two callables
    given to the constructor.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of items.
            get_function: returns the item at a non-negative, in-range index.

        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or for a slice a new view of the same type.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is out of range.

        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The sliced view maps its own positions onto positions of this view.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or a slice of pages, from the page tree.

        The page is removed from the ``/Kids`` of its parent, whose ``/Count``
        is decremented; a parent left without kids is in turn removed from its
        own parent. The entry of the page in ``flattened_pages`` of the owning
        document is dropped as well, on a best-effort basis.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is out of range.
            PdfReadError: the page is not listed in the ``/Kids`` of its parent.

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for p in sorted(range(*index.indices(len(self))), reverse=True):
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                if first:
                    # Only the page itself has an entry in the flattened list.
                    # Later passes of this loop prune emptied intermediate
                    # nodes; deleting at `index` again for those would drop
                    # an unrelated page from the list.
                    try:
                        assert ind is not None
                        del ind.pdf.flattened_pages[index]  # case of page in a Reader
                    except Exception:  # pragma: no cover
                        # deliberate best effort: the owner may not keep such a list
                        pass
                first = False
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                # `ind` is no longer listed here: nothing left to prune
                break

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"
def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Args:
        obj: dictionary to inspect; the function recurses into the XObjects,
            annotations and normal appearances reachable from it
        fnt: set that receives the names of the fonts used
        emb: set that receives the names of the embedded fonts

    Returns:
        A tuple (fnt, emb); these are the very sets passed in, updated in place

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has a '/CharProcs' entry, or when its
    font descriptor (or the font descriptor of its first descendant font) has
    a key called 'FontFilex' (where x is null, 2, or 3). An embedded font
    without 'BaseFont' is recorded as its '/Subtype' in parentheses.

    Fonts are looked up in the values of '/DR' -> '/Font' and of
    '/Resources' -> '/Font'. When the normal appearance '/AP' -> '/N' is a
    sub-dictionary rather than an XObject, each of its values is walked.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the /FontDescriptor of the font has a font file key."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast("DictionaryObject", font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font in fnt and, when embedded, in emb."""
        f = cast("DictionaryObject", f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            descendants = cast("ArrayObject", f["/DescendantFonts"])
            # Only the first descendant is inspected; an empty array means
            # there is nothing that could be embedded.
            is_embedded = bool(descendants) and has_font_file(
                cast("DictionaryObject", descendants[0].get_object())
            )
        if not is_embedded:
            return
        if "/BaseFont" in f:
            emb.add(cast(str, f["/BaseFont"]))
        else:
            emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast("DictionaryObject", obj["/DR"])
        if "/Font" in default_resources:
            # iterate the font dictionaries, not their resource names
            for f in cast("DictionaryObject", default_resources["/Font"]).values():
                process_font(f)
    if "/Resources" in obj:
        resources = cast("DictionaryObject", obj["/Resources"])
        if "/Font" in resources:
            for f in cast("DictionaryObject", resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast("DictionaryObject", resources["/XObject"]).values():
                _get_fonts_walk(cast("DictionaryObject", x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast("ArrayObject", obj["/Annots"]):
            _get_fonts_walk(cast("DictionaryObject", a.get_object()), fnt, emb)
    if "/AP" in obj:
        appearance = cast("DictionaryObject", obj["/AP"])
        if "/N" in appearance:
            normal = cast("DictionaryObject", appearance["/N"])
            if normal.get("/Type") == "/XObject":
                _get_fonts_walk(normal, fnt, emb)
            else:
                # sub-dictionary: walk each appearance it holds, not its keys
                for state in normal.values():
                    _get_fonts_walk(
                        cast("DictionaryObject", state.get_object()), fnt, emb
                    )
    return fnt, emb  # return the sets for each page