Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from collections.abc import Iterable, Iterator, Sequence
32from copy import deepcopy
33from dataclasses import asdict, dataclass
34from decimal import Decimal
35from io import BytesIO
36from pathlib import Path
37from typing import (
38 Any,
39 Callable,
40 Literal,
41 Optional,
42 Union,
43 cast,
44 overload,
45)
47from ._font import Font
48from ._protocols import PdfCommonDocProtocol
49from ._text_extraction import (
50 _layout_mode,
51)
52from ._text_extraction._text_extractor import TextExtraction
53from ._utils import (
54 CompressedTransformationMatrix,
55 TransformationMatrixType,
56 _human_readable_bytes,
57 deprecate,
58 logger_warning,
59 matrix_multiply,
60)
61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING
62from .constants import AnnotationDictionaryAttributes as ADA
63from .constants import ImageAttributes as IA
64from .constants import PageAttributes as PG
65from .constants import Resources as RES
66from .errors import PageSizeNotDefinedError, PdfReadError
67from .generic import (
68 ArrayObject,
69 ContentStream,
70 DictionaryObject,
71 EncodedStreamObject,
72 FloatObject,
73 IndirectObject,
74 NameObject,
75 NullObject,
76 NumberObject,
77 PdfObject,
78 RectangleObject,
79 StreamObject,
80 is_null_or_none,
81)
# Pillow is an optional dependency: remember whether it could be imported so
# that image-related code can raise a helpful error later on.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore[assignment,misc,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the box stored under ``name`` as a ``RectangleObject``.

    When the entry is missing or null, the keys in ``defaults`` are tried in
    order and the first usable (neither missing nor null) one is taken.
    The resulting rectangle is written back under ``name`` before it is
    returned.

    Args:
        self: Dictionary-like owner of the boxes. Its ``pdf`` attribute is
            used to resolve an indirect reference.
        name: Key of the requested box.
        defaults: Fallback keys, tried in order.

    Returns:
        The rectangle found for ``name`` (or for the first usable fallback).

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # Apply the same "missing or null" test as for the primary entry,
            # so that a null fallback does not shadow a later valid one.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Tolerate over-long arrays: warn and keep the first four values.
        logger_warning(f"Expected four values, got {length}: {retval}", __name__)
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore
    _set_rectangle(self, name, retval)
    return retval
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value
118def _delete_rectangle(self: Any, name: str) -> None:
119 del self[name]
122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
123 return property(
124 lambda self: _get_rectangle(self, name, fallback),
125 lambda self, value: _set_rectangle(self, name, value),
126 lambda self: _delete_rectangle(self, name),
127 )
class Transformation:
    """
    A 2D affine transformation.

    The mapping between two coordinate systems is described by a 3-by-3
    matrix of the following shape::

        a b 0
        c d 0
        e f 1

    Only six of its entries can vary, which is why PDF usually writes such a
    matrix as the six-element array [ a b c d e f ].

    Points are row vectors that are multiplied with the matrix::

                                  a b 0
        [ x' y' 1 ] = [ x y 1 ] × c d 0
                                  e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # Six-element form (a, b, c, d, e, f); the default is the identity.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to its 3-by-3 form.

        The result is laid out as ((a, b, 0), (c, d, 0), (e, f, 1)).
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f), i.e. the first two columns of
            each of the three rows.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Render the six entries with four decimals, followed by the "cm" operator.
        values = [self.ctm[position] for position in range(6)]
        return " ".join(f"{value:.4f}" for value in values) + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        The matrix of this instance is multiplied by the matrix of ``m``.

        Args:
            m: The Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = op.transform(Transformation((-1, 0, 0, 1, width, 0)))  # then horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        base = self.ctm
        # Only the e/f entries change for a translation.
        return Transformation(ctm=(*base[:4], base[4] + tx, base[5] + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        When only one factor is given, it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Map a point through the transformation matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` is a list,
            a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        ctm = self.ctm
        new_x = convert(float(pt[0]) * ctm[0] + float(pt[1]) * ctm[2] + ctm[4])
        new_y = convert(float(pt[0]) * ctm[1] + float(pt[1]) * ctm[3] + ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _replace_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with everything from its last dot replaced by ``extension``.

        A name without any dot is kept whole and simply gets ``extension``
        appended (slicing with ``rfind`` would drop its last character).
        """
        stem, dot, _old_suffix = name.rpartition(".")
        return (stem if dot else name) + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow write a one-page PDF holding the new image, then read
        # the image object back from that temporary document.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Put the new image object in the slot of the existing one, so the
        # existing reference now points to it.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        self.name = self._replace_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The identifiers are obtained by calling ``ids_function`` and each image is
    built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return (identifier, image) pairs for all images."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        Args:
            index: An integer position (negative values count from the end),
                a slice, or an image identifier (string, list or tuple).

        Returns:
            A single image, or a new virtual list for a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Reuse the identifiers just fetched instead of asking for them
            # a second time through ``len(self)``.
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once; indexing ``self[i]`` for every position
        # would call ``ids_function`` again for each single image.
        for identifier in self.ids_function():
            yield self.get_function(identifier)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
481class PageObject(DictionaryObject):
482 """
483 PageObject represents a single page within a PDF file.
485 Typically these objects will be created by accessing the
486 :attr:`pages<pypdf.PdfReader.pages>` property of the
487 :class:`PdfReader<pypdf.PdfReader>` class, but it is
488 also possible to create an empty page with the
489 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
491 Args:
492 pdf: PDF file the page belongs to.
493 indirect_reference: Stores the original indirect reference to
494 this object in its source PDF
496 """
498 original_page: "PageObject" # very local use in writer when appending
500 def __init__(
501 self,
502 pdf: Optional[PdfCommonDocProtocol] = None,
503 indirect_reference: Optional[IndirectObject] = None,
504 ) -> None:
505 DictionaryObject.__init__(self)
506 self.pdf = pdf
507 self.inline_images: Optional[dict[str, ImageFile]] = None
508 self.indirect_reference = indirect_reference
509 if not is_null_or_none(indirect_reference):
510 assert indirect_reference is not None, "mypy"
511 self.update(cast(DictionaryObject, indirect_reference.get_object()))
512 self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
514 def hash_bin(self) -> int:
515 """
516 Used to detect modified object.
518 Note: this function is overloaded to return the same results
519 as a DictionaryObject.
521 Returns:
522 Hash considering type and value.
524 """
525 return hash(
526 (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
527 )
529 def hash_value_data(self) -> bytes:
530 data = super().hash_value_data()
531 data += f"{id(self)}".encode()
532 return data
534 @property
535 def user_unit(self) -> float:
536 """
537 A read-only positive number giving the size of user space units.
539 It is in multiples of 1/72 inch. Hence a value of 1 means a user
540 space unit is 1/72 inch, and a value of 3 means that a user
541 space unit is 3/72 inch.
542 """
543 return self.get(PG.USER_UNIT, 1)
545 @staticmethod
546 def create_blank_page(
547 pdf: Optional[PdfCommonDocProtocol] = None,
548 width: Union[float, Decimal, None] = None,
549 height: Union[float, Decimal, None] = None,
550 ) -> "PageObject":
551 """
552 Return a new blank page.
554 If ``width`` or ``height`` is ``None``, try to get the page size
555 from the last page of *pdf*.
557 Args:
558 pdf: PDF file the page is within.
559 width: The width of the new page expressed in default user
560 space units.
561 height: The height of the new page expressed in default user
562 space units.
564 Returns:
565 The new blank page
567 Raises:
568 PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
569 no page
571 """
572 page = PageObject(pdf)
574 # Creates a new page (cf PDF Reference §7.7.3.3)
575 page.__setitem__(NameObject(PG.TYPE), NameObject("/Page"))
576 page.__setitem__(NameObject(PG.PARENT), NullObject())
577 page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject())
578 if width is None or height is None:
579 if pdf is not None and len(pdf.pages) > 0:
580 lastpage = pdf.pages[len(pdf.pages) - 1]
581 width = lastpage.mediabox.width
582 height = lastpage.mediabox.height
583 else:
584 raise PageSizeNotDefinedError
585 page.__setitem__(
586 NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore
587 )
589 return page
591 def _get_ids_image(
592 self,
593 obj: Optional[DictionaryObject] = None,
594 ancest: Optional[list[str]] = None,
595 call_stack: Optional[list[Any]] = None,
596 ) -> list[Union[str, list[str]]]:
597 if call_stack is None:
598 call_stack = []
599 _i = getattr(obj, "indirect_reference", None)
600 if _i in call_stack:
601 return []
602 call_stack.append(_i)
603 if self.inline_images is None:
604 self.inline_images = self._get_inline_images()
605 if obj is None:
606 obj = self
607 if ancest is None:
608 ancest = []
609 lst: list[Union[str, list[str]]] = []
610 if (
611 PG.RESOURCES not in obj or
612 is_null_or_none(resources := obj[PG.RESOURCES]) or
613 RES.XOBJECT not in cast(DictionaryObject, resources)
614 ):
615 return [] if self.inline_images is None else list(self.inline_images.keys())
617 x_object = resources[RES.XOBJECT].get_object() # type: ignore
618 for o in x_object:
619 if not isinstance(x_object[o], StreamObject):
620 continue
621 if x_object[o][IA.SUBTYPE] == "/Image":
622 lst.append(o if len(ancest) == 0 else [*ancest, o])
623 else: # is a form with possible images inside
624 lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
625 assert self.inline_images is not None
626 lst.extend(list(self.inline_images.keys()))
627 return lst
629 def _get_image(
630 self,
631 id: Union[str, list[str], tuple[str]],
632 obj: Optional[DictionaryObject] = None,
633 ) -> ImageFile:
634 if obj is None:
635 obj = cast(DictionaryObject, self)
636 if isinstance(id, tuple):
637 id = list(id)
638 if isinstance(id, list) and len(id) == 1:
639 id = id[0]
640 try:
641 xobjs = cast(
642 DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
643 )
644 except KeyError:
645 if not (id[0] == "~" and id[-1] == "~"):
646 raise
647 if isinstance(id, str):
648 if id[0] == "~" and id[-1] == "~":
649 if self.inline_images is None:
650 self.inline_images = self._get_inline_images()
651 if self.inline_images is None: # pragma: no cover
652 raise KeyError("No inline image can be found")
653 return self.inline_images[id]
655 from .generic._image_xobject import _xobj_to_image # noqa: PLC0415
656 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
657 extension, byte_stream = imgd[:2]
658 return ImageFile(
659 name=f"{id[1:]}{extension}",
660 data=byte_stream,
661 image=imgd[2],
662 indirect_reference=xobjs[id].indirect_reference,
663 )
664 # in a subobject
665 ids = id[1:]
666 return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))
668 @property
669 def images(self) -> VirtualListImages:
670 """
671 Read-only property emulating a list of images on a page.
673 Get a list of all images on the page. The key can be:
674 - A string (for the top object)
675 - A tuple (for images within XObject forms)
676 - An integer
678 Examples:
679 * `reader.pages[0].images[0]` # return first image
680 * `reader.pages[0].images['/I0']` # return image '/I0'
681 * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
682 * `for img in reader.pages[0].images:` # loops through all objects
684 images.keys() and images.items() can be used.
686 The ImageFile has the following properties:
688 * `.name` : name of the object
689 * `.data` : bytes of the object
690 * `.image` : PIL Image Object
691 * `.indirect_reference` : object reference
693 and the following methods:
694 `.replace(new_image: PIL.Image.Image, **kwargs)` :
695 replace the image in the pdf with the new image
696 applying the saving parameters indicated (such as quality)
698 Example usage:
700 reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)
702 Inline images are extracted and named ~0~, ~1~, ..., with the
703 indirect_reference set to None.
705 """
706 return VirtualListImages(self._get_ids_image, self._get_image)
708 def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
709 """Translate values used in inline image"""
710 try:
711 v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
712 except (TypeError, KeyError):
713 if isinstance(v, NameObject):
714 # It is a custom name, thus we have to look in resources.
715 # The only applicable case is for ColorSpace.
716 try:
717 res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
718 v = cast(DictionaryObject, res)[v]
719 except KeyError: # for res and v
720 raise PdfReadError(f"Cannot find resource entry {v} for {k}")
721 return v
723 def _get_inline_images(self) -> dict[str, ImageFile]:
724 """Load inline images. Entries will be identified as `~1~`."""
725 content = self.get_contents()
726 if is_null_or_none(content):
727 return {}
728 imgs_data = []
729 assert content is not None, "mypy"
730 for param, ope in content.operations:
731 if ope == b"INLINE IMAGE":
732 imgs_data.append(
733 {"settings": param["settings"], "__streamdata__": param["data"]}
734 )
735 elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover
736 raise PdfReadError(
737 f"{ope!r} operator met whereas not expected, "
738 "please share use case with pypdf dev team"
739 )
740 files = {}
741 for num, ii in enumerate(imgs_data):
742 init = {
743 "__streamdata__": ii["__streamdata__"],
744 "/Length": len(ii["__streamdata__"]),
745 }
746 for k, v in ii["settings"].items():
747 if k in {"/Length", "/L"}: # no length is expected
748 continue
749 if isinstance(v, list):
750 v = ArrayObject(
751 [self._translate_value_inline_image(k, x) for x in v]
752 )
753 else:
754 v = self._translate_value_inline_image(k, v)
755 k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
756 if k not in init:
757 init[k] = v
758 ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
759 from .generic._image_xobject import _xobj_to_image # noqa: PLC0415
760 extension, byte_stream, img = _xobj_to_image(ii["object"])
761 files[f"~{num}~"] = ImageFile(
762 name=f"~{num}~{extension}",
763 data=byte_stream,
764 image=img,
765 indirect_reference=None,
766 )
767 return files
769 @property
770 def rotation(self) -> int:
771 """
772 The visual rotation of the page.
774 This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
775 valid values. This property does not affect ``/Contents``.
776 """
777 rotate_obj = self.get(PG.ROTATE, 0)
778 return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object()
780 @rotation.setter
781 def rotation(self, r: float) -> None:
782 self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360)
784 def transfer_rotation_to_content(self) -> None:
785 """
786 Apply the rotation of the page to the content and the media/crop/...
787 boxes.
789 It is recommended to apply this function before page merging.
790 """
791 r = -self.rotation # rotation to apply is in the otherway
792 self.rotation = 0
793 mb = RectangleObject(self.mediabox)
794 trsf = (
795 Transformation()
796 .translate(
797 -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2)
798 )
799 .rotate(r)
800 )
801 pt1 = trsf.apply_on(mb.lower_left)
802 pt2 = trsf.apply_on(mb.upper_right)
803 trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))
804 self.add_transformation(trsf, False)
805 for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
806 if b in self:
807 rr = RectangleObject(self[b]) # type: ignore
808 pt1 = trsf.apply_on(rr.lower_left)
809 pt2 = trsf.apply_on(rr.upper_right)
810 self[NameObject(b)] = RectangleObject(
811 (
812 min(pt1[0], pt2[0]),
813 min(pt1[1], pt2[1]),
814 max(pt1[0], pt2[0]),
815 max(pt1[1], pt2[1]),
816 )
817 )
819 def rotate(self, angle: int) -> "PageObject":
820 """
821 Rotate a page clockwise by increments of 90 degrees.
823 Args:
824 angle: Angle to rotate the page. Must be an increment of 90 deg.
826 Returns:
827 The rotated PageObject
829 """
830 if angle % 90 != 0:
831 raise ValueError("Rotation angle must be a multiple of 90")
832 self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle)
833 return self
835 def _merge_resources(
836 self,
837 res1: DictionaryObject,
838 res2: DictionaryObject,
839 resource: Any,
840 new_res1: bool = True,
841 ) -> tuple[dict[str, Any], dict[str, Any]]:
842 try:
843 assert isinstance(self.indirect_reference, IndirectObject)
844 pdf = self.indirect_reference.pdf
845 is_pdf_writer = hasattr(
846 pdf, "_add_object"
847 ) # expect isinstance(pdf, PdfWriter)
848 except (AssertionError, AttributeError):
849 pdf = None
850 is_pdf_writer = False
852 def compute_unique_key(base_key: str) -> tuple[str, bool]:
853 """
854 Find a key that either doesn't already exist or has the same value
855 (indicated by the bool)
857 Args:
858 base_key: An index is added to this to get the computed key
860 Returns:
861 A tuple (computed key, bool) where the boolean indicates
862 if there is a resource of the given computed_key with the same
863 value.
865 """
866 value = page2res.raw_get(base_key)
867 # TODO: a possible improvement for writer, the indirect_reference
868 # cannot be found because translated
870 # try the current key first (e.g. "foo"), but otherwise iterate
871 # through "foo-0", "foo-1", etc. new_res can contain only finitely
872 # many keys, thus this'll eventually end, even if it's been crafted
873 # to be maximally annoying.
874 computed_key = base_key
875 idx = 0
876 while computed_key in new_res:
877 if new_res.raw_get(computed_key) == value:
878 # there's already a resource of this name, with the exact
879 # same value
880 return computed_key, True
881 computed_key = f"{base_key}-{idx}"
882 idx += 1
883 return computed_key, False
885 if new_res1:
886 new_res = DictionaryObject()
887 new_res.update(res1.get(resource, DictionaryObject()).get_object())
888 else:
889 new_res = cast(DictionaryObject, res1[resource])
890 page2res = cast(
891 DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
892 )
893 rename_res = {}
894 for key in page2res:
895 unique_key, same_value = compute_unique_key(key)
896 newname = NameObject(unique_key)
897 if key != unique_key:
898 # we have to use a different name for this
899 rename_res[key] = newname
901 if not same_value:
902 if is_pdf_writer:
903 new_res[newname] = page2res.raw_get(key).clone(pdf)
904 try:
905 new_res[newname] = new_res[newname].indirect_reference
906 except AttributeError:
907 pass
908 else:
909 new_res[newname] = page2res.raw_get(key)
910 lst = sorted(new_res.items())
911 new_res.clear()
912 for el in lst:
913 new_res[el[0]] = el[1]
914 return new_res, rename_res
916 @staticmethod
917 def _content_stream_rename(
918 stream: ContentStream,
919 rename: dict[Any, Any],
920 pdf: Optional[PdfCommonDocProtocol],
921 ) -> ContentStream:
922 if not rename:
923 return stream
924 stream = ContentStream(stream, pdf)
925 for operands, _operator in stream.operations:
926 if isinstance(operands, list):
927 for i, op in enumerate(operands):
928 if isinstance(op, NameObject):
929 operands[i] = rename.get(op, op)
930 elif isinstance(operands, dict):
931 for i, op in operands.items():
932 if isinstance(op, NameObject):
933 operands[i] = rename.get(op, op)
934 else:
935 raise KeyError(f"Type of operands is {type(operands)}")
936 return stream
938 @staticmethod
939 def _add_transformation_matrix(
940 contents: Any,
941 pdf: Optional[PdfCommonDocProtocol],
942 ctm: CompressedTransformationMatrix,
943 ) -> ContentStream:
944 """Add transformation matrix at the beginning of the given contents stream."""
945 contents = ContentStream(contents, pdf)
946 contents.operations.insert(
947 0,
948 [
949 [FloatObject(x) for x in ctm],
950 b"cm",
951 ],
952 )
953 return contents
955 def _get_contents_as_bytes(self) -> Optional[bytes]:
956 """
957 Return the page contents as bytes.
959 Returns:
960 The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.
962 """
963 if PG.CONTENTS in self:
964 obj = self[PG.CONTENTS].get_object()
965 if isinstance(obj, list):
966 return b"".join(x.get_object().get_data() for x in obj)
967 return cast(EncodedStreamObject, obj).get_data()
968 return None
970 def get_contents(self) -> Optional[ContentStream]:
971 """
972 Access the page contents.
974 Returns:
975 The ``/Contents`` object, or ``None`` if it does not exist.
976 ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.
978 """
979 if PG.CONTENTS in self:
980 try:
981 pdf = cast(IndirectObject, self.indirect_reference).pdf
982 except AttributeError:
983 pdf = None
984 obj = self[PG.CONTENTS]
985 if is_null_or_none(obj):
986 return None
987 resolved_object = obj.get_object()
988 return ContentStream(resolved_object, pdf)
989 return None
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    A page without ``indirect_reference`` gets the content stored directly
    in its dictionary. Otherwise the document behind the reference is used:
    the streams of an existing ``/Contents`` array are replaced by null
    objects, and the new content is stored through ``_add_object`` or
    ``_replace_object`` of that document.

    A deprecation is emitted if the owning document is not a ``PdfWriter``.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally, see the noqa marker.
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # NOTE: despite its name, `writer` may not be a PdfWriter here (see the
    # deprecation above); the fallbacks below deal with that case.
    writer = self.indirect_reference.pdf
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Nullify every stream referenced by the old /Contents array.
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    if isinstance(content, ArrayObject):
        # Register each element of the new array in the document.
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nullify the old object and drop the /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing /Contents object offering a reference (this includes
        # a missing entry): add the content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the reference of the existing /Contents object for the new content.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    The resource references (such as fonts) of both pages are kept, and
    the mediabox, cropbox, etc. of this page stay untouched.
    By default the content stream of ``page2`` is appended to the content
    stream of this page, so that it is drawn after, i.e. "on top" of,
    this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # Plain merge: no content transformation and no matrix are involved.
    self._merge_page(page2, over=over, expand=expand)
def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    If the ``indirect_reference`` of this page belongs to an object which
    provides ``_add_object`` (used to detect a PdfWriter), the work is
    delegated to :meth:`_merge_page_writer`. Otherwise new ``/Resources``,
    ``/Annots`` and ``/Contents`` entries are built from both pages.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix; used when expanding the mediabox and
            passed on to :meth:`_merge_page_writer`.
        over: Put the ``page2`` content over the content of this page if
            True, else under it.
        expand: Expand the mediabox of this page to also fit ``page2``.

    """
    # Only the detection of a writer-owned page is tolerant against missing
    # attributes. The merge itself is not wrapped into an exception handler,
    # so an error raised by the writer path is not silently followed by a
    # second merge on the already modified page.
    reference = getattr(self, "indirect_reference", None)
    if isinstance(reference, IndirectObject) and hasattr(
        getattr(reference, "pdf", None), "_add_object"
    ):  # to detect PdfWriter
        return self._merge_page_writer(
            page2, page2transformation, ctm, over, expand
        )

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re W n").
        # The operands are stored as a list: a one-shot ``map`` iterator
        # would be empty after having been iterated over once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(rect.left),
                    FloatObject(rect.bottom),
                    FloatObject(rect.width),
                    FloatObject(rect.height),
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, which has to own an indirect reference.

    The resources of ``page2`` are merged into the existing ``/Resources``
    of this page, the annotations of ``page2`` are cloned into the document
    behind the indirect reference, and the page contents are replaced by
    an array holding the content streams of both pages.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix applied to the ``/Rect`` and
            ``/QuadPoints`` of the cloned annotations; also used when
            expanding the mediabox.
        over: Put the ``page2`` content over the content of this page if
            True, else under it.
        expand: Expand the mediabox of this page to also fit ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 numbers (4 points) per quadrilateral
                # and may describe several quadrilaterals: transform every
                # point instead of only the first four.
                quad_points: list[Any] = []
                for i in range(0, len(q) - 1, 2):
                    quad_points.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = ArrayObject(quad_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re W n").
        # The operands are stored as a list: a one-shot ``map`` iterator
        # would be empty after having been iterated over once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(rect.left),
                    FloatObject(rect.bottom),
                    FloatObject(rect.width),
                    FloatObject(rect.height),
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
1321 def _expand_mediabox(
1322 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
1323 ) -> None:
1324 corners1 = (
1325 self.mediabox.left.as_numeric(),
1326 self.mediabox.bottom.as_numeric(),
1327 self.mediabox.right.as_numeric(),
1328 self.mediabox.top.as_numeric(),
1329 )
1330 corners2 = (
1331 page2.mediabox.left.as_numeric(),
1332 page2.mediabox.bottom.as_numeric(),
1333 page2.mediabox.left.as_numeric(),
1334 page2.mediabox.top.as_numeric(),
1335 page2.mediabox.right.as_numeric(),
1336 page2.mediabox.top.as_numeric(),
1337 page2.mediabox.right.as_numeric(),
1338 page2.mediabox.bottom.as_numeric(),
1339 )
1340 if ctm is not None:
1341 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1342 new_x = tuple(
1343 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
1344 for i in range(0, 8, 2)
1345 )
1346 new_y = tuple(
1347 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
1348 for i in range(0, 8, 2)
1349 )
1350 else:
1351 new_x = corners2[0:8:2]
1352 new_y = corners2[1:8:2]
1353 lowerleft = (min(new_x), min(new_y))
1354 upperright = (max(new_x), max(new_y))
1355 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))
1356 upperright = (
1357 max(corners1[2], upperright[0]),
1358 max(corners1[3], upperright[1]),
1359 )
1361 self.mediabox.lower_left = lowerleft
1362 self.mediabox.upper_right = upperright
def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is
            accepted as well, in which case its ``ctm`` is used
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    def apply_matrix(page2_content: Any) -> ContentStream:
        # Applies the matrix to the content stream of page2.
        return PageObject._add_transformation_matrix(page2_content, page2.pdf, ctm)

    self._merge_page(page2, apply_matrix, ctm, over, expand)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )
def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, ctm)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    # Corners: lower left, upper left, upper right, lower right.
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    ctm = tuple(float(value) for value in ctm)  # type: ignore[assignment]
    xs = [ctm[0] * x + ctm[2] * y + ctm[4] for x, y in corner_points]
    ys = [ctm[1] * x + ctm[3] * y + ctm[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the contents, this updates the page boundaries (bleedbox,
    trimbox, artbox, cropbox, mediabox), the rectangles of the annotations
    and the ``/BBox`` of the viewport (for an array of viewports: of its
    first entry).

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # The boxes are processed in this fixed order.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    annotations = self[PG.ANNOTS] if PG.ANNOTS in self else None
    if isinstance(annotations, ArrayObject):
        for annotation in annotations:
            annotation_obj = annotation.get_object()
            if ADA.Rect not in annotation_obj:
                continue
            rectangle = annotation_obj[ADA.Rect]
            if not isinstance(rectangle, ArrayObject):
                continue
            # x values are scaled by sx, y values by sy
            for index, factor in enumerate((sx, sy, sx, sy)):
                rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP not in self:
        return

    viewport = self[PG.VP]
    is_viewport_array = isinstance(viewport, ArrayObject)
    if is_viewport_array:
        bbox = viewport[0]["/BBox"]
    else:
        bbox = viewport["/BBox"]  # type: ignore
    scaled_bbox = RectangleObject(
        (
            float(bbox[0]) * sx,
            float(bbox[1]) * sy,
            float(bbox[2]) * sx,
            float(bbox[3]) * sy,
        )
    )
    target = self[NameObject(PG.VP)]
    if is_viewport_array:
        target = target[NumberObject(0)]  # type: ignore
    target[NameObject("/BBox")] = scaled_bbox  # type: ignore
def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor, applied to both the X and the Y axis.

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)
def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The scaling factors are the requested dimensions divided by the
    current width and height of the mediabox.

    Args:
        width: The new width.
        height: The new height.

    """
    current_width = float(self.mediabox.width)
    current_height = float(self.mediabox.height)
    self.scale(width / current_width, height / current_height)
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Passed on to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the reference of the content stream nor through
            ``replace_contents``.

    """
    stream = self.get_contents()
    if stream is None:
        return

    compressed = stream.flate_encode(level)
    try:
        # Overwrite the stored object behind the reference of the stream.
        stream.indirect_reference.pdf._objects[  # type: ignore
            stream.indirect_reference.idnum - 1  # type: ignore
        ] = compressed
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        The index of this page in the ``pages`` of the document behind its
        indirect reference; None if the page has no indirect reference or
        is not found in that list.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None
def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    Returns:
        One line per content stream operation, a separator line, and then
        for each font of the page resources its name, its representation
        and, where available, its ``/Encoding`` and ``/ToUnicode`` data;
        "No Font" if a lookup of the fonts raises a KeyError.

    """
    parts: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # Only TJ operations contribute their string operands.
        shown = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        parts.append(op.decode("utf-8") + " " + "".join(shown) + repr(ope) + "\n")
    parts.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            parts.append(fo + "\n")
            parts.append(repr(self[PG.RESOURCES]["/Font"][fo]) + "\n")  # type:ignore
            try:
                encoding = self[PG.RESOURCES]["/Font"][fo]["/Encoding"]  # type:ignore
                parts.append(repr(encoding) + "\n")
            except Exception:
                pass
            try:
                to_unicode = self[PG.RESOURCES]["/Font"][fo]["/ToUnicode"]  # type:ignore
                parts.append(to_unicode.get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        parts.append("No Font\n")
    return "".join(parts)
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by processing the operations of its content stream.

    See extract_text for most arguments.

    Args:
        obj: the object to extract the text from; its ``/Resources`` are
            looked up on the object itself or on its ``/Parent`` chain
        pdf: passed to ``ContentStream`` when the content has to be parsed
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = "/Contents"

    Returns:
        The extracted text; an empty string if no resources can be found
        or if no content can be read.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    # Step 1: locate the resource dictionary.
    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    # Step 2: build a Font for every entry of the /Font resources.
    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                # Fonts failing with these errors are skipped.
                pass

    # Step 3: get the content stream, parsing it if required.
    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    # Step 4: feed every operation to the extractor. Some operators are
    # split into several simpler operations before being processed.
    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Processed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Processed as Tw and Tc (first two operands), then T* and Tj.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    # Strings are shown with Tj; a number of sufficient
                    # magnitude adds a space, unless the text already ends with one.
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Processed as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Terminate the output with a newline; an empty output
                # raises the IndexError ignored below.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                # Every XObject which is not an image gets its text extracted.
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and memorize the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The ``/Font`` resources of this page and of all its ``/Parent`` objects
    are collected. If a font name is defined on several levels, the
    definition closest to the page is used.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    objr: Any = self
    fonts: dict[str, Font] = {}
    # Objects already processed; stops on a /Parent chain referring back
    # to itself instead of looping forever.
    visited: list[Any] = []
    while objr is not None and not any(objr is seen for seen in visited):
        visited.append(objr)
        try:
            resources_dict: Any = objr[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            font_dict = resources_dict["/Font"]
            for font_name in font_dict:
                # The walk goes from the page towards the root: a font
                # already known comes from a closer level and is kept.
                if font_name not in fonts:
                    fonts[font_name] = Font.from_font_resource(font_dict[font_name])
        try:
            objr = objr["/Parent"].get_object()
        except KeyError:
            objr = None

    return fonts
def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string if the page has no (or null) ``/Contents`` or no text
        show operations.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(fonts, indent=2, default=asdict),
            "utf-8"
        )

    # /Contents is optional: a page without (or with null) contents has no
    # text, which is reported the same way as in "plain" mode.
    if "/Contents" not in self or is_null_or_none(self["/Contents"]):
        return ""

    ops = iter(
        ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
    )
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
1915 def extract_text(
1916 self,
1917 *args: Any,
1918 orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
1919 space_width: float = 200.0,
1920 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1921 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1922 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1923 extraction_mode: Literal["plain", "layout"] = "plain",
1924 **kwargs: Any,
1925 ) -> str:
1926 """
1927 Locate all text drawing commands, in the order they are provided in the
1928 content stream, and extract the text.
1930 This works well for some PDF files, but poorly for others, depending on
1931 the generator used. This will be refined in the future.
1933 Do not rely on the order of text coming out of this function, as it
1934 will change if this function is made more sophisticated.
1936 Arabic and Hebrew are extracted in the correct order.
1937 If required a custom RTL range of characters can be defined;
1938 see function set_custom_rtl.
1940 Additionally you can provide visitor methods to get informed on all
1941 operations and all text objects.
1942 For example in some PDF files this can be useful to parse tables.
1944 Args:
1945 orientations: list of orientations extract_text will look for
1946 default = (0, 90, 180, 270)
1947 note: currently only 0 (up),90 (turned left), 180 (upside down),
1948 270 (turned right)
1949 Silently ignored in "layout" mode.
1950 space_width: force default space width
1951 if not extracted from font (default: 200)
1952 Silently ignored in "layout" mode.
1953 visitor_operand_before: function to be called before processing an operation.
1954 It has four arguments: operator, operand-arguments,
1955 current transformation matrix and text matrix.
1956 Ignored with a warning in "layout" mode.
1957 visitor_operand_after: function to be called after processing an operation.
1958 It has four arguments: operator, operand-arguments,
1959 current transformation matrix and text matrix.
1960 Ignored with a warning in "layout" mode.
1961 visitor_text: function to be called when extracting some text at some position.
1962 It has five arguments: text, current transformation matrix,
1963 text matrix, font-dictionary and font-size.
1964 The font-dictionary may be None in case of unknown fonts.
1965 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
1966 Ignored with a warning in "layout" mode.
1967 extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
1968 "layout" for experimental layout mode functionality.
1969 NOTE: orientations, space_width, and visitor_* parameters are NOT respected
1970 in "layout" mode.
1972 kwargs:
1973 layout_mode_space_vertically (bool): include blank lines inferred from
1974 y distance + font height. Defaults to True.
1975 layout_mode_scale_weight (float): multiplier for string length when calculating
1976 weighted average character width. Defaults to 1.25.
1977 layout_mode_strip_rotated (bool): layout mode does not support rotated text.
1978 Set to False to include rotated text anyway. If rotated text is discovered,
1979 layout will be degraded and a warning will result. Defaults to True.
1980 layout_mode_debug_path (Path | None): if supplied, must target a directory.
1981 creates the following files with debug information for layout mode
1982 functions if supplied:
1984 - fonts.json: output of self._layout_mode_fonts
1985 - tjs.json: individual text render ops with corresponding transform matrices
1986 - bts.json: text render ops left justified and grouped by BT/ET operators
1987 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1988 layout_mode_font_height_weight (float): multiplier for font height when calculating
1989 blank lines. Defaults to 1.
1991 Returns:
1992 The extracted text
1994 """
1995 if extraction_mode not in ["plain", "layout"]:
1996 raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
1997 if extraction_mode == "layout":
1998 for visitor in (
1999 "visitor_operand_before",
2000 "visitor_operand_after",
2001 "visitor_text",
2002 ):
2003 if locals()[visitor]:
2004 logger_warning(
2005 f"Argument {visitor} is ignored in layout mode",
2006 __name__,
2007 )
2008 return self._layout_mode_text(
2009 space_vertically=kwargs.get("layout_mode_space_vertically", True),
2010 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
2011 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
2012 debug_path=kwargs.get("layout_mode_debug_path"),
2013 font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
2014 )
2015 if len(args) >= 1:
2016 if isinstance(args[0], str):
2017 if len(args) >= 3:
2018 if isinstance(args[2], (tuple, int)):
2019 orientations = args[2]
2020 else:
2021 raise TypeError(f"Invalid positional parameter {args[2]}")
2022 if len(args) >= 4:
2023 if isinstance(args[3], (float, int)):
2024 space_width = args[3]
2025 else:
2026 raise TypeError(f"Invalid positional parameter {args[3]}")
2027 elif isinstance(args[0], (tuple, int)):
2028 orientations = args[0]
2029 if len(args) >= 2:
2030 if isinstance(args[1], (float, int)):
2031 space_width = args[1]
2032 else:
2033 raise TypeError(f"Invalid positional parameter {args[1]}")
2034 else:
2035 raise TypeError(f"Invalid positional parameter {args[0]}")
2037 if isinstance(orientations, int):
2038 orientations = (orientations,)
2040 return self._extract_text(
2041 self,
2042 self.pdf,
2043 orientations,
2044 space_width,
2045 PG.CONTENTS,
2046 visitor_operand_before,
2047 visitor_operand_after,
2048 visitor_text,
2049 )
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The work is delegated to ``self._extract_text``, the same routine
    ``extract_text`` calls in "plain" mode, with ``xform`` passed as the
    object to read instead of the page itself.

    Args:
        xform: the XObject stream to extract the text from.
        orientations: orientation, or tuple of orientations, of the text to
            extract. ``extract_text`` documents only 0 (up), 90 (turned left),
            180 (upside down) and 270 (turned right) as supported values; the
            default covers all four. A single int is wrapped into a
            one-element tuple, as ``extract_text`` does.
        space_width: force default space width
            if not extracted from font (default: 200)
        visitor_operand_before: function to be called before processing an
            operation; forwarded unchanged to ``self._extract_text``.
        visitor_operand_after: function to be called after processing an
            operation; forwarded unchanged to ``self._extract_text``.
        visitor_text: function to be called when extracting some text at some
            position; forwarded unchanged to ``self._extract_text``.

    Returns:
        The extracted text

    """
    # Mirror extract_text: accept a bare int as a single orientation.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        # extract_text passes PG.CONTENTS in this position; here no key is
        # given. NOTE(review): presumably because the XObject stream itself
        # holds the content - confirm against _extract_text.
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Report which fonts used by this page are embedded and which are not.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    # The walker fills and hands back the two sets it is given.
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    # Whatever is used but not embedded is, by definition, unembedded.
    return embedded_fonts, all_fonts - embedded_fonts
# Page boundary boxes. Each accessor is built by _create_rectangle_accessor
# from the box's own dictionary key and a tuple of other box keys.
# NOTE(review): the second argument presumably lists the keys to fall back to,
# in order, when the box itself is absent (this agrees with the "Default value"
# wording in the cropbox docstring) - confirm against
# _create_rectangle_accessor.

# /MediaBox: no other box key is supplied.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

# /CropBox: paired with the media box.
cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# /BleedBox, /TrimBox and /ArtBox are all paired with the crop box first and
# the media box second.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""
@property
def annotations(self) -> Optional[ArrayObject]:
    """Return the page's ``/Annots`` array, or ``None`` if the page has none."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Replace or remove the annotations array of the page.

    Passing ``None`` removes the ``/Annots`` entry (a no-op when the page
    has none); any other value is stored as the new ``/Annots`` entry.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]
class _VirtualList(Sequence[PageObject]):
    """
    Lazy, list-like view over pages.

    Neither the length nor the pages are stored here: both are obtained on
    demand through the two callables handed to the constructor.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page at a given non-negative position.

        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page (int index) or a new lazy view (slice).

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an int index is out of range.

        """
        if isinstance(index, slice):
            selected = range(*index.indices(len(self)))
            # The sub-view resolves its positions through this view on access.
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        total = len(self)
        # Negative positions count from the end.
        position = index + total if index < 0 else index
        if position < 0 or position >= total:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page (int index) or several pages (slice) from the page tree.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an int index is out of range.
            PdfReadError: if the page is not listed in its parent's ``/Kids``.

        """
        if isinstance(index, slice):
            # Pages have to be deleted from last to first.
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        total = len(self)
        if index < 0:
            # Negative positions count from the end.
            index += total
        if index < 0 or index >= total:
            raise IndexError("Index out of range")

        node_ref = self[index].indirect_reference
        assert node_ref is not None
        parent: Optional[PdfObject] = cast(
            DictionaryObject, node_ref.get_object()
        ).get("/Parent", None)
        nothing_removed = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                kids = cast(ArrayObject, parent["/Kids"])
                del kids[kids.index(node_ref)]
                nothing_removed = False
                try:
                    assert node_ref is not None
                    # case of page in a Reader
                    del node_ref.pdf.flattened_pages[index]
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(kids) == 0:
                    # No more objects in this part of this subtree:
                    # continue one level up with this node as the target.
                    node_ref = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # raised by kids.index()
                if nothing_removed:
                    raise PdfReadError(f"Page not found in page tree: {node_ref}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        # The length is sampled once, when iteration starts.
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return "[" + labels + "]"
def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Starting at ``obj``, the walk visits the fonts under ``/DR`` and
    ``/Resources``, then recurses into the XObjects of ``/Resources``, the
    entries of ``/Annots`` and the normal appearance (``/AP`` -> ``/N``).

    Args:
        obj: dictionary to inspect (a page, XObject or annotation dictionary,
            i.e. the object that holds ``/Resources``, ``/Annots``, ...)
        fnt: set of fonts used; updated in place
        emb: set of embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb) - the same two sets that were passed in

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font is counted as embedded when it has a '/CharProcs' entry, or when its
    '/FontDescriptor' (or the '/FontDescriptor' of its first descendant font)
    has a key called 'FontFilex' (where x is empty, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether ``font`` has a /FontDescriptor holding a font file key."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            # Composite font: the font file hangs off the first descendant.
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = has_font_file(descendant)

        if is_embedded:
            # Fonts without /BaseFont are recorded by subtype, in parentheses.
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast(DictionaryObject, obj["/DR"])
        if "/Font" in default_resources:
            # Iterate the values: iterating the dictionary itself yields only
            # the resource names, never the font dictionaries.
            for f in cast(DictionaryObject, default_resources["/Font"]).values():
                process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # /N is a sub-dictionary here: walk each of its values (again the
            # values, not the keys).
            for state in normal.values():
                appearance = state.get_object()
                # Skip anything that is not a dictionary (streams included)
                # rather than failing on it.
                if isinstance(appearance, DictionaryObject):
                    _get_fonts_walk(appearance, fnt, emb)
    return fnt, emb  # return the sets for each page