Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from collections.abc import Iterable, Iterator, Sequence
32from copy import deepcopy
33from dataclasses import asdict, dataclass
34from decimal import Decimal
35from io import BytesIO
36from pathlib import Path
37from typing import (
38 Any,
39 Callable,
40 Literal,
41 Optional,
42 Union,
43 cast,
44 overload,
45)
47from ._font import Font
48from ._protocols import PdfCommonDocProtocol
49from ._text_extraction import (
50 _layout_mode,
51)
52from ._text_extraction._text_extractor import TextExtraction
53from ._utils import (
54 CompressedTransformationMatrix,
55 TransformationMatrixType,
56 _human_readable_bytes,
57 deprecate,
58 logger_warning,
59 matrix_multiply,
60)
61from .constants import (
62 _INLINE_IMAGE_KEY_MAPPING,
63 _INLINE_IMAGE_VALUE_MAPPING,
64 AnnotationDictionaryAttributes,
65 ImageAttributes,
66)
67from .constants import PageAttributes as PG
68from .constants import Resources as RES
69from .errors import PageSizeNotDefinedError, PdfReadError
70from .generic import (
71 ArrayObject,
72 ContentStream,
73 DictionaryObject,
74 EncodedStreamObject,
75 FloatObject,
76 IndirectObject,
77 NameObject,
78 NullObject,
79 NumberObject,
80 PdfObject,
81 RectangleObject,
82 StreamObject,
83 is_null_or_none,
84)
86try:
87 from PIL.Image import Image
89 pil_not_imported = False
90except ImportError:
91 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10
92 pil_not_imported = True # error will be raised only when using images
94MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to ``defaults``.

    The resolved value is converted to a :class:`RectangleObject` and written
    back under ``name`` (through ``_set_rectangle``), so the next lookup hits
    the ``RectangleObject`` fast path.

    Args:
        self: Mapping-like object providing ``get``; its ``pdf`` attribute is
            used to resolve an indirect reference.
        name: Key of the requested rectangle.
        defaults: Keys tried in order when the entry for ``name`` is missing
            or null.

    Returns:
        The rectangle as a ``RectangleObject``.

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # Use the same "missing" test as for the primary key: a fallback
            # that is present but null must not stop the search.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Tolerate over-long arrays: warn and keep the first four values.
        logger_warning(
            "Expected four values, got %(length)d: %(retval)s",
            source=__name__,
            length=length,
            retval=retval,
        )
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore[arg-type]
    _set_rectangle(self, name, retval)
    return retval
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under ``name`` wrapped as a ``NameObject``."""
    key = NameObject(name)
    self[key] = value
126def _delete_rectangle(self: Any, name: str) -> None:
127 del self[name]
130def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
131 return property(
132 lambda self: _get_rectangle(self, name, fallback),
133 lambda self, value: _set_rectangle(self, name, value),
134 lambda self: _delete_rectangle(self, name),
135 )
class Transformation:
    """
    Affine transformation of the 2D plane, stored as six numbers.

    A mapping between two coordinate systems is described by a 3-by-3 matrix
    whose last column is fixed::

        a b 0
        c d 0
        e f 1

    Only ``a`` to ``f`` can vary, so in PDF the matrix is usually written as
    the six-element array ``[ a b c d e f ]``; this is the form kept in
    ``ctm``.

    A point is transformed by multiplying its row vector with the matrix::

                                  a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                  e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand ``ctm`` into the full matrix, as a tuple of rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a full transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of rows.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Content stream text of the ``cm`` operation for this transformation,
        # each of the six values being written with four decimals.
        numbers = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return numbers + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: Offset added along the x-axis.
            ty: Offset added along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d, e, f = (self.ctm[position] for position in range(6))
        return Transformation(ctm=(a, b, c, d, e + tx, f + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin is typically the lower-left corner of the page; translate
        the contents / the page boxes to scale around another point.

        Args:
            sx: The scale factor along the x-axis; defaults to ``sy``.
            sy: The scale factor along the y-axis; defaults to ``sx``.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        # A missing factor takes the value of the other one (uniform scaling).
        horizontal = sy if sx is None else sx
        vertical = sx if sy is None else sy
        assert horizontal is not None
        assert vertical is not None
        op: TransformationMatrixType = (
            (horizontal, 0, 0),
            (0, vertical, 0),
            (0, 0, 1),
        )
        return Transformation(Transformation.compress(matrix_multiply(self.matrix, op)))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        op: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(Transformation.compress(matrix_multiply(self.matrix, op)))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this transformation.

        Args:
            pt: The point (x, y), as a tuple or a list.
            as_object: If True, the coordinates are returned as FloatObject,
                otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` is a list,
            a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x, y = float(pt[0]), float(pt[1])
        moved = (
            convert(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4]),
            convert(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5]),
        )
        if isinstance(pt, list):
            return list(moved)
        return moved
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _with_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with the text from its last dot onwards replaced by ``extension``.

        A name without any dot is kept in full and ``extension`` is appended,
        instead of losing its last character.
        """
        stem, dot, _old_extension = name.rpartition(".")
        return (stem if dot else name) + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a PDF in memory, the image object of that
        PDF takes the place of the current one in the owning document, and
        ``name``, ``data`` and ``image`` of this object are refreshed.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline (no indirect reference).
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # The presence of ``_id_translated`` is what identifies a writer here.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow produce a PDF and take the image object out of it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        self.name = self._with_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data inserted before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of identifiers is obtained from ``ids_function`` on demand and
    each image is built by ``get_function`` when it is requested.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers are strings or lists, which ``__getitem__`` hands over
        # to ``get_function`` unchanged; calling it directly avoids computing
        # the identifier list again for every image.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        A slice gives a new list-like object restricted to the selected
        identifiers; a string, list or tuple is passed to ``get_function`` as
        an identifier; an integer is a position, negative values counting
        from the end.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer position is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Reuse the identifiers just computed instead of asking for them
            # a second time through ``len(self)``.
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # One call to ``ids_function`` for the whole iteration; going through
        # ``self[i]`` would compute the identifier list once per image.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
489class PageObject(DictionaryObject):
490 """
491 PageObject represents a single page within a PDF file.
493 Typically these objects will be created by accessing the
494 :attr:`pages<pypdf.PdfReader.pages>` property of the
495 :class:`PdfReader<pypdf.PdfReader>` class, but it is
496 also possible to create an empty page with the
497 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
499 Args:
500 pdf: PDF file the page belongs to.
501 indirect_reference: Stores the original indirect reference to
502 this object in its source PDF
504 """
506 original_page: "PageObject" # very local use in writer when appending
def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the entries of the referenced object if any.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object in its source PDF;
            when it is neither ``None`` nor null, the entries of the object
            it points to are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Inline images are loaded on first use.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source)
def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the ``DictionaryObject`` type is used in the hashed value (rather
    than the page type) so that the result is the same as for a
    DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))
def hash_value_data(self) -> bytes:
    """Return the inherited hash data followed by ``id(self)`` as encoded text."""
    inherited = super().hash_value_data()
    return inherited + str(id(self)).encode()
@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The value is expressed in multiples of 1/72 inch: 1 means that a user
    space unit is 1/72 inch, 3 means that it is 3/72 inch. When the page has
    no such entry, 1 is returned.
    """
    value = self.get(PG.USER_UNIT, 1)
    return cast(float, value)
@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Entries of a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        last_page = pdf.pages[len(pdf.pages) - 1]
        width = last_page.mediabox.width
        height = last_page.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore[arg-type]
    page[NameObject(PG.MEDIABOX)] = media_box
    return page
def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects found directly in the resources are identified by their
    name; images found inside other stream XObjects (forms) are identified by
    the list of names leading to them. The identifiers of the inline images
    of the page are added once, by the top-level call only.

    Args:
        obj: Object whose resources are scanned; the page itself if ``None``.
        ancest: Names of the XObjects traversed to reach ``obj``.
        call_stack: Indirect references already visited, used to stop on
            XObjects referencing each other.

    Returns:
        The identifiers, usable as keys of :attr:`images`.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: stop here to avoid endless recursion.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    # Inline images are loaded from the page contents, hence they are
    # reported for the page only. Adding them at every recursion level as
    # well would list each of them several times.
    top_level = obj is None or obj is self
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    inline_ids: list[Union[str, list[str]]] = (
        list(self.inline_images.keys())
        if top_level and self.inline_images is not None
        else []
    )
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore[index]
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][ImageAttributes.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst
def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: Name of the image, or sequence of names leading to it through
            nested XObjects. A string starting and ending with ``~``
            designates an inline image.
        obj: Object whose XObject resources are searched; the page itself
            if ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If XObject resources are required but not available, or if
            the image cannot be found.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]

    def is_inline_key() -> bool:
        return id[0] == "~" and id[-1] == "~"

    xobjs: Optional[DictionaryObject] = None
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError as exc:
        # Only inline images can be served without XObject resources.
        if not is_inline_key():
            raise KeyError(
                f"Cannot access image object {id} without XObject resources"
            ) from exc

    if not isinstance(id, str):
        # in a subobject: descend into the first name and look for the rest
        assert xobjs is not None
        ids = id[1:]
        return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

    if is_inline_key():
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    assert xobjs is not None
    from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
    extension, byte_stream, image = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=image,
        indirect_reference=xobjs[id].indirect_reference,
    )
@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    The images of the page can be accessed with one of these keys:
        - An integer (position in the list)
        - A string (name of an image at the top level)
        - A tuple (names leading to an image within XObject forms)

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`    # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`    # loops through all objects

    images.keys() and images.items() can be used.

    Each element is an ImageFile with the following properties:

        * `.name` : name of the object
        * `.data` : bytes of the object
        * `.image` : PIL Image Object
        * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )
def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image.

    Values found in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the name
    they map to. Any other name is looked up in the ``/ColorSpace`` resources
    of the page; values which are not names are returned unchanged.

    Raises:
        PdfReadError: If a name cannot be found in the ``/ColorSpace``
            resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # raised by either of the two lookups
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page contents.

    Returns:
        The images keyed by ``~0~``, ``~1~``, ... in the order they are met
        in the contents; an empty dictionary if the page has no contents.

    Raises:
        PdfReadError: If a raw ``BI``, ``ID`` or ``EI`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect settings and data of each inline image.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append(
                {"settings": param["settings"], "__streamdata__": param["data"]}
            )
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object, then into an image.
    files = {}
    for num, entry in enumerate(collected):
        stream_data = entry["__streamdata__"]
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for raw_key, raw_value in entry["settings"].items():
            if raw_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(raw_key, x) for x in raw_value]
                )
            else:
                value = self._translate_value_inline_image(raw_key, raw_value)
            key = NameObject(_INLINE_IMAGE_KEY_MAPPING[raw_key])
            # The first value met for a key is the one kept.
            init.setdefault(key, value)
        entry["object"] = EncodedStreamObject.initialize_from_dictionary(init)
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(entry["object"])
        identifier = f"~{num}~"
        files[identifier] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    Valid values are the multiples of 90 degrees: 0, 90, 180 or 270.
    Changing this property does not affect ``/Contents``. A page without
    rotation entry reports 0.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # The value is truncated to an integer, rounded to the nearest multiple
    # of 90 and reduced to the range 0-270.
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)
def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    After the call, the rotation of the page is 0.

    It is recommended to apply this function before page merging.
    """

    def extent(operation: Transformation, box: RectangleObject) -> tuple[Any, Any, Any, Any]:
        # Bounds (min x, min y, max x, max y) of the transformed lower-left
        # and upper-right corners of the box.
        first = operation.apply_on(box.lower_left)
        second = operation.apply_on(box.upper_right)
        return (
            min(first[0], second[0]),
            min(first[1], second[1]),
            max(first[0], second[0]),
            max(first[1], second[1]),
        )

    angle = -self.rotation  # rotation to apply is in the other way
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)
    # Rotate around the center of the media box...
    center_x = float(media_box.left + media_box.width / 2)
    center_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)
    # ... then move the lowest transformed corner coordinates to the origin.
    low_x, low_y, _, _ = extent(trsf, media_box)
    trsf = trsf.translate(-low_x, -low_y)
    self.add_transformation(trsf, False)
    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        current = RectangleObject(self[box_name])  # type: ignore[arg-type]
        self[NameObject(box_name)] = RectangleObject(extent(trsf, current))
def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self
def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of two resource dictionaries.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries to add.
        resource: Key of the resource category to merge.
        new_res1: If True, the result is a new dictionary initialised from
            ``res1[resource]``; otherwise ``res1[resource]`` is completed
            in place.

    Returns:
        A tuple (merged entries sorted by key, renaming table). The renaming
        table maps each key of ``res2`` that had to be stored under another
        name to that new name.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # The key itself (e.g. "foo") is tried first, then "foo-0", "foo-1",
        # etc. As new_res holds finitely many keys, a free one is always
        # found in the end.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # a resource of this name with the exact same value exists
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # this entry is stored under a different name
            rename_res[key] = newname

        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary with its entries sorted.
    ordered = sorted(new_res.items())
    new_res.clear()
    for entry_key, entry_value in ordered:
        new_res[entry_key] = entry_value
    return new_res, rename_res
@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Rename the names used as operands in a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from current names to new names.
        pdf: Document passed to the ``ContentStream`` being created.

    Returns:
        ``stream`` itself if ``rename`` is empty, otherwise a new
        ``ContentStream`` with the operands renamed.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dictionary.

    """
    if not rename:
        return stream
    renamed = ContentStream(stream, pdf)
    for operands, _operator in renamed.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return renamed
@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Return ``contents`` as a content stream starting with a ``cm`` operation.

    The operands of the inserted operation are the values of ``ctm``
    converted to ``FloatObject``.
    """
    content_stream = ContentStream(contents, pdf)
    operations = content_stream.operations
    operations.insert(0, ([FloatObject(value) for value in ctm], b"cm"))
    return content_stream
def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list, the data of all its elements are
    concatenated in order.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist
        or is null.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    # Same rule as in ``get_contents``: a null entry means "no contents",
    # it provides no data to read.
    if is_null_or_none(obj):
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()
def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped in a ``ContentStream``, or ``None``
        if it does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # no usable indirect reference: the stream is built without document
        pdf = None
    obj = self[PG.CONTENTS]
    if is_null_or_none(obj):
        return None
    return ContentStream(obj.get_object(), pdf)
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without indirect reference, the content is stored directly
    in the page. Otherwise the document of the indirect reference is used
    to register or replace the objects; a deprecation warning is emitted
    when this document is not a ``PdfWriter``.

    The cached inline images are reset, except on the two early returns
    (page without indirect reference; ``None`` content while the page has
    no contents).

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    writer = self.indirect_reference.pdf
    # Existing contents given as an array: replace each element by a null
    # object in the document.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # New contents given as an array: register each element and keep what
    # ``_add_object`` returns for it.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Removal: nullify the current contents object and drop the entry.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No current contents object to reuse: register the new content.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # The new content takes over the indirect reference of the current
        # contents object.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) of both pages are kept, and the
    mediabox, cropbox, etc. of this page stay untouched unless ``expand``
    is set. By default the content stream of ``page2`` is added after this
    page's content stream, so it is drawn last, i.e. "on top" of this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation is involved here; only ordering and expansion are
    # forwarded to the shared implementation.
    self._merge_page(page2, expand=expand, over=over)
def _merge_page(
    self,
    page2: "PageObject",
    page2_transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When this page is referenced from an object exposing ``_add_object``
    (used to detect a PdfWriter), the work is delegated to
    ``_merge_page_writer``. Otherwise annotations, resources and content
    streams are merged here.

    Args:
        page2: The page to be merged into this one.
        page2_transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix, forwarded to the media box expansion.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: Whether to expand the media box to also cover ``page2``.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(self.indirect_reference.pdf, "_add_object"):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2_transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename: dict[str, Any] = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2_resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)
    self[NameObject(PG.ANNOTS)] = new_annots

    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        new, new_resource_name = self._merge_resources(
            original_resources, page2_resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(new_resource_name)

    # Combine /ProcSet sets, making sure there is a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2_resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2_content = page2.get_contents()
    if page2_content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" so that page2 is clipped to the box selected by
        # MERGE_CROP_BOX. The operands are stored as a list rather than a
        # one-shot ``map`` iterator, so the operation keeps its operands
        # however often the operations are iterated or serialized.
        page2_content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2_content.operations.insert(1, ([], b"W"))
        page2_content.operations.insert(2, ([], b"n"))
        if page2_transformation is not None:
            page2_content = page2_transformation(page2_content)
        page2_content = PageObject._content_stream_rename(
            page2_content, rename, self.pdf
        )
        page2_content.isolate_graphics_state()
        if over:
            new_content_array.append(page2_content)
        else:
            new_content_array.insert(0, page2_content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources

    return None
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources and ``/ProcSet`` entries of ``page2`` are merged into this
    page's resource dictionary, the annotations of ``page2`` are cloned
    into ``self.indirect_reference.pdf`` and appended to this page, and
    the content streams are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix applied to the annotation rectangles
            and quad points, and forwarded to the media box expansion.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: Whether to expand the media box to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    rename = {}
    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if not is_null_or_none(page2.get(PG.ANNOTS, None)):
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # The rectangle is rebuilt from two transformed opposite corners;
            # min/max keep it normalized whatever the transformation does.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 numbers per quadrilateral and may
                # describe several of them: transform every (x, y) pair
                # instead of only the first four points.
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    coordinate
                    for index in range(0, len(q) - 1, 2)
                    for coordinate in trsf.apply_on((q[index], q[index + 1]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" so that page2 is clipped to the box selected by
        # MERGE_CROP_BOX. The operands are stored as a list rather than a
        # one-shot ``map`` iterator, so the operation keeps its operands
        # however often the operations are iterated or serialized.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
1332 def _expand_mediabox(
1333 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
1334 ) -> None:
1335 corners1 = (
1336 self.mediabox.left.as_numeric(),
1337 self.mediabox.bottom.as_numeric(),
1338 self.mediabox.right.as_numeric(),
1339 self.mediabox.top.as_numeric(),
1340 )
1341 corners2 = (
1342 page2.mediabox.left.as_numeric(),
1343 page2.mediabox.bottom.as_numeric(),
1344 page2.mediabox.left.as_numeric(),
1345 page2.mediabox.top.as_numeric(),
1346 page2.mediabox.right.as_numeric(),
1347 page2.mediabox.top.as_numeric(),
1348 page2.mediabox.right.as_numeric(),
1349 page2.mediabox.bottom.as_numeric(),
1350 )
1351 if ctm is not None:
1352 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1353 new_x = tuple(
1354 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
1355 for i in range(0, 8, 2)
1356 )
1357 new_y = tuple(
1358 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
1359 for i in range(0, 8, 2)
1360 )
1361 else:
1362 new_x = corners2[0:8:2]
1363 new_y = corners2[1:8:2]
1364 lowerleft = (min(new_x), min(new_y))
1365 upperright = (max(new_x), max(new_y))
1366 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))
1367 upperright = (
1368 max(corners1[2], upperright[0]),
1369 max(corners1[3], upperright[1]),
1370 )
1372 self.mediabox.lower_left = lowerleft
1373 self.mediabox.upper_right = upperright
def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is accepted
            as well, in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def apply_matrix(page2_content: Any) -> ContentStream:
        # Prefix the content of page2 with the transformation matrix.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, apply_matrix, matrix, over, expand)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )
def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the media box is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Corners of the media box: lower-left, upper-left, upper-right, lower-right.
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    box_corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    coefficients = tuple(float(value) for value in matrix)
    transformed_x = [
        coefficients[0] * x + coefficients[2] * y + coefficients[4]
        for x, y in box_corners
    ]
    transformed_y = [
        coefficients[1] * x + coefficients[3] * y + coefficients[5]
        for x, y in box_corners
    ]

    self.mediabox.lower_left = (min(transformed_x), min(transformed_y))
    self.mediabox.upper_right = (max(transformed_x), max(transformed_y))
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the content, this rescales the page boundaries (bleedbox,
    trimbox, artbox, cropbox, mediabox), the ``/Rect`` of the annotations
    and the ``/BBox`` of the (first) viewport.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # The boxes are rescaled in a fixed order, each from its own current value.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if AnnotationDictionaryAttributes.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[AnnotationDictionaryAttributes.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                # x coordinates use sx, y coordinates use sy.
                for position, factor in enumerate((sx, sy, sx, sy)):
                    rectangle[position] = FloatObject(float(rectangle[position]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        is_viewport_array = isinstance(viewport, ArrayObject)
        # For an array of viewports only the first one is updated.
        bbox = viewport[0]["/BBox"] if is_viewport_array else viewport["/BBox"]  # type: ignore[index]
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if is_viewport_array:
            first_viewport = self[NameObject(PG.VP)][NumberObject(0)]  # type: ignore[index]
            first_viewport[NameObject("/BBox")] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore[index]
def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly: the same factor is used for the X and the Y
    axis. The content gets a transformation matrix applied and the page
    size is updated.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)
def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are derived from the current media box size.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Compression level forwarded to ``flate_encode``.

    Raises:
        ValueError: If the compressed content can neither be stored through
            the content's own reference nor through a page attached to an
            object providing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the object the content stream is referenced by.
        content_reference = content.indirect_reference
        content_reference.pdf._objects[  # type: ignore[union-attr]
            content_reference.idnum - 1  # type: ignore[union-attr]
        ] = compressed
    except AttributeError:
        page_reference = self.indirect_reference
        attached = page_reference is not None and hasattr(
            page_reference.pdf, "_add_object"
        )
        if not attached:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of the document it is attached
        to; None if the page is not attached to a PDF or is not found there.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        position = reference.pdf.pages.index(self)
    except ValueError:
        # The page is not part of the document's page list.
        return None
    return int(position)
def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a debugging dump of the page.

    The dump lists every content stream operation (with the ``str``
    elements of ``TJ`` operands joined), then each font resource with its
    representation and, when they can be read, its ``/Encoding`` and
    ``/ToUnicode`` data. ``No Font`` is appended when a font lookup raises
    ``KeyError``.
    """
    chunks: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        shown = (
            [item for item in operands[0] if isinstance(item, str)]
            if operator == b"TJ"
            else []
        )
        chunks.append(
            operator.decode("utf-8") + " " + "".join(shown) + operands.__repr__() + "\n"
        )
    chunks.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            chunks.append(font_name + "\n")
            chunks.append(self[PG.RESOURCES]["/Font"][font_name].__repr__() + "\n")  # type:ignore
            # Encoding and ToUnicode are optional extras: skip them on any failure.
            try:
                encoding_repr = self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                    "/Encoding"
                ].__repr__()
                chunks.append(encoding_repr + "\n")
            except Exception:
                pass
            try:
                to_unicode = (
                    self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                        "/ToUnicode"
                    ]
                    .get_data()
                    .decode()
                )
                chunks.append(to_unicode + "\n")
            except Exception:
                pass
    except KeyError:
        chunks.append("No Font\n")
    return "".join(chunks)
def _extract_text(
    self,
    obj: DictionaryObject,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content stream operations.

    See extract_text for most arguments.

    Args:
        obj: dictionary whose content is processed; its resources are
            looked up with ``get_inherited``.
        pdf: passed to ``ContentStream`` when the content has to be parsed.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = PG.CONTENTS
        known_ids: ``id()`` values of the form XObjects currently being
            extracted, used to detect cyclic references. A new set is
            created when omitted.

    Returns:
        The extracted text; an empty string when there are no resources or
        when no content can be read.

    """
    if known_ids is None:
        known_ids = set()

    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    resources_dict = cast(
        Optional[DictionaryObject],
        obj.get_inherited(key=PG.RESOURCES, default=DictionaryObject())
    )
    if is_null_or_none(resources_dict) or not resources_dict:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        # Collect the font dictionaries and their Font objects by resource name.
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(fonts[font_resource].space_char, 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                # Fonts that cannot be loaded are skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Processed as ``T*`` followed by ``Tj``.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Processed as ``Tw`` and ``Tc`` (first two operands), then ``T*``
            # and ``Tj`` with the remaining operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric element at least as large as the threshold
                    # adds a space, unless the text is empty or already
                    # ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Processed as ``TL`` with the negated second operand, then ``Td``.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline (IndexError: empty output).
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = cast(DictionaryObject, resources_dict["/XObject"])
                xform = cast(EncodedStreamObject, xobj[operands[0]])
                # Images carry no text; everything else is extracted recursively.
                if xform["/Subtype"] != NameObject("/Image"):
                    xform_id = id(xform)
                    if xform_id in known_ids:
                        logger_warning(
                            "Detected cyclic form XObject reference, skipping %(operand)s.",
                            source=__name__,
                            operand=operands[0]
                        )
                        text = ""
                    else:
                        # Mark the XObject as being processed for the duration
                        # of the nested extraction only.
                        known_ids.add(xform_id)
                        try:
                            text = self.extract_xform_text(
                                xform,
                                orientations,
                                space_width,
                                visitor_operand_before,
                                visitor_operand_after,
                                visitor_text,
                                known_ids=known_ids,
                            )
                        finally:
                            known_ids.discard(xform_id)
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                # Any failure while handling the XObject is only logged.
                logger_warning(
                    "Impossible to decode XFormObject %(operand)s: %(exception)s",
                    source=__name__,
                    operand=operands[0],
                    exception=exception,
                )
            finally:
                # Start a fresh text run with the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each ``/Parent`` up the chain are visited; fonts
    found further up replace same-named fonts found earlier.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}

        if "/Font" in resources and self.pdf is not None:
            for name in resources["/Font"]:
                font_resource = resources["/Font"][name]
                collected[name] = Font.from_font_resource(font_resource)

        # Continue with the parent node, if there is one.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected
1884 def _layout_mode_text(
1885 self,
1886 space_vertically: bool = True,
1887 scale_weight: float = 1.25,
1888 strip_rotated: bool = True,
1889 debug_path: Optional[Path] = None,
1890 font_height_weight: float = 1,
1891 ) -> str:
1892 """
1893 Get text preserving fidelity to source PDF text layout.
1895 Args:
1896 space_vertically: include blank lines inferred from y distance + font
1897 height. Defaults to True.
1898 scale_weight: multiplier for string length when calculating weighted
1899 average character width. Defaults to 1.25.
1900 strip_rotated: Removes text that is rotated w.r.t. to the page from
1901 layout mode output. Defaults to True.
1902 debug_path (Path | None): if supplied, must target a directory.
1903 creates the following files with debug information for layout mode
1904 functions if supplied:
1905 - fonts.json: output of self._layout_mode_fonts
1906 - tjs.json: individual text render ops with corresponding transform matrices
1907 - bts.json: text render ops left justified and grouped by BT/ET operators
1908 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1909 Defaults to None.
1910 font_height_weight: multiplier for font height when calculating
1911 blank lines. Defaults to 1.
1913 Returns:
1914 str: multiline string containing page text in a fixed width format that
1915 closely adheres to the rendered layout in the source pdf.
1917 """
1918 fonts = self._layout_mode_fonts()
1919 if debug_path: # pragma: no cover
1920 import json # noqa: PLC0415
1922 debug_path.joinpath("fonts.json").write_text(
1923 json.dumps(fonts, indent=2, default=asdict),
1924 "utf-8"
1925 )
1927 ops = iter(
1928 ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
1929 )
1930 bt_groups = _layout_mode.text_show_operations(
1931 ops, fonts, strip_rotated, debug_path
1932 )
1934 if not bt_groups:
1935 return ""
1937 ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
1939 char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
1941 return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text drawn on this page.

    Text drawing commands are located in the order in which they appear in
    the content stream. The quality of the result depends on the generator
    that produced the PDF, and the order of the returned text is not a
    stable contract: it may change when the extraction is refined.

    Arabic and Hebrew are extracted in the correct order. A custom range of
    RTL characters can be defined with the function set_custom_rtl.

    Visitor callables may be supplied to be informed about every operation
    and every text object, which can for example help when parsing tables.

    Args:
        *args: optional positional form of ``orientations`` and
            ``space_width``. If the first positional value is a tuple or an
            int it is taken as ``orientations`` and the second one, if any,
            as ``space_width``. If the first positional value is a str, the
            first two positional values are not read and the third and
            fourth ones are taken as ``orientations`` and ``space_width``
            (presumably a legacy call signature).
        orientations: orientations that extract_text looks for; either a
            single int or a tuple of ints. Default: (0, 90, 180, 270).
            Currently only 0 (up), 90 (turned left), 180 (upside down) and
            270 (turned right) are meaningful.
            Silently ignored in "layout" mode.
        space_width: default space width to force if it cannot be extracted
            from the font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: called before an operation is processed,
            with four arguments: operator, operand-arguments, current
            transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: called after an operation is processed, with
            four arguments: operator, operand-arguments, current
            transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: called when some text is extracted at some position,
            with five arguments: text, current transformation matrix, text
            matrix, font-dictionary and font-size. The font-dictionary may
            be None for unknown fonts; otherwise it may e.g. contain the key
            "/BaseFont" with the value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode: "plain" for the legacy functionality, "layout" for
            the experimental layout mode. orientations, space_width and the
            visitor_* parameters are NOT respected in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred
            from y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when
            calculating the weighted average character width.
            Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support
            rotated text. Set to False to include rotated text anyway; if
            rotated text is discovered, the layout will be degraded and a
            warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a
            directory. The following files with debug information for the
            layout mode functions are then created:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding
              transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET
              operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord
              (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height
            when calculating blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unsupported type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Layout mode cannot honour visitors: warn once per visitor supplied.
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor_name, callback in supplied_visitors.items():
            if callback:
                logger_warning(
                    "Argument %(visitor)s is ignored in layout mode",
                    source=__name__,
                    visitor=visitor_name,
                )
        layout_options = {
            "space_vertically": kwargs.get("layout_mode_space_vertically", True),
            "scale_weight": kwargs.get("layout_mode_scale_weight", 1.25),
            "strip_rotated": kwargs.get("layout_mode_strip_rotated", True),
            "debug_path": kwargs.get("layout_mode_debug_path"),
            "font_height_weight": kwargs.get("layout_mode_font_height_weight", 1),
        }
        return self._layout_mode_text(**layout_options)

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # The two leading positional values are skipped; orientations
            # and space_width follow them.
            first_slot = 2
        elif isinstance(leading, (tuple, int)):
            first_slot = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > first_slot:
            candidate = args[first_slot]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > first_slot + 1:
            candidate = args[first_slot + 1]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    # A single orientation is normalised to a one-element tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract text from an XObject.

    The work is delegated to ``self._extract_text`` with ``xform`` as the
    object to read and no content key.

    Args:
        xform: the XObject stream to extract the text from.
        orientations: orientations (in degrees) to look for.
            Default: (0, 90, 180, 270), i.e. the same set as the default of
            ``extract_text``. The previous default (0, 90, 270, 360) left
            out upside-down text and listed 360, which is not one of the
            supported orientations.
        space_width: force default space width
            (if not extracted from font; default: 200).
        visitor_operand_before: function to be called before processing an
            operation, forwarded unchanged to ``_extract_text``.
        visitor_operand_after: function to be called after processing an
            operation, forwarded unchanged to ``_extract_text``.
        visitor_text: function to be called when extracting some text,
            forwarded unchanged to ``_extract_text``.
        known_ids: forwarded unchanged to ``_extract_text``.
            NOTE(review): presumably the ids of objects that were already
            processed - confirm against ``_extract_text``.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no content key: the stream itself is read
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
        known_ids=known_ids,
    )
def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Report which fonts used by this page are embedded and which are not.

    The page dictionary is walked with ``_get_fonts_walk``; every font that
    was found but is not part of the embedded set counts as unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts
# Page boundary boxes. Every accessor below is produced by
# _create_rectangle_accessor from the dictionary key of the box and a tuple of
# other box keys.
# NOTE(review): the tuple presumably lists the boxes consulted, in order, when
# the box itself is not set (the cropbox text below states that its default is
# the mediabox) - confirm against _create_rectangle_accessor.

# No other box keys are given for the media box.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

# The crop box is paired with the media box only.
cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# Bleed, trim and art boxes all share the same pair: crop box, then media box.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""
@property
def annotations(self) -> Optional[ArrayObject]:
    """Return the ``/Annots`` array of the page, or None if the page has none."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Replace or remove the annotations array of the page.

    Passing None removes the ``/Annots`` entry (nothing happens if the page
    has no such entry); any other value is stored as the new array.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]
class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages that is computed on demand.

    Nothing is stored in the list itself: the number of pages comes from
    ``length_function`` and a single page from ``get_function``.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: callable returning the current number of pages.
            get_function: callable returning the page at a non-negative index.
        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        """Return the number of pages as reported by ``length_function``."""
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if the (normalised) index is out of range.
        """
        if isinstance(index, slice):
            # The slice is resolved once; the new list maps its own
            # positions back onto this list.
            selected = range(*index.indices(len(self)))
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # support negative indexes
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or a slice of pages, from the page tree.

        The reference of the page is removed from the ``/Kids`` array of its
        parent and the ``/Count`` of that parent, if present, is decremented.
        When a ``/Kids`` array becomes empty, the walk continues one level up
        with the reference of the emptied node.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if the (normalised) index is out of range.
            PdfReadError: if the page is not listed in its parent's ``/Kids``.
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")

        page_ref = self[index].indirect_reference
        assert page_ref is not None
        node: Optional[PdfObject] = cast(
            DictionaryObject, page_ref.get_object()
        ).get("/Parent", None)
        nothing_removed_yet = True
        while node is not None:
            node = cast(DictionaryObject, node.get_object())
            try:
                kids = cast(ArrayObject, node["/Kids"])
                del kids[kids.index(page_ref)]
                nothing_removed_yet = False
                try:
                    assert page_ref is not None
                    del page_ref.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in node:
                    node[NameObject("/Count")] = NumberObject(
                        cast(int, node["/Count"]) - 1
                    )
                if not kids:
                    # No more objects in this part of this subtree
                    page_ref = node.indirect_reference
                    node = node.get("/Parent", None)
            except ValueError:  # from index
                if nothing_removed_yet:
                    raise PdfReadError(f"Page not found in page tree: {page_ref}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        """Yield the pages in order; the length is read once at the start."""
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        """Return a list-like string with one ``PageObject(i)`` entry per page."""
        entries = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return f"[{entries}]"
def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk starts at ``obj`` and follows, recursively, the XObjects of its
    resources, its annotations and the normal appearance of an annotation.

    Args:
        obj: dictionary to inspect (a page, an XObject or an annotation)
        fnt: set of font names found so far; updated in place
        emb: set of embedded font names found so far; updated in place

    Returns:
        A tuple (fnt, emb); these are the very sets that were passed in.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded if it has a '/CharProcs' entry (the glyph
    procedures of a Type 3 font), or if its font descriptor - or the font
    descriptor of its first descendant font - has a key 'FontFilex'
    (where x is null, 2, or 3). An embedded font without 'BaseFont' is
    recorded as its subtype in parentheses.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the font descriptor of ``font`` holds a font program."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def is_embedded(font: DictionaryObject) -> bool:
        """Tell whether ``font`` brings its glyph data along in the file."""
        if "/CharProcs" in font or has_font_file(font):
            return True
        if "/DescendantFonts" in font:
            descendants = cast(ArrayObject, font["/DescendantFonts"])
            # An empty array has no descendant that could be embedded.
            if len(descendants) > 0:
                return has_font_file(
                    cast(DictionaryObject, descendants[0].get_object())
                )
        return False

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        if is_embedded(f):
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No base font name: fall back to the subtype.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        default_fonts = cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        )
        # Iterate the font dictionaries, not the resource names: iterating
        # the mapping itself only yields its keys.
        for f in default_fonts.values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        appearance = cast(DictionaryObject, obj["/AP"])
        if "/N" in appearance:
            normal = cast(DictionaryObject, appearance["/N"])
            # The normal appearance is either a single form XObject or a
            # dictionary mapping appearance states to form XObjects. '/Type'
            # is optional on a form XObject, so '/Subtype' is checked too.
            if normal.get("/Type") == "/XObject" or normal.get("/Subtype") == "/Form":
                _get_fonts_walk(normal, fnt, emb)
            else:
                for state in normal.values():
                    target = state.get_object()
                    # Only dictionaries (streams included) can hold resources.
                    if isinstance(target, DictionaryObject):
                        _get_fonts_walk(target, fnt, emb)
    return fnt, emb  # return the sets for each page