Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from collections.abc import Iterable, Iterator, Sequence
32from copy import deepcopy
33from dataclasses import asdict, dataclass
34from decimal import Decimal
35from io import BytesIO
36from pathlib import Path
37from typing import (
38 Any,
39 Callable,
40 Literal,
41 Optional,
42 Union,
43 cast,
44 overload,
45)
47from ._font import Font
48from ._protocols import PdfCommonDocProtocol
49from ._text_extraction import (
50 _layout_mode,
51)
52from ._text_extraction._text_extractor import TextExtraction
53from ._utils import (
54 CompressedTransformationMatrix,
55 TransformationMatrixType,
56 _human_readable_bytes,
57 deprecate,
58 logger_warning,
59 matrix_multiply,
60)
61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING
62from .constants import AnnotationDictionaryAttributes as ADA
63from .constants import ImageAttributes as IA
64from .constants import PageAttributes as PG
65from .constants import Resources as RES
66from .errors import PageSizeNotDefinedError, PdfReadError
67from .generic import (
68 ArrayObject,
69 ContentStream,
70 DictionaryObject,
71 EncodedStreamObject,
72 FloatObject,
73 IndirectObject,
74 NameObject,
75 NullObject,
76 NumberObject,
77 PdfObject,
78 RectangleObject,
79 StreamObject,
80 is_null_or_none,
81)
83try:
84 from PIL.Image import Image
86 pil_not_imported = False
87except ImportError:
88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10
89 pil_not_imported = True # error will be raised only when using images
91MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"
94def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
95 retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
96 if isinstance(retval, RectangleObject):
97 return retval
98 if is_null_or_none(retval):
99 for d in defaults:
100 retval = self.get(d)
101 if retval is not None:
102 break
103 if isinstance(retval, IndirectObject):
104 retval = self.pdf.get_object(retval)
105 if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
106 logger_warning(f"Expected four values, got {length}: {retval}", __name__)
107 retval = RectangleObject(tuple(retval[:4]))
108 else:
109 retval = RectangleObject(retval) # type: ignore
110 _set_rectangle(self, name, retval)
111 return retval
114def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
115 self[NameObject(name)] = value
118def _delete_rectangle(self: Any, name: str) -> None:
119 del self[name]
122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
123 return property(
124 lambda self: _get_rectangle(self, name, fallback),
125 lambda self, value: _set_rectangle(self, name, value),
126 lambda self: _delete_rectangle(self, name),
127 )
130class Transformation:
131 """
132 Represent a 2D transformation.
134 The transformation between two coordinate systems is represented by a 3-by-3
135 transformation matrix with the following form::
137 a b 0
138 c d 0
139 e f 1
141 Because a transformation matrix has only six elements that can be changed,
142 it is usually specified in PDF as the six-element array [ a b c d e f ].
144 Coordinate transformations are expressed as matrix multiplications::
146 a b 0
147 [ x′ y′ 1 ] = [ x y 1 ] × c d 0
148 e f 1
151 Example:
152 >>> from pypdf import PdfWriter, Transformation
153 >>> page = PdfWriter().add_blank_page(800, 600)
154 >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
155 >>> page.add_transformation(op)
157 """
159 def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
160 self.ctm = ctm
162 @property
163 def matrix(self) -> TransformationMatrixType:
164 """
165 Return the transformation matrix as a tuple of tuples in the form:
167 ((a, b, 0), (c, d, 0), (e, f, 1))
168 """
169 return (
170 (self.ctm[0], self.ctm[1], 0),
171 (self.ctm[2], self.ctm[3], 0),
172 (self.ctm[4], self.ctm[5], 1),
173 )
175 @staticmethod
176 def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
177 """
178 Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).
180 Args:
181 matrix: The transformation matrix as a tuple of tuples.
183 Returns:
184 A tuple representing the transformation matrix as (a, b, c, d, e, f)
186 """
187 return (
188 matrix[0][0],
189 matrix[0][1],
190 matrix[1][0],
191 matrix[1][1],
192 matrix[2][0],
193 matrix[2][1],
194 )
196 def _to_cm(self) -> str:
197 # Returns the cm operation string for the given transformation matrix
198 return (
199 f"{self.ctm[0]:.4f} {self.ctm[1]:.4f} {self.ctm[2]:.4f} "
200 f"{self.ctm[3]:.4f} {self.ctm[4]:.4f} {self.ctm[5]:.4f} cm"
201 )
203 def transform(self, m: "Transformation") -> "Transformation":
204 """
205 Apply one transformation to another.
207 Args:
208 m: a Transformation to apply.
210 Returns:
211 A new ``Transformation`` instance
213 Example:
214 >>> from pypdf import PdfWriter, Transformation
215 >>> height, width = 40, 50
216 >>> page = PdfWriter().add_blank_page(800, 600)
217 >>> op = Transformation((1, 0, 0, -1, 0, height)) # vertical mirror
218 >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0))) # horizontal mirror
219 >>> page.add_transformation(op)
221 """
222 ctm = Transformation.compress(matrix_multiply(self.matrix, m.matrix))
223 return Transformation(ctm)
225 def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
226 """
227 Translate the contents of a page.
229 Args:
230 tx: The translation along the x-axis.
231 ty: The translation along the y-axis.
233 Returns:
234 A new ``Transformation`` instance
236 """
237 m = self.ctm
238 return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty))
240 def scale(
241 self, sx: Optional[float] = None, sy: Optional[float] = None
242 ) -> "Transformation":
243 """
244 Scale the contents of a page towards the origin of the coordinate system.
246 Typically, that is the lower-left corner of the page. That can be
247 changed by translating the contents / the page boxes.
249 Args:
250 sx: The scale factor along the x-axis.
251 sy: The scale factor along the y-axis.
253 Returns:
254 A new Transformation instance with the scaled matrix.
256 """
257 if sx is None and sy is None:
258 raise ValueError("Either sx or sy must be specified")
259 if sx is None:
260 sx = sy
261 if sy is None:
262 sy = sx
263 assert sx is not None
264 assert sy is not None
265 op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
266 ctm = Transformation.compress(matrix_multiply(self.matrix, op))
267 return Transformation(ctm)
269 def rotate(self, rotation: float) -> "Transformation":
270 """
271 Rotate the contents of a page.
273 Args:
274 rotation: The angle of rotation in degrees.
276 Returns:
277 A new ``Transformation`` instance with the rotated matrix.
279 """
280 rotation = math.radians(rotation)
281 op: TransformationMatrixType = (
282 (math.cos(rotation), math.sin(rotation), 0),
283 (-math.sin(rotation), math.cos(rotation), 0),
284 (0, 0, 1),
285 )
286 ctm = Transformation.compress(matrix_multiply(self.matrix, op))
287 return Transformation(ctm)
289 def __repr__(self) -> str:
290 return f"Transformation(ctm={self.ctm})"
292 @overload
293 def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
294 ...
296 @overload
297 def apply_on(
298 self, pt: tuple[float, float], as_object: bool = False
299 ) -> tuple[float, float]:
300 ...
302 def apply_on(
303 self,
304 pt: Union[tuple[float, float], list[float]],
305 as_object: bool = False,
306 ) -> Union[tuple[float, float], list[float]]:
307 """
308 Apply the transformation matrix on the given point.
310 Args:
311 pt: A tuple or list representing the point in the form (x, y).
312 as_object: If True, return items as FloatObject, otherwise as plain floats.
314 Returns:
315 A tuple or list representing the transformed point in the form (x', y')
317 """
318 typ = FloatObject if as_object else float
319 pt1 = (
320 typ(float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4]),
321 typ(float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5]),
322 )
323 return list(pt1) if isinstance(pt, list) else pt1
326@dataclass
327class ImageFile:
328 """
329 Image within the PDF file. *This object is not designed to be built.*
331 This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
332 """
334 name: str = ""
335 """
336 Filename as identified within the PDF file.
337 """
339 data: bytes = b""
340 """
341 Data as bytes.
342 """
344 image: Optional[Image] = None
345 """
346 Data as PIL image.
347 """
349 indirect_reference: Optional[IndirectObject] = None
350 """
351 Reference to the object storing the stream.
352 """
354 def replace(self, new_image: Image, **kwargs: Any) -> None:
355 """
356 Replace the image with a new PIL image.
358 Args:
359 new_image (PIL.Image.Image): The new PIL image to replace the existing image.
360 **kwargs: Additional keyword arguments to pass to `Image.save()`.
362 Raises:
363 TypeError: If the image is inline or in a PdfReader.
364 TypeError: If the image does not belong to a PdfWriter.
365 TypeError: If `new_image` is not a PIL Image.
367 Note:
368 This method replaces the existing image with a new image.
369 It is not allowed for inline images or images within a PdfReader.
370 The `kwargs` parameter allows passing additional parameters
371 to `Image.save()`, such as quality.
373 """
374 if pil_not_imported:
375 raise ImportError(
376 "pillow is required to do image extraction. "
377 "It can be installed via 'pip install pypdf[image]'"
378 )
380 from ._reader import PdfReader # noqa: PLC0415
381 from .generic import DictionaryObject, PdfObject # noqa: PLC0415
382 from .generic._image_xobject import _xobj_to_image # noqa: PLC0415
384 if self.indirect_reference is None:
385 raise TypeError("Cannot update an inline image.")
386 if not hasattr(self.indirect_reference.pdf, "_id_translated"):
387 raise TypeError("Cannot update an image not belonging to a PdfWriter.")
388 if not isinstance(new_image, Image):
389 raise TypeError("new_image shall be a PIL Image")
390 b = BytesIO()
391 new_image.save(b, "PDF", **kwargs)
392 reader = PdfReader(b)
393 page_image = reader.pages[0].images[0]
394 assert page_image.indirect_reference is not None
395 self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
396 page_image.indirect_reference.get_object()
397 )
398 cast(
399 PdfObject, self.indirect_reference.get_object()
400 ).indirect_reference = self.indirect_reference
401 # change the object attributes
402 extension, byte_stream, img = _xobj_to_image(
403 cast(DictionaryObject, self.indirect_reference.get_object()),
404 pillow_parameters=kwargs,
405 )
406 assert extension is not None
407 self.name = self.name[: self.name.rfind(".")] + extension
408 self.data = byte_stream
409 self.image = img
411 def __str__(self) -> str:
412 return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
414 def __repr__(self) -> str:
415 return self.__str__()[:-1] + f", hash: {hash(self.data)})"
418class VirtualListImages(Sequence[ImageFile]):
419 """
420 Provides access to images referenced within a page.
421 Only one copy will be returned if the usage is used on the same page multiple times.
422 See :func:`PageObject.images` for more details.
423 """
425 def __init__(
426 self,
427 ids_function: Callable[[], list[Union[str, list[str]]]],
428 get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
429 ) -> None:
430 self.ids_function = ids_function
431 self.get_function = get_function
432 self.current = -1
434 def __len__(self) -> int:
435 return len(self.ids_function())
437 def keys(self) -> list[Union[str, list[str]]]:
438 return self.ids_function()
440 def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
441 return [(x, self[x]) for x in self.ids_function()]
443 @overload
444 def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
445 ...
447 @overload
448 def __getitem__(self, index: slice) -> Sequence[ImageFile]:
449 ...
451 def __getitem__(
452 self, index: Union[int, slice, str, list[str], tuple[str]]
453 ) -> Union[ImageFile, Sequence[ImageFile]]:
454 lst = self.ids_function()
455 if isinstance(index, slice):
456 indices = range(*index.indices(len(self)))
457 lst = [lst[x] for x in indices]
458 cls = type(self)
459 return cls((lambda: lst), self.get_function)
460 if isinstance(index, (str, list, tuple)):
461 return self.get_function(index)
462 if not isinstance(index, int):
463 raise TypeError("Invalid sequence indices type")
464 len_self = len(lst)
465 if index < 0:
466 # support negative indexes
467 index += len_self
468 if not (0 <= index < len_self):
469 raise IndexError("Sequence index out of range")
470 return self.get_function(lst[index])
472 def __iter__(self) -> Iterator[ImageFile]:
473 for i in range(len(self)):
474 yield self[i]
476 def __str__(self) -> str:
477 p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
478 return f"[{', '.join(p)}]"
481class PageObject(DictionaryObject):
482 """
483 PageObject represents a single page within a PDF file.
485 Typically these objects will be created by accessing the
486 :attr:`pages<pypdf.PdfReader.pages>` property of the
487 :class:`PdfReader<pypdf.PdfReader>` class, but it is
488 also possible to create an empty page with the
489 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
491 Args:
492 pdf: PDF file the page belongs to.
493 indirect_reference: Stores the original indirect reference to
494 this object in its source PDF
496 """
498 original_page: "PageObject" # very local use in writer when appending
500 def __init__(
501 self,
502 pdf: Optional[PdfCommonDocProtocol] = None,
503 indirect_reference: Optional[IndirectObject] = None,
504 ) -> None:
505 DictionaryObject.__init__(self)
506 self.pdf = pdf
507 self.inline_images: Optional[dict[str, ImageFile]] = None
508 self.indirect_reference = indirect_reference
509 if not is_null_or_none(indirect_reference):
510 assert indirect_reference is not None, "mypy"
511 self.update(cast(DictionaryObject, indirect_reference.get_object()))
512 self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
514 def hash_bin(self) -> int:
515 """
516 Used to detect modified object.
518 Note: this function is overloaded to return the same results
519 as a DictionaryObject.
521 Returns:
522 Hash considering type and value.
524 """
525 return hash(
526 (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
527 )
529 def hash_value_data(self) -> bytes:
530 data = super().hash_value_data()
531 data += f"{id(self)}".encode()
532 return data
534 @property
535 def user_unit(self) -> float:
536 """
537 A read-only positive number giving the size of user space units.
539 It is in multiples of 1/72 inch. Hence a value of 1 means a user
540 space unit is 1/72 inch, and a value of 3 means that a user
541 space unit is 3/72 inch.
542 """
543 return cast(float, self.get(PG.USER_UNIT, 1))
545 @staticmethod
546 def create_blank_page(
547 pdf: Optional[PdfCommonDocProtocol] = None,
548 width: Union[float, Decimal, None] = None,
549 height: Union[float, Decimal, None] = None,
550 ) -> "PageObject":
551 """
552 Return a new blank page.
554 If ``width`` or ``height`` is ``None``, try to get the page size
555 from the last page of *pdf*.
557 Args:
558 pdf: PDF file the page is within.
559 width: The width of the new page expressed in default user
560 space units.
561 height: The height of the new page expressed in default user
562 space units.
564 Returns:
565 The new blank page
567 Raises:
568 PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
569 no page
571 """
572 page = PageObject(pdf)
574 # Creates a new page (cf PDF Reference §7.7.3.3)
575 page.__setitem__(NameObject(PG.TYPE), NameObject("/Page"))
576 page.__setitem__(NameObject(PG.PARENT), NullObject())
577 page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject())
578 if width is None or height is None:
579 if pdf is not None and len(pdf.pages) > 0:
580 lastpage = pdf.pages[len(pdf.pages) - 1]
581 width = lastpage.mediabox.width
582 height = lastpage.mediabox.height
583 else:
584 raise PageSizeNotDefinedError
585 page.__setitem__(
586 NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore
587 )
589 return page
591 def _get_ids_image(
592 self,
593 obj: Optional[DictionaryObject] = None,
594 ancest: Optional[list[str]] = None,
595 call_stack: Optional[list[Any]] = None,
596 ) -> list[Union[str, list[str]]]:
597 if call_stack is None:
598 call_stack = []
599 _i = getattr(obj, "indirect_reference", None)
600 if _i in call_stack:
601 return []
602 call_stack.append(_i)
603 if self.inline_images is None:
604 self.inline_images = self._get_inline_images()
605 if obj is None:
606 obj = self
607 if ancest is None:
608 ancest = []
609 lst: list[Union[str, list[str]]] = []
610 if (
611 PG.RESOURCES not in obj or
612 is_null_or_none(resources := obj[PG.RESOURCES]) or
613 RES.XOBJECT not in cast(DictionaryObject, resources)
614 ):
615 return [] if self.inline_images is None else list(self.inline_images.keys())
617 x_object = resources[RES.XOBJECT].get_object() # type: ignore
618 for o in x_object:
619 if not isinstance(x_object[o], StreamObject):
620 continue
621 if x_object[o][IA.SUBTYPE] == "/Image":
622 lst.append(o if len(ancest) == 0 else [*ancest, o])
623 else: # is a form with possible images inside
624 lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
625 assert self.inline_images is not None
626 lst.extend(list(self.inline_images.keys()))
627 return lst
629 def _get_image(
630 self,
631 id: Union[str, list[str], tuple[str]],
632 obj: Optional[DictionaryObject] = None,
633 ) -> ImageFile:
634 if obj is None:
635 obj = cast(DictionaryObject, self)
636 if isinstance(id, tuple):
637 id = list(id)
638 if isinstance(id, list) and len(id) == 1:
639 id = id[0]
640 xobjs: Optional[DictionaryObject] = None
641 try:
642 xobjs = cast(
643 DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
644 )
645 except KeyError as exc:
646 if not (id[0] == "~" and id[-1] == "~"):
647 raise KeyError(
648 f"Cannot access image object {id} without XObject resources"
649 ) from exc
650 if isinstance(id, str):
651 if id[0] == "~" and id[-1] == "~":
652 if self.inline_images is None:
653 self.inline_images = self._get_inline_images()
654 if self.inline_images is None:
655 raise KeyError("No inline image can be found")
656 return self.inline_images[id]
658 assert xobjs is not None
659 from .generic._image_xobject import _xobj_to_image # noqa: PLC0415
660 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
661 extension, byte_stream = imgd[:2]
662 return ImageFile(
663 name=f"{id[1:]}{extension}",
664 data=byte_stream,
665 image=imgd[2],
666 indirect_reference=xobjs[id].indirect_reference,
667 )
668 # in a subobject
669 assert xobjs is not None
670 ids = id[1:]
671 return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))
673 @property
674 def images(self) -> VirtualListImages:
675 """
676 Read-only property emulating a list of images on a page.
678 Get a list of all images on the page. The key can be:
679 - A string (for the top object)
680 - A tuple (for images within XObject forms)
681 - An integer
683 Examples:
684 * `reader.pages[0].images[0]` # return first image
685 * `reader.pages[0].images['/I0']` # return image '/I0'
686 * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
687 * `for img in reader.pages[0].images:` # loops through all objects
689 images.keys() and images.items() can be used.
691 The ImageFile has the following properties:
693 * `.name` : name of the object
694 * `.data` : bytes of the object
695 * `.image` : PIL Image Object
696 * `.indirect_reference` : object reference
698 and the following methods:
699 `.replace(new_image: PIL.Image.Image, **kwargs)` :
700 replace the image in the pdf with the new image
701 applying the saving parameters indicated (such as quality)
703 Example usage:
705 reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)
707 Inline images are extracted and named ~0~, ~1~, ..., with the
708 indirect_reference set to None.
710 """
711 return VirtualListImages(self._get_ids_image, self._get_image)
713 def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
714 """Translate values used in inline image"""
715 try:
716 v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
717 except (TypeError, KeyError):
718 if isinstance(v, NameObject):
719 # It is a custom name, thus we have to look in resources.
720 # The only applicable case is for ColorSpace.
721 try:
722 res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
723 v = cast(DictionaryObject, res)[v]
724 except KeyError: # for res and v
725 raise PdfReadError(f"Cannot find resource entry {v} for {k}")
726 return v
728 def _get_inline_images(self) -> dict[str, ImageFile]:
729 """Load inline images. Entries will be identified as `~1~`."""
730 content = self.get_contents()
731 if is_null_or_none(content):
732 return {}
733 imgs_data = []
734 assert content is not None, "mypy"
735 for param, ope in content.operations:
736 if ope == b"INLINE IMAGE":
737 imgs_data.append(
738 {"settings": param["settings"], "__streamdata__": param["data"]}
739 )
740 elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover
741 raise PdfReadError(
742 f"{ope!r} operator met whereas not expected, "
743 "please share use case with pypdf dev team"
744 )
745 files = {}
746 for num, ii in enumerate(imgs_data):
747 init = {
748 "__streamdata__": ii["__streamdata__"],
749 "/Length": len(ii["__streamdata__"]),
750 }
751 for k, v in ii["settings"].items():
752 if k in {"/Length", "/L"}: # no length is expected
753 continue
754 if isinstance(v, list):
755 v = ArrayObject(
756 [self._translate_value_inline_image(k, x) for x in v]
757 )
758 else:
759 v = self._translate_value_inline_image(k, v)
760 k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
761 if k not in init:
762 init[k] = v
763 ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
764 from .generic._image_xobject import _xobj_to_image # noqa: PLC0415
765 extension, byte_stream, img = _xobj_to_image(ii["object"])
766 files[f"~{num}~"] = ImageFile(
767 name=f"~{num}~{extension}",
768 data=byte_stream,
769 image=img,
770 indirect_reference=None,
771 )
772 return files
774 @property
775 def rotation(self) -> int:
776 """
777 The visual rotation of the page.
779 This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
780 valid values. This property does not affect ``/Contents``.
781 """
782 rotate_obj = self.get(PG.ROTATE, 0)
783 return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object()
785 @rotation.setter
786 def rotation(self, r: float) -> None:
787 self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360)
789 def transfer_rotation_to_content(self) -> None:
790 """
791 Apply the rotation of the page to the content and the media/crop/...
792 boxes.
794 It is recommended to apply this function before page merging.
795 """
796 r = -self.rotation # rotation to apply is in the otherway
797 self.rotation = 0
798 mb = RectangleObject(self.mediabox)
799 trsf = (
800 Transformation()
801 .translate(
802 -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2)
803 )
804 .rotate(r)
805 )
806 pt1 = trsf.apply_on(mb.lower_left)
807 pt2 = trsf.apply_on(mb.upper_right)
808 trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))
809 self.add_transformation(trsf, False)
810 for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
811 if b in self:
812 rr = RectangleObject(self[b]) # type: ignore
813 pt1 = trsf.apply_on(rr.lower_left)
814 pt2 = trsf.apply_on(rr.upper_right)
815 self[NameObject(b)] = RectangleObject(
816 (
817 min(pt1[0], pt2[0]),
818 min(pt1[1], pt2[1]),
819 max(pt1[0], pt2[0]),
820 max(pt1[1], pt2[1]),
821 )
822 )
824 def rotate(self, angle: int) -> "PageObject":
825 """
826 Rotate a page clockwise by increments of 90 degrees.
828 Args:
829 angle: Angle to rotate the page. Must be an increment of 90 deg.
831 Returns:
832 The rotated PageObject
834 """
835 if angle % 90 != 0:
836 raise ValueError("Rotation angle must be a multiple of 90")
837 self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle)
838 return self
840 def _merge_resources(
841 self,
842 res1: DictionaryObject,
843 res2: DictionaryObject,
844 resource: Any,
845 new_res1: bool = True,
846 ) -> tuple[dict[str, Any], dict[str, Any]]:
847 try:
848 assert isinstance(self.indirect_reference, IndirectObject)
849 pdf = self.indirect_reference.pdf
850 is_pdf_writer = hasattr(
851 pdf, "_add_object"
852 ) # expect isinstance(pdf, PdfWriter)
853 except (AssertionError, AttributeError):
854 pdf = None
855 is_pdf_writer = False
857 def compute_unique_key(base_key: str) -> tuple[str, bool]:
858 """
859 Find a key that either doesn't already exist or has the same value
860 (indicated by the bool)
862 Args:
863 base_key: An index is added to this to get the computed key
865 Returns:
866 A tuple (computed key, bool) where the boolean indicates
867 if there is a resource of the given computed_key with the same
868 value.
870 """
871 value = page2res.raw_get(base_key)
872 # TODO: a possible improvement for writer, the indirect_reference
873 # cannot be found because translated
875 # try the current key first (e.g. "foo"), but otherwise iterate
876 # through "foo-0", "foo-1", etc. new_res can contain only finitely
877 # many keys, thus this'll eventually end, even if it's been crafted
878 # to be maximally annoying.
879 computed_key = base_key
880 idx = 0
881 while computed_key in new_res:
882 if new_res.raw_get(computed_key) == value:
883 # there's already a resource of this name, with the exact
884 # same value
885 return computed_key, True
886 computed_key = f"{base_key}-{idx}"
887 idx += 1
888 return computed_key, False
890 if new_res1:
891 new_res = DictionaryObject()
892 new_res.update(res1.get(resource, DictionaryObject()).get_object())
893 else:
894 new_res = cast(DictionaryObject, res1[resource])
895 page2res = cast(
896 DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
897 )
898 rename_res = {}
899 for key in page2res:
900 unique_key, same_value = compute_unique_key(key)
901 newname = NameObject(unique_key)
902 if key != unique_key:
903 # we have to use a different name for this
904 rename_res[key] = newname
906 if not same_value:
907 if is_pdf_writer:
908 new_res[newname] = page2res.raw_get(key).clone(pdf)
909 try:
910 new_res[newname] = new_res[newname].indirect_reference
911 except AttributeError:
912 pass
913 else:
914 new_res[newname] = page2res.raw_get(key)
915 lst = sorted(new_res.items())
916 new_res.clear()
917 for el in lst:
918 new_res[el[0]] = el[1]
919 return new_res, rename_res
921 @staticmethod
922 def _content_stream_rename(
923 stream: ContentStream,
924 rename: dict[Any, Any],
925 pdf: Optional[PdfCommonDocProtocol],
926 ) -> ContentStream:
927 if not rename:
928 return stream
929 stream = ContentStream(stream, pdf)
930 for operands, _operator in stream.operations:
931 if isinstance(operands, list):
932 for i, op in enumerate(operands):
933 if isinstance(op, NameObject):
934 operands[i] = rename.get(op, op)
935 elif isinstance(operands, dict):
936 for i, op in operands.items():
937 if isinstance(op, NameObject):
938 operands[i] = rename.get(op, op)
939 else:
940 raise KeyError(f"Type of operands is {type(operands)}")
941 return stream
943 @staticmethod
944 def _add_transformation_matrix(
945 contents: Any,
946 pdf: Optional[PdfCommonDocProtocol],
947 ctm: CompressedTransformationMatrix,
948 ) -> ContentStream:
949 """Add transformation matrix at the beginning of the given contents stream."""
950 content_stream = ContentStream(contents, pdf)
951 content_stream.operations.insert(
952 0,
953 (
954 [FloatObject(x) for x in ctm],
955 b"cm",
956 ),
957 )
958 return content_stream
960 def _get_contents_as_bytes(self) -> Optional[bytes]:
961 """
962 Return the page contents as bytes.
964 Returns:
965 The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.
967 """
968 if PG.CONTENTS in self:
969 obj = self[PG.CONTENTS].get_object()
970 if isinstance(obj, list):
971 return b"".join(x.get_object().get_data() for x in obj)
972 return cast(EncodedStreamObject, obj).get_data()
973 return None
975 def get_contents(self) -> Optional[ContentStream]:
976 """
977 Access the page contents.
979 Returns:
980 The ``/Contents`` object, or ``None`` if it does not exist.
981 ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.
983 """
984 if PG.CONTENTS in self:
985 try:
986 pdf = cast(IndirectObject, self.indirect_reference).pdf
987 except AttributeError:
988 pdf = None
989 obj = self[PG.CONTENTS]
990 if is_null_or_none(obj):
991 return None
992 resolved_object = obj.get_object()
993 return ContentStream(resolved_object, pdf)
994 return None
996 def replace_contents(
997 self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
998 ) -> None:
999 """
1000 Replace the page contents with the new content and nullify old objects
1001 Args:
1002 content: new content; if None delete the content field.
1003 """
1004 if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
1005 # the page is not attached : the content is directly attached.
1006 self[NameObject(PG.CONTENTS)] = content
1007 return
1009 from pypdf._writer import PdfWriter # noqa: PLC0415
1010 if not isinstance(self.indirect_reference.pdf, PdfWriter):
1011 deprecate(
1012 "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
1013 "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
1014 "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
1015 )
1017 writer = self.indirect_reference.pdf
1018 if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
1019 content_array = cast(ArrayObject, self[PG.CONTENTS])
1020 for reference in content_array:
1021 try:
1022 writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
1023 except ValueError:
1024 # Occurs when called on PdfReader.
1025 pass
1027 if isinstance(content, ArrayObject):
1028 content = ArrayObject(writer._add_object(obj) for obj in content)
1030 if is_null_or_none(content):
1031 if PG.CONTENTS not in self:
1032 return
1033 assert self[PG.CONTENTS].indirect_reference is not None
1034 writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
1035 del self[PG.CONTENTS]
1036 elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
1037 try:
1038 self[NameObject(PG.CONTENTS)] = writer._add_object(content)
1039 except AttributeError:
1040 # applies at least for page not in writer
1041 # as a backup solution, we put content as an object although not in accordance with pdf ref
1042 # this will be fixed with the _add_object
1043 self[NameObject(PG.CONTENTS)] = content
1044 else:
1045 assert content is not None, "mypy"
1046 content.indirect_reference = self[
1047 PG.CONTENTS
1048 ].indirect_reference # TODO: in the future may require generation management
1049 try:
1050 writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
1051 except AttributeError:
1052 # applies at least for page not in writer
1053 # as a backup solution, we put content as an object although not in accordance with pdf ref
1054 # this will be fixed with the _add_object
1055 self[NameObject(PG.CONTENTS)] = content
1056 # forces recalculation of inline_images
1057 self.inline_images = None
1059 def merge_page(
1060 self, page2: "PageObject", expand: bool = False, over: bool = True
1061 ) -> None:
1062 """
1063 Merge the content streams of two pages into one.
1065 Resource references (e.g. fonts) are maintained from both pages.
1066 The mediabox, cropbox, etc of this page are not altered.
1067 The parameter page's content stream will
1068 be added to the end of this page's content stream,
1069 meaning that it will be drawn after, or "on top" of this page.
1071 Args:
1072 page2: The page to be merged into this one. Should be
1073 an instance of :class:`PageObject<PageObject>`.
1074 over: set the page2 content over page1 if True (default) else under
1075 expand: If True, the current page dimensions will be
1076 expanded to accommodate the dimensions of the page to be merged.
1078 """
1079 self._merge_page(page2, over=over, expand=expand)
1081 def _merge_page(
1082 self,
1083 page2: "PageObject",
1084 page2_transformation: Optional[Callable[[Any], ContentStream]] = None,
1085 ctm: Optional[CompressedTransformationMatrix] = None,
1086 over: bool = True,
1087 expand: bool = False,
1088 ) -> None:
1089 # First we work on merging the resource dictionaries. This allows us
1090 # to find out what symbols in the content streams we might need to
1091 # rename.
1092 try:
1093 assert isinstance(self.indirect_reference, IndirectObject)
1094 if hasattr(self.indirect_reference.pdf, "_add_object"): # to detect PdfWriter
1095 return self._merge_page_writer(
1096 page2, page2_transformation, ctm, over, expand
1097 )
1098 except (AssertionError, AttributeError):
1099 pass
1101 new_resources = DictionaryObject()
1102 rename: dict[str, Any] = {}
1103 original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
1104 page2_resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
1105 new_annots = ArrayObject()
1107 for page in (self, page2):
1108 if PG.ANNOTS in page:
1109 annots = page[PG.ANNOTS]
1110 if isinstance(annots, ArrayObject):
1111 new_annots.extend(annots)
1112 self[NameObject(PG.ANNOTS)] = new_annots
1114 for res in (
1115 RES.EXT_G_STATE,
1116 RES.COLOR_SPACE,
1117 RES.PATTERN,
1118 RES.SHADING,
1119 RES.XOBJECT,
1120 RES.FONT,
1121 RES.PROPERTIES,
1122 ):
1123 new, new_resource_name = self._merge_resources(
1124 original_resources, page2_resources, res
1125 )
1126 if new:
1127 new_resources[NameObject(res)] = new
1128 rename.update(new_resource_name)
1130 # Combine /ProcSet sets, making sure there is a consistent order
1131 new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
1132 sorted(
1133 set(
1134 original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
1135 ).union(
1136 set(page2_resources.get(RES.PROC_SET, ArrayObject()).get_object())
1137 )
1138 )
1139 )
1141 new_content_array = ArrayObject()
1142 original_content = self.get_contents()
1143 if original_content is not None:
1144 original_content.isolate_graphics_state()
1145 new_content_array.append(original_content)
1147 page2_content = page2.get_contents()
1148 if page2_content is not None:
1149 rect = getattr(page2, MERGE_CROP_BOX)
1150 page2_content.operations.insert(
1151 0,
1152 (
1153 map(
1154 FloatObject,
1155 [
1156 rect.left,
1157 rect.bottom,
1158 rect.width,
1159 rect.height,
1160 ],
1161 ),
1162 b"re",
1163 ),
1164 )
1165 page2_content.operations.insert(1, ([], b"W"))
1166 page2_content.operations.insert(2, ([], b"n"))
1167 if page2_transformation is not None:
1168 page2_content = page2_transformation(page2_content)
1169 page2_content = PageObject._content_stream_rename(
1170 page2_content, rename, self.pdf
1171 )
1172 page2_content.isolate_graphics_state()
1173 if over:
1174 new_content_array.append(page2_content)
1175 else:
1176 new_content_array.insert(0, page2_content)
1178 # if expanding the page to fit a new page, calculate the new media box size
1179 if expand:
1180 self._expand_mediabox(page2, ctm)
1182 self.replace_contents(ContentStream(new_content_array, self.pdf))
1183 self[NameObject(PG.RESOURCES)] = new_resources
1185 return None
1187 def _merge_page_writer(
1188 self,
1189 page2: "PageObject",
1190 page2transformation: Optional[Callable[[Any], ContentStream]] = None,
1191 ctm: Optional[CompressedTransformationMatrix] = None,
1192 over: bool = True,
1193 expand: bool = False,
1194 ) -> None:
1195 # First we work on merging the resource dictionaries. This allows us
1196 # to find which symbols in the content streams we might need to
1197 # rename.
1198 assert isinstance(self.indirect_reference, IndirectObject)
1199 pdf = self.indirect_reference.pdf
1201 if PG.RESOURCES not in self:
1202 self[NameObject(PG.RESOURCES)] = DictionaryObject()
1203 original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
1204 if PG.RESOURCES not in page2:
1205 page2resources = DictionaryObject()
1206 else:
1207 page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
1209 rename = {}
1210 for res in (
1211 RES.EXT_G_STATE,
1212 RES.COLOR_SPACE,
1213 RES.PATTERN,
1214 RES.SHADING,
1215 RES.XOBJECT,
1216 RES.FONT,
1217 RES.PROPERTIES,
1218 ):
1219 if res in page2resources:
1220 if res not in original_resources:
1221 original_resources[NameObject(res)] = DictionaryObject()
1222 _, newrename = self._merge_resources(
1223 original_resources, page2resources, res, False
1224 )
1225 rename.update(newrename)
1226 # Combine /ProcSet sets
1227 if RES.PROC_SET in page2resources:
1228 if RES.PROC_SET not in original_resources:
1229 original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
1230 arr = cast(ArrayObject, original_resources[RES.PROC_SET])
1231 for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
1232 if x not in arr:
1233 arr.append(x)
1234 arr.sort()
1236 if not is_null_or_none(page2.get(PG.ANNOTS, None)):
1237 if PG.ANNOTS not in self:
1238 self[NameObject(PG.ANNOTS)] = ArrayObject()
1239 annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
1240 if ctm is None:
1241 trsf = Transformation()
1242 else:
1243 trsf = Transformation(ctm)
1244 # Ensure we are working on a copy of the list. Otherwise, if both pages
1245 # are the same object, we might run into an infinite loop.
1246 for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
1247 a = a.get_object()
1248 aa = a.clone(
1249 pdf,
1250 ignore_fields=("/P", "/StructParent", "/Parent"),
1251 force_duplicate=True,
1252 )
1253 r = cast(ArrayObject, a["/Rect"])
1254 pt1 = trsf.apply_on((r[0], r[1]), True)
1255 pt2 = trsf.apply_on((r[2], r[3]), True)
1256 aa[NameObject("/Rect")] = ArrayObject(
1257 (
1258 min(pt1[0], pt2[0]),
1259 min(pt1[1], pt2[1]),
1260 max(pt1[0], pt2[0]),
1261 max(pt1[1], pt2[1]),
1262 )
1263 )
1264 if "/QuadPoints" in a:
1265 q = cast(ArrayObject, a["/QuadPoints"])
1266 aa[NameObject("/QuadPoints")] = ArrayObject(
1267 trsf.apply_on((q[0], q[1]), True)
1268 + trsf.apply_on((q[2], q[3]), True)
1269 + trsf.apply_on((q[4], q[5]), True)
1270 + trsf.apply_on((q[6], q[7]), True)
1271 )
1272 try:
1273 aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
1274 except KeyError:
1275 pass
1276 try:
1277 aa[NameObject("/P")] = self.indirect_reference
1278 annots.append(aa.indirect_reference)
1279 except AttributeError:
1280 pass
1282 new_content_array = ArrayObject()
1283 original_content = self.get_contents()
1284 if original_content is not None:
1285 original_content.isolate_graphics_state()
1286 new_content_array.append(original_content)
1288 page2content = page2.get_contents()
1289 if page2content is not None:
1290 rect = getattr(page2, MERGE_CROP_BOX)
1291 page2content.operations.insert(
1292 0,
1293 (
1294 map(
1295 FloatObject,
1296 [
1297 rect.left,
1298 rect.bottom,
1299 rect.width,
1300 rect.height,
1301 ],
1302 ),
1303 b"re",
1304 ),
1305 )
1306 page2content.operations.insert(1, ([], b"W"))
1307 page2content.operations.insert(2, ([], b"n"))
1308 if page2transformation is not None:
1309 page2content = page2transformation(page2content)
1310 page2content = PageObject._content_stream_rename(
1311 page2content, rename, self.pdf
1312 )
1313 page2content.isolate_graphics_state()
1314 if over:
1315 new_content_array.append(page2content)
1316 else:
1317 new_content_array.insert(0, page2content)
1319 # if expanding the page to fit a new page, calculate the new media box size
1320 if expand:
1321 self._expand_mediabox(page2, ctm)
1323 self.replace_contents(new_content_array)
1325 def _expand_mediabox(
1326 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
1327 ) -> None:
1328 corners1 = (
1329 self.mediabox.left.as_numeric(),
1330 self.mediabox.bottom.as_numeric(),
1331 self.mediabox.right.as_numeric(),
1332 self.mediabox.top.as_numeric(),
1333 )
1334 corners2 = (
1335 page2.mediabox.left.as_numeric(),
1336 page2.mediabox.bottom.as_numeric(),
1337 page2.mediabox.left.as_numeric(),
1338 page2.mediabox.top.as_numeric(),
1339 page2.mediabox.right.as_numeric(),
1340 page2.mediabox.top.as_numeric(),
1341 page2.mediabox.right.as_numeric(),
1342 page2.mediabox.bottom.as_numeric(),
1343 )
1344 if ctm is not None:
1345 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1346 new_x = tuple(
1347 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
1348 for i in range(0, 8, 2)
1349 )
1350 new_y = tuple(
1351 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
1352 for i in range(0, 8, 2)
1353 )
1354 else:
1355 new_x = corners2[0:8:2]
1356 new_y = corners2[1:8:2]
1357 lowerleft = (min(new_x), min(new_y))
1358 upperright = (max(new_x), max(new_y))
1359 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))
1360 upperright = (
1361 max(corners1[2], upperright[0]),
1362 max(corners1[3], upperright[1]),
1363 )
1365 self.mediabox.lower_left = lowerleft
1366 self.mediabox.upper_right = upperright
1368 def merge_transformed_page(
1369 self,
1370 page2: "PageObject",
1371 ctm: Union[CompressedTransformationMatrix, Transformation],
1372 over: bool = True,
1373 expand: bool = False,
1374 ) -> None:
1375 """
1376 Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
1377 matrix is applied to the merged stream.
1379 Args:
1380 page2: The page to be merged into this one.
1381 ctm: a 6-element tuple containing the operands of the
1382 transformation matrix
1383 over: set the page2 content over page1 if True (default) else under
1384 expand: Whether the page should be expanded to fit the dimensions
1385 of the page to be merged.
1387 """
1388 if isinstance(ctm, Transformation):
1389 ctm = ctm.ctm
1390 self._merge_page(
1391 page2,
1392 lambda page2_content: PageObject._add_transformation_matrix(
1393 page2_content, page2.pdf, ctm
1394 ),
1395 ctm,
1396 over,
1397 expand,
1398 )
1400 def merge_scaled_page(
1401 self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
1402 ) -> None:
1403 """
1404 Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
1405 is scaled by applying a transformation matrix.
1407 Args:
1408 page2: The page to be merged into this one.
1409 scale: The scaling factor
1410 over: set the page2 content over page1 if True (default) else under
1411 expand: Whether the page should be expanded to fit the
1412 dimensions of the page to be merged.
1414 """
1415 op = Transformation().scale(scale, scale)
1416 self.merge_transformed_page(page2, op, over, expand)
1418 def merge_rotated_page(
1419 self,
1420 page2: "PageObject",
1421 rotation: float,
1422 over: bool = True,
1423 expand: bool = False,
1424 ) -> None:
1425 """
1426 Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
1427 is rotated by applying a transformation matrix.
1429 Args:
1430 page2: The page to be merged into this one.
1431 rotation: The angle of the rotation, in degrees
1432 over: set the page2 content over page1 if True (default) else under
1433 expand: Whether the page should be expanded to fit the
1434 dimensions of the page to be merged.
1436 """
1437 op = Transformation().rotate(rotation)
1438 self.merge_transformed_page(page2, op, over, expand)
1440 def merge_translated_page(
1441 self,
1442 page2: "PageObject",
1443 tx: float,
1444 ty: float,
1445 over: bool = True,
1446 expand: bool = False,
1447 ) -> None:
1448 """
1449 Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
1450 merged is translated by applying a transformation matrix.
1452 Args:
1453 page2: the page to be merged into this one.
1454 tx: The translation on X axis
1455 ty: The translation on Y axis
1456 over: set the page2 content over page1 if True (default) else under
1457 expand: Whether the page should be expanded to fit the
1458 dimensions of the page to be merged.
1460 """
1461 op = Transformation().translate(tx, ty)
1462 self.merge_transformed_page(page2, op, over, expand)
1464 def add_transformation(
1465 self,
1466 ctm: Union[Transformation, CompressedTransformationMatrix],
1467 expand: bool = False,
1468 ) -> None:
1469 """
1470 Apply a transformation matrix to the page.
1472 Args:
1473 ctm: A 6-element tuple containing the operands of the
1474 transformation matrix. Alternatively, a
1475 :py:class:`Transformation<pypdf.Transformation>`
1476 object can be passed.
1478 See :doc:`/user/cropping-and-transforming`.
1480 """
1481 if isinstance(ctm, Transformation):
1482 ctm = ctm.ctm
1483 content = self.get_contents()
1484 if content is not None:
1485 content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
1486 content.isolate_graphics_state()
1487 self.replace_contents(content)
1488 # if expanding the page to fit a new page, calculate the new media box size
1489 if expand:
1490 corners = [
1491 self.mediabox.left.as_numeric(),
1492 self.mediabox.bottom.as_numeric(),
1493 self.mediabox.left.as_numeric(),
1494 self.mediabox.top.as_numeric(),
1495 self.mediabox.right.as_numeric(),
1496 self.mediabox.top.as_numeric(),
1497 self.mediabox.right.as_numeric(),
1498 self.mediabox.bottom.as_numeric(),
1499 ]
1501 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1502 new_x = [
1503 ctm[0] * corners[i] + ctm[2] * corners[i + 1] + ctm[4]
1504 for i in range(0, 8, 2)
1505 ]
1506 new_y = [
1507 ctm[1] * corners[i] + ctm[3] * corners[i + 1] + ctm[5]
1508 for i in range(0, 8, 2)
1509 ]
1511 self.mediabox.lower_left = (min(new_x), min(new_y))
1512 self.mediabox.upper_right = (max(new_x), max(new_y))
1514 def scale(self, sx: float, sy: float) -> None:
1515 """
1516 Scale a page by the given factors by applying a transformation matrix
1517 to its content and updating the page size.
1519 This updates the various page boundaries (bleedbox, trimbox, etc.)
1520 and the contents of the page.
1522 Args:
1523 sx: The scaling factor on horizontal axis.
1524 sy: The scaling factor on vertical axis.
1526 """
1527 self.add_transformation((sx, 0, 0, sy, 0, 0))
1528 self.bleedbox = self.bleedbox.scale(sx, sy)
1529 self.trimbox = self.trimbox.scale(sx, sy)
1530 self.artbox = self.artbox.scale(sx, sy)
1531 self.cropbox = self.cropbox.scale(sx, sy)
1532 self.mediabox = self.mediabox.scale(sx, sy)
1534 if PG.ANNOTS in self:
1535 annotations = self[PG.ANNOTS]
1536 if isinstance(annotations, ArrayObject):
1537 for annotation in annotations:
1538 annotation_obj = annotation.get_object()
1539 if ADA.Rect in annotation_obj:
1540 rectangle = annotation_obj[ADA.Rect]
1541 if isinstance(rectangle, ArrayObject):
1542 rectangle[0] = FloatObject(float(rectangle[0]) * sx)
1543 rectangle[1] = FloatObject(float(rectangle[1]) * sy)
1544 rectangle[2] = FloatObject(float(rectangle[2]) * sx)
1545 rectangle[3] = FloatObject(float(rectangle[3]) * sy)
1547 if PG.VP in self:
1548 viewport = self[PG.VP]
1549 if isinstance(viewport, ArrayObject):
1550 bbox = viewport[0]["/BBox"]
1551 else:
1552 bbox = viewport["/BBox"] # type: ignore
1553 scaled_bbox = RectangleObject(
1554 (
1555 float(bbox[0]) * sx,
1556 float(bbox[1]) * sy,
1557 float(bbox[2]) * sx,
1558 float(bbox[3]) * sy,
1559 )
1560 )
1561 if isinstance(viewport, ArrayObject):
1562 self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore
1563 NameObject("/BBox")
1564 ] = scaled_bbox
1565 else:
1566 self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore
1568 def scale_by(self, factor: float) -> None:
1569 """
1570 Scale a page by the given factor by applying a transformation matrix to
1571 its content and updating the page size.
1573 Args:
1574 factor: The scaling factor (for both X and Y axis).
1576 """
1577 self.scale(factor, factor)
1579 def scale_to(self, width: float, height: float) -> None:
1580 """
1581 Scale a page to the specified dimensions by applying a transformation
1582 matrix to its content and updating the page size.
1584 Args:
1585 width: The new width.
1586 height: The new height.
1588 """
1589 sx = width / float(self.mediabox.width)
1590 sy = height / float(self.mediabox.height)
1591 self.scale(sx, sy)
1593 def compress_content_streams(self, level: int = -1) -> None:
1594 """
1595 Compress the size of this page by joining all content streams and
1596 applying a FlateDecode filter.
1598 However, it is possible that this function will perform no action if
1599 content stream compression becomes "automatic".
1600 """
1601 content = self.get_contents()
1602 if content is not None:
1603 content_obj = content.flate_encode(level)
1604 try:
1605 content.indirect_reference.pdf._objects[ # type: ignore
1606 content.indirect_reference.idnum - 1 # type: ignore
1607 ] = content_obj
1608 except AttributeError:
1609 if self.indirect_reference is not None and hasattr(
1610 self.indirect_reference.pdf, "_add_object"
1611 ):
1612 self.replace_contents(content_obj)
1613 else:
1614 raise ValueError("Page must be part of a PdfWriter")
1616 @property
1617 def page_number(self) -> Optional[int]:
1618 """
1619 Read-only property which returns the page number within the PDF file.
1621 Returns:
1622 Page number; None if the page is not attached to a PDF.
1624 """
1625 if self.indirect_reference is None:
1626 return None
1627 try:
1628 lst = self.indirect_reference.pdf.pages
1629 return int(lst.index(self))
1630 except ValueError:
1631 return None
1633 def _debug_for_extract(self) -> str: # pragma: no cover
1634 out = ""
1635 for ope, op in ContentStream(
1636 self["/Contents"].get_object(), self.pdf, "bytes"
1637 ).operations:
1638 if op == b"TJ":
1639 s = [x for x in ope[0] if isinstance(x, str)]
1640 else:
1641 s = []
1642 out += op.decode("utf-8") + " " + "".join(s) + ope.__repr__() + "\n"
1643 out += "\n=============================\n"
1644 try:
1645 for fo in self[PG.RESOURCES]["/Font"]: # type:ignore
1646 out += fo + "\n"
1647 out += self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n" # type:ignore
1648 try:
1649 enc_repr = self[PG.RESOURCES]["/Font"][fo][ # type:ignore
1650 "/Encoding"
1651 ].__repr__()
1652 out += enc_repr + "\n"
1653 except Exception:
1654 pass
1655 try:
1656 out += (
1657 self[PG.RESOURCES]["/Font"][fo][ # type:ignore
1658 "/ToUnicode"
1659 ]
1660 .get_data()
1661 .decode()
1662 + "\n"
1663 )
1664 except Exception:
1665 pass
1667 except KeyError:
1668 out += "No Font\n"
1669 return out
1671 def _extract_text(
1672 self,
1673 obj: Any,
1674 pdf: Any,
1675 orientations: tuple[int, ...] = (0, 90, 180, 270),
1676 space_width: float = 200.0,
1677 content_key: Optional[str] = PG.CONTENTS,
1678 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1679 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1680 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1681 ) -> str:
1682 """
1683 See extract_text for most arguments.
1685 Args:
1686 content_key: indicate the default key where to extract data
1687 None = the object; this allows reusing the function on an XObject
1688 default = "/Content"
1690 """
1691 extractor = TextExtraction()
1692 font_resources: dict[str, DictionaryObject] = {}
1693 fonts: dict[str, Font] = {}
1695 try:
1696 objr = obj
1697 while NameObject(PG.RESOURCES) not in objr:
1698 # /Resources can be inherited so we look to parents
1699 objr = objr["/Parent"].get_object()
1700 # If no parents then no /Resources will be available,
1701 # so an exception will be raised
1702 resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
1703 except Exception:
1704 # No resources means no text is possible (no font); we consider the
1705 # file as not damaged, no need to check for TJ or Tj
1706 return ""
1708 if (
1709 not is_null_or_none(resources_dict)
1710 and "/Font" in resources_dict
1711 and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
1712 ):
1713 for font_resource in font_resources_dict:
1714 try:
1715 font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
1716 font_resources[font_resource] = font_resource_object
1717 fonts[font_resource] = Font.from_font_resource(font_resource_object)
1718 # Override space width, if applicable
1719 if fonts[font_resource].character_widths.get(" ", 0) == 0:
1720 fonts[font_resource].space_width = space_width
1721 except (AttributeError, TypeError):
1722 pass
1724 try:
1725 content = (
1726 obj[content_key].get_object() if isinstance(content_key, str) else obj
1727 )
1728 if not isinstance(content, ContentStream):
1729 content = ContentStream(content, pdf, "bytes")
1730 except (AttributeError, KeyError): # no content can be extracted (certainly empty page)
1731 return ""
1732 # We check all strings are TextStringObjects. ByteStringObjects
1733 # are strings where the byte->string encoding was unknown, so adding
1734 # them to the text here would be gibberish.
1736 # Initialize the extractor with the necessary parameters
1737 extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)
1739 for operands, operator in content.operations:
1740 if visitor_operand_before is not None:
1741 visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
1742 # Multiple operators are handled here
1743 if operator == b"'":
1744 extractor.process_operation(b"T*", [])
1745 extractor.process_operation(b"Tj", operands)
1746 elif operator == b'"':
1747 extractor.process_operation(b"Tw", [operands[0]])
1748 extractor.process_operation(b"Tc", [operands[1]])
1749 extractor.process_operation(b"T*", [])
1750 extractor.process_operation(b"Tj", operands[2:])
1751 elif operator == b"TJ":
1752 # The space width may be smaller than the font width, so the width should be 95%.
1753 _confirm_space_width = extractor._space_width * 0.95
1754 if operands:
1755 for op in operands[0]:
1756 if isinstance(op, (str, bytes)):
1757 extractor.process_operation(b"Tj", [op])
1758 if isinstance(op, (int, float, NumberObject, FloatObject)) and (
1759 abs(float(op)) >= _confirm_space_width
1760 and extractor.text
1761 and extractor.text[-1] != " "
1762 ):
1763 extractor.process_operation(b"Tj", [" "])
1764 elif operator == b"TD":
1765 extractor.process_operation(b"TL", [-operands[1]])
1766 extractor.process_operation(b"Td", operands)
1767 elif operator == b"Do":
1768 extractor.output += extractor.text
1769 if visitor_text is not None:
1770 visitor_text(
1771 extractor.text,
1772 extractor.memo_cm,
1773 extractor.memo_tm,
1774 extractor.font_resource,
1775 extractor.font_size,
1776 )
1777 try:
1778 if extractor.output[-1] != "\n":
1779 extractor.output += "\n"
1780 if visitor_text is not None:
1781 visitor_text(
1782 "\n",
1783 extractor.memo_cm,
1784 extractor.memo_tm,
1785 extractor.font_resource,
1786 extractor.font_size,
1787 )
1788 except IndexError:
1789 pass
1790 try:
1791 xobj = resources_dict["/XObject"]
1792 if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
1793 text = self.extract_xform_text(
1794 xobj[operands[0]], # type: ignore
1795 orientations,
1796 space_width,
1797 visitor_operand_before,
1798 visitor_operand_after,
1799 visitor_text,
1800 )
1801 extractor.output += text
1802 if visitor_text is not None:
1803 visitor_text(
1804 text,
1805 extractor.memo_cm,
1806 extractor.memo_tm,
1807 extractor.font_resource,
1808 extractor.font_size,
1809 )
1810 except Exception as exception:
1811 logger_warning(
1812 f"Impossible to decode XFormObject {operands[0]}: {exception}",
1813 __name__,
1814 )
1815 finally:
1816 extractor.text = ""
1817 extractor.memo_cm = extractor.cm_matrix.copy()
1818 extractor.memo_tm = extractor.tm_matrix.copy()
1819 else:
1820 extractor.process_operation(operator, operands)
1821 if visitor_operand_after is not None:
1822 visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
1823 extractor.output += extractor.text # just in case
1824 if extractor.text != "" and visitor_text is not None:
1825 visitor_text(
1826 extractor.text,
1827 extractor.memo_cm,
1828 extractor.memo_tm,
1829 extractor.font_resource,
1830 extractor.font_size,
1831 )
1832 return extractor.output
1834 def _layout_mode_fonts(self) -> dict[str, Font]:
1835 """
1836 Get fonts formatted for "layout" mode text extraction.
1838 Returns:
1839 Dict[str, Font]: dictionary of Font instances keyed by font name
1841 """
1842 # Font retrieval logic adapted from pypdf.PageObject._extract_text()
1843 objr: Any = self
1844 fonts: dict[str, Font] = {}
1845 while objr is not None:
1846 try:
1847 resources_dict: Any = objr[PG.RESOURCES]
1848 except KeyError:
1849 resources_dict = {}
1850 if "/Font" in resources_dict and self.pdf is not None:
1851 for font_name in resources_dict["/Font"]:
1852 fonts[font_name] = Font.from_font_resource(resources_dict["/Font"][font_name])
1853 try:
1854 objr = objr["/Parent"].get_object()
1855 except KeyError:
1856 objr = None
1858 return fonts
1860 def _layout_mode_text(
1861 self,
1862 space_vertically: bool = True,
1863 scale_weight: float = 1.25,
1864 strip_rotated: bool = True,
1865 debug_path: Optional[Path] = None,
1866 font_height_weight: float = 1,
1867 ) -> str:
1868 """
1869 Get text preserving fidelity to source PDF text layout.
1871 Args:
1872 space_vertically: include blank lines inferred from y distance + font
1873 height. Defaults to True.
1874 scale_weight: multiplier for string length when calculating weighted
1875 average character width. Defaults to 1.25.
1876 strip_rotated: Removes text that is rotated w.r.t. to the page from
1877 layout mode output. Defaults to True.
1878 debug_path (Path | None): if supplied, must target a directory.
1879 creates the following files with debug information for layout mode
1880 functions if supplied:
1881 - fonts.json: output of self._layout_mode_fonts
1882 - tjs.json: individual text render ops with corresponding transform matrices
1883 - bts.json: text render ops left justified and grouped by BT/ET operators
1884 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1885 Defaults to None.
1886 font_height_weight: multiplier for font height when calculating
1887 blank lines. Defaults to 1.
1889 Returns:
1890 str: multiline string containing page text in a fixed width format that
1891 closely adheres to the rendered layout in the source pdf.
1893 """
1894 fonts = self._layout_mode_fonts()
1895 if debug_path: # pragma: no cover
1896 import json # noqa: PLC0415
1898 debug_path.joinpath("fonts.json").write_text(
1899 json.dumps(fonts, indent=2, default=asdict),
1900 "utf-8"
1901 )
1903 ops = iter(
1904 ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
1905 )
1906 bt_groups = _layout_mode.text_show_operations(
1907 ops, fonts, strip_rotated, debug_path
1908 )
1910 if not bt_groups:
1911 return ""
1913 ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
1915 char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
1917 return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
1919 def extract_text(
1920 self,
1921 *args: Any,
1922 orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
1923 space_width: float = 200.0,
1924 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1925 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1926 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1927 extraction_mode: Literal["plain", "layout"] = "plain",
1928 **kwargs: Any,
1929 ) -> str:
1930 """
1931 Locate all text drawing commands, in the order they are provided in the
1932 content stream, and extract the text.
1934 This works well for some PDF files, but poorly for others, depending on
1935 the generator used. This will be refined in the future.
1937 Do not rely on the order of text coming out of this function, as it
1938 will change if this function is made more sophisticated.
1940 Arabic and Hebrew are extracted in the correct order.
1941 If required a custom RTL range of characters can be defined;
1942 see function set_custom_rtl.
1944 Additionally you can provide visitor methods to get informed on all
1945 operations and all text objects.
1946 For example in some PDF files this can be useful to parse tables.
1948 Args:
1949 orientations: list of orientations extract_text will look for
1950 default = (0, 90, 180, 270)
1951 note: currently only 0 (up),90 (turned left), 180 (upside down),
1952 270 (turned right)
1953 Silently ignored in "layout" mode.
1954 space_width: force default space width
1955 if not extracted from font (default: 200)
1956 Silently ignored in "layout" mode.
1957 visitor_operand_before: function to be called before processing an operation.
1958 It has four arguments: operator, operand-arguments,
1959 current transformation matrix and text matrix.
1960 Ignored with a warning in "layout" mode.
1961 visitor_operand_after: function to be called after processing an operation.
1962 It has four arguments: operator, operand-arguments,
1963 current transformation matrix and text matrix.
1964 Ignored with a warning in "layout" mode.
1965 visitor_text: function to be called when extracting some text at some position.
1966 It has five arguments: text, current transformation matrix,
1967 text matrix, font-dictionary and font-size.
1968 The font-dictionary may be None in case of unknown fonts.
1969 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
1970 Ignored with a warning in "layout" mode.
1971 extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
1972 "layout" for experimental layout mode functionality.
1973 NOTE: orientations, space_width, and visitor_* parameters are NOT respected
1974 in "layout" mode.
1976 kwargs:
1977 layout_mode_space_vertically (bool): include blank lines inferred from
1978 y distance + font height. Defaults to True.
1979 layout_mode_scale_weight (float): multiplier for string length when calculating
1980 weighted average character width. Defaults to 1.25.
1981 layout_mode_strip_rotated (bool): layout mode does not support rotated text.
1982 Set to False to include rotated text anyway. If rotated text is discovered,
1983 layout will be degraded and a warning will result. Defaults to True.
1984 layout_mode_debug_path (Path | None): if supplied, must target a directory.
1985 creates the following files with debug information for layout mode
1986 functions if supplied:
1988 - fonts.json: output of self._layout_mode_fonts
1989 - tjs.json: individual text render ops with corresponding transform matrices
1990 - bts.json: text render ops left justified and grouped by BT/ET operators
1991 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1992 layout_mode_font_height_weight (float): multiplier for font height when calculating
1993 blank lines. Defaults to 1.
1995 Returns:
1996 The extracted text
1998 """
1999 if extraction_mode not in ["plain", "layout"]:
2000 raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
2001 if extraction_mode == "layout":
2002 for visitor in (
2003 "visitor_operand_before",
2004 "visitor_operand_after",
2005 "visitor_text",
2006 ):
2007 if locals()[visitor]:
2008 logger_warning(
2009 f"Argument {visitor} is ignored in layout mode",
2010 __name__,
2011 )
2012 return self._layout_mode_text(
2013 space_vertically=kwargs.get("layout_mode_space_vertically", True),
2014 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
2015 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
2016 debug_path=kwargs.get("layout_mode_debug_path"),
2017 font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
2018 )
2019 if len(args) >= 1:
2020 if isinstance(args[0], str):
2021 if len(args) >= 3:
2022 if isinstance(args[2], (tuple, int)):
2023 orientations = args[2]
2024 else:
2025 raise TypeError(f"Invalid positional parameter {args[2]}")
2026 if len(args) >= 4:
2027 if isinstance(args[3], (float, int)):
2028 space_width = args[3]
2029 else:
2030 raise TypeError(f"Invalid positional parameter {args[3]}")
2031 elif isinstance(args[0], (tuple, int)):
2032 orientations = args[0]
2033 if len(args) >= 2:
2034 if isinstance(args[1], (float, int)):
2035 space_width = args[1]
2036 else:
2037 raise TypeError(f"Invalid positional parameter {args[1]}")
2038 else:
2039 raise TypeError(f"Invalid positional parameter {args[0]}")
2041 if isinstance(orientations, int):
2042 orientations = (orientations,)
2044 return self._extract_text(
2045 self,
2046 self.pdf,
2047 orientations,
2048 space_width,
2049 PG.CONTENTS,
2050 visitor_operand_before,
2051 visitor_operand_after,
2052 visitor_text,
2053 )
2055 def extract_xform_text(
2056 self,
2057 xform: EncodedStreamObject,
2058 orientations: tuple[int, ...] = (0, 90, 270, 360),
2059 space_width: float = 200.0,
2060 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
2061 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
2062 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
2063 ) -> str:
2064 """
2065 Extract text from an XObject.
2067 Args:
2068 xform:
2069 orientations:
2070 space_width: force default space width (if not extracted from font (default 200)
2071 visitor_operand_before:
2072 visitor_operand_after:
2073 visitor_text:
2075 Returns:
2076 The extracted text
2078 """
2079 return self._extract_text(
2080 xform,
2081 self.pdf,
2082 orientations,
2083 space_width,
2084 None,
2085 visitor_operand_before,
2086 visitor_operand_after,
2087 visitor_text,
2088 )
2090 def _get_fonts(self) -> tuple[set[str], set[str]]:
2091 """
2092 Get the names of embedded fonts and unembedded fonts.
2094 Returns:
2095 A tuple (set of embedded fonts, set of unembedded fonts)
2097 """
2098 obj = self.get_object()
2099 assert isinstance(obj, DictionaryObject)
2100 fonts: set[str] = set()
2101 embedded: set[str] = set()
2102 fonts, embedded = _get_fonts_walk(obj, fonts, embedded)
2103 unembedded = fonts - embedded
2104 return embedded, unembedded
2106 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
2107 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
2108 default user space units, defining the boundaries of the physical medium on
2109 which the page is intended to be displayed or printed."""
2111 cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
2112 """
2113 A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
2114 default user space units, defining the visible region of default user
2115 space.
2117 When the page is displayed or printed, its contents are to be clipped
2118 (cropped) to this rectangle and then imposed on the output medium in some
2119 implementation-defined manner. Default value: same as
2120 :attr:`mediabox<mediabox>`.
2121 """
2123 bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
2124 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
2125 default user space units, defining the region to which the contents of the
2126 page should be clipped when output in a production environment."""
2128 trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
2129 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
2130 default user space units, defining the intended dimensions of the finished
2131 page after trimming."""
2133 artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
2134 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
2135 default user space units, defining the extent of the page's meaningful
2136 content as intended by the page's creator."""
2138 @property
2139 def annotations(self) -> Optional[ArrayObject]:
2140 if "/Annots" not in self:
2141 return None
2142 return cast(ArrayObject, self["/Annots"])
2144 @annotations.setter
2145 def annotations(self, value: Optional[ArrayObject]) -> None:
2146 """
2147 Set the annotations array of the page.
2149 Typically you do not want to set this value, but append to it.
2150 If you append to it, remember to add the object first to the writer
2151 and only add the indirect object.
2152 """
2153 if value is None:
2154 if "/Annots" not in self:
2155 return
2156 del self[NameObject("/Annots")]
2157 else:
2158 self[NameObject("/Annots")] = value
2161class _VirtualList(Sequence[PageObject]):
2162 def __init__(
2163 self,
2164 length_function: Callable[[], int],
2165 get_function: Callable[[int], PageObject],
2166 ) -> None:
2167 self.length_function = length_function
2168 self.get_function = get_function
2169 self.current = -1
2171 def __len__(self) -> int:
2172 return self.length_function()
2174 @overload
2175 def __getitem__(self, index: int) -> PageObject:
2176 ...
2178 @overload
2179 def __getitem__(self, index: slice) -> Sequence[PageObject]:
2180 ...
2182 def __getitem__(
2183 self, index: Union[int, slice]
2184 ) -> Union[PageObject, Sequence[PageObject]]:
2185 if isinstance(index, slice):
2186 indices = range(*index.indices(len(self)))
2187 cls = type(self)
2188 return cls(indices.__len__, lambda idx: self[indices[idx]])
2189 if not isinstance(index, int):
2190 raise TypeError("Sequence indices must be integers")
2191 len_self = len(self)
2192 if index < 0:
2193 # support negative indexes
2194 index += len_self
2195 if not (0 <= index < len_self):
2196 raise IndexError("Sequence index out of range")
2197 return self.get_function(index)
2199 def __delitem__(self, index: Union[int, slice]) -> None:
2200 if isinstance(index, slice):
2201 r = list(range(*index.indices(len(self))))
2202 # pages have to be deleted from last to first
2203 r.sort()
2204 r.reverse()
2205 for p in r:
2206 del self[p] # recursive call
2207 return
2208 if not isinstance(index, int):
2209 raise TypeError("Index must be integers")
2210 len_self = len(self)
2211 if index < 0:
2212 # support negative indexes
2213 index += len_self
2214 if not (0 <= index < len_self):
2215 raise IndexError("Index out of range")
2216 ind = self[index].indirect_reference
2217 assert ind is not None
2218 parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
2219 "/Parent", None
2220 )
2221 first = True
2222 while parent is not None:
2223 parent = cast(DictionaryObject, parent.get_object())
2224 try:
2225 i = cast(ArrayObject, parent["/Kids"]).index(ind)
2226 del cast(ArrayObject, parent["/Kids"])[i]
2227 first = False
2228 try:
2229 assert ind is not None
2230 del ind.pdf.flattened_pages[index] # case of page in a Reader
2231 except Exception: # pragma: no cover
2232 pass
2233 if "/Count" in parent:
2234 parent[NameObject("/Count")] = NumberObject(
2235 cast(int, parent["/Count"]) - 1
2236 )
2237 if len(cast(ArrayObject, parent["/Kids"])) == 0:
2238 # No more objects in this part of this subtree
2239 ind = parent.indirect_reference
2240 parent = parent.get("/Parent", None)
2241 except ValueError: # from index
2242 if first:
2243 raise PdfReadError(f"Page not found in page tree: {ind}")
2244 break
2246 def __iter__(self) -> Iterator[PageObject]:
2247 for i in range(len(self)):
2248 yield self[i]
2250 def __str__(self) -> str:
2251 p = [f"PageObject({i})" for i in range(self.length_function())]
2252 return f"[{', '.join(p)}]"
2255def _get_fonts_walk(
2256 obj: DictionaryObject,
2257 fnt: set[str],
2258 emb: set[str],
2259) -> tuple[set[str], set[str]]:
2260 """
2261 Get the set of all fonts and all embedded fonts.
2263 Args:
2264 obj: Page resources dictionary
2265 fnt: font
2266 emb: embedded fonts
2268 Returns:
2269 A tuple (fnt, emb)
2271 If there is a key called 'BaseFont', that is a font that is used in the document.
2272 If there is a key called 'FontName' and another key in the same dictionary object
2273 that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
2274 embedded.
2276 We create and add to two sets, fnt = fonts used and emb = fonts embedded.
2278 """
2279 fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
2281 def process_font(f: DictionaryObject) -> None:
2282 nonlocal fnt, emb
2283 f = cast(DictionaryObject, f.get_object()) # to be sure
2284 if "/BaseFont" in f:
2285 fnt.add(cast(str, f["/BaseFont"]))
2287 if (
2288 ("/CharProcs" in f)
2289 or (
2290 "/FontDescriptor" in f
2291 and any(
2292 x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys
2293 )
2294 )
2295 or (
2296 "/DescendantFonts" in f
2297 and "/FontDescriptor"
2298 in cast(
2299 DictionaryObject,
2300 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2301 )
2302 and any(
2303 x
2304 in cast(
2305 DictionaryObject,
2306 cast(
2307 DictionaryObject,
2308 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2309 )["/FontDescriptor"],
2310 )
2311 for x in fontkeys
2312 )
2313 )
2314 ):
2315 # the list comprehension ensures there is FontFile
2316 try:
2317 emb.add(cast(str, f["/BaseFont"]))
2318 except KeyError:
2319 emb.add("(" + cast(str, f["/Subtype"]) + ")")
2321 if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
2322 for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]):
2323 process_font(f)
2324 if "/Resources" in obj:
2325 if "/Font" in cast(DictionaryObject, obj["/Resources"]):
2326 for f in cast(
2327 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]
2328 ).values():
2329 process_font(f)
2330 if "/XObject" in cast(DictionaryObject, obj["/Resources"]):
2331 for x in cast(
2332 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]
2333 ).values():
2334 _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
2335 if "/Annots" in obj:
2336 for a in cast(ArrayObject, obj["/Annots"]):
2337 _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
2338 if "/AP" in obj:
2339 if (
2340 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get(
2341 "/Type"
2342 )
2343 == "/XObject"
2344 ):
2345 _get_fonts_walk(
2346 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]),
2347 fnt,
2348 emb,
2349 )
2350 else:
2351 for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]):
2352 _get_fonts_walk(cast(DictionaryObject, a), fnt, emb)
2353 return fnt, emb # return the sets for each page