Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from collections.abc import Iterable, Iterator, Sequence
32from dataclasses import dataclass
33from decimal import Decimal
34from io import BytesIO
35from pathlib import Path
36from typing import (
37 Any,
38 Callable,
39 Literal,
40 Optional,
41 Union,
42 cast,
43 overload,
44)
46from ._cmap import (
47 build_char_map,
48)
49from ._protocols import PdfCommonDocProtocol
50from ._text_extraction import (
51 _layout_mode,
52)
53from ._text_extraction._text_extractor import TextExtraction
54from ._utils import (
55 CompressedTransformationMatrix,
56 TransformationMatrixType,
57 _human_readable_bytes,
58 logger_warning,
59 matrix_multiply,
60)
61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING
62from .constants import AnnotationDictionaryAttributes as ADA
63from .constants import ImageAttributes as IA
64from .constants import PageAttributes as PG
65from .constants import Resources as RES
66from .errors import PageSizeNotDefinedError, PdfReadError
67from .filters import _xobj_to_image
68from .generic import (
69 ArrayObject,
70 ContentStream,
71 DictionaryObject,
72 EncodedStreamObject,
73 FloatObject,
74 IndirectObject,
75 NameObject,
76 NullObject,
77 NumberObject,
78 PdfObject,
79 RectangleObject,
80 StreamObject,
81 is_null_or_none,
82)
# Pillow is an optional dependency: import it when available and remember the
# outcome so that image-related code can fail with a clear message later on.
try:
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Placeholder so that annotations and isinstance() checks referring to
    # ``Image`` still resolve when Pillow is missing.
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images

# Name of the page box used when merging pages.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the box stored under ``name`` as a ``RectangleObject``.

    An entry that already is a ``RectangleObject`` is returned untouched.
    When the entry is missing or null, the keys listed in ``defaults`` are
    tried in order and the first one yielding a value is used. Indirect
    references are resolved through ``self.pdf``. The converted rectangle is
    written back under ``name`` before being returned.

    Args:
        self: The dictionary-like object (with a ``pdf`` attribute) to read.
        name: Key of the requested box.
        defaults: Keys to fall back to when ``name`` is not set.

    Returns:
        The rectangle for ``name``.
    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box

    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break

    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)

    rectangle = RectangleObject(box)  # type: ignore
    # Cache the converted value so the next access takes the fast path.
    _set_rectangle(self, name, rectangle)
    return rectangle
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value
115def _delete_rectangle(self: Any, name: str) -> None:
116 del self[name]
119def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
120 return property(
121 lambda self: _get_rectangle(self, name, fallback),
122 lambda self, value: _set_rectangle(self, name, value),
123 lambda self: _delete_rectangle(self, name),
124 )
class Transformation:
    """
    Represent a 2D transformation.

    A transformation between two coordinate systems is described by a 3-by-3
    matrix of the following form::

        a b 0
        c d 0
        e f 1

    Only six of the nine elements can vary, which is why PDF usually writes
    such a matrix as the six-element array [ a b c d e f ].

    Points are transformed by matrix multiplication::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # Six-element form (a, b, c, d, e, f); the default is the identity.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to its full 3-by-3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f), i.e. the first two columns of the
            three rows, read row by row.

        """
        return tuple(  # type: ignore[return-value]
            matrix[row][column] for row in range(3) for column in range(2)
        )

    def _to_cm(self) -> str:
        # Render the matrix as the operand list of a PDF ``cm`` operator.
        operands = " ".join(f"{self.ctm[index]:.4f}" for index in range(6))
        return f"{operands} cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The Transformation to apply after this one.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = op.transform(Transformation((-1, 0, 0, 1, width, 0)))  # then horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        shifted_e = current[4] + tx
        shifted_f = current[5] + ty
        return Transformation(
            ctm=(current[0], current[1], current[2], current[3], shifted_e, shifted_f)
        )

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin typically is the lower-left corner of the page; translate
        the contents / the page boxes to scale around another point. When
        only one factor is given it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        scaled = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(scaled))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        op: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        rotated = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(rotated))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list when ``pt`` is a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        new_x = convert(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4])
        new_y = convert(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _swap_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with its last ``.suffix`` replaced by ``extension``.

        A name without any dot is kept whole and ``extension`` is appended;
        slicing with ``rfind`` would drop its last character in that case.
        """
        stem, dot, _old_suffix = name.rpartition(".")
        return (stem if dot else name) + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image stream
        of that PDF is stored in place of the current object of the writer,
        and ``name``, ``data`` and ``image`` are refreshed from it.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let Pillow produce a PDF and borrow the image stream it generated.
        buffer = BytesIO()
        new_image.save(buffer, "PDF", **kwargs)
        reader = PdfReader(buffer)
        # Look the image up once: every access to ``images[0]`` decodes it.
        new_reference = reader.pages[0].images[0].indirect_reference
        assert new_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            new_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference

        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        self.name = self._swap_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as __str__ with a hash of the data inserted before the final ")".
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The sequence is lazy: ``ids_function`` lists the image identifiers and
    ``get_function`` builds the ``ImageFile`` for one identifier on demand.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Enumerate the identifiers once instead of once more per image.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image (by position or identifier) or a sub-sequence.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.
        """
        if isinstance(index, (str, list, tuple)):
            # Direct lookup by identifier; no need to enumerate all images.
            return self.get_function(index)

        ids = self.ids_function()
        if isinstance(index, slice):
            selected = [ids[position] for position in range(*index.indices(len(ids)))]
            return type(self)((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")

        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Enumerate the identifiers once; going through ``self[i]`` would
        # call ``ids_function`` again for every single image.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
478class PageObject(DictionaryObject):
479 """
480 PageObject represents a single page within a PDF file.
482 Typically these objects will be created by accessing the
483 :attr:`pages<pypdf.PdfReader.pages>` property of the
484 :class:`PdfReader<pypdf.PdfReader>` class, but it is
485 also possible to create an empty page with the
486 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
488 Args:
489 pdf: PDF file the page belongs to.
490 indirect_reference: Stores the original indirect reference to
491 this object in its source PDF
493 """
495 original_page: "PageObject" # very local use in writer when appending
def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the referenced dictionary when one is given.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Original indirect reference of this page in its
            source PDF; when it is not null, the entries of the object it
            points to are copied into this page.
    """
    DictionaryObject.__init__(self)
    self.indirect_reference = indirect_reference
    self.pdf = pdf
    # Filled lazily; ``None`` means "not extracted yet".
    self.inline_images: Optional[dict[str, ImageFile]] = None
    if not is_null_or_none(indirect_reference):
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as type marker so that
    a page hashes like a plain DictionaryObject with the same entries.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))
def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the encoded ``id()`` of this page."""
    identity_suffix = f"{id(self)}".encode()
    return super().hash_value_data() + identity_suffix
@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The value is expressed in multiples of 1/72 inch: 1 means that a user
    space unit is 1/72 inch, 3 means that it is 3/72 inch. When the page
    has no such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)
@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page
def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are inspected; the page itself
            when ``None``.
        ancest: Names of the XObjects traversed to reach ``obj``.
        call_stack: Indirect references already visited, shared between the
            recursive calls to stop on circular references.

    Returns:
        One entry per image: the XObject name for an image found directly in
        ``obj`` when ``ancest`` is empty, otherwise the list
        ``[*ancest, name]``. The keys of the inline images follow.
    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    # NOTE(review): ``_i`` is None for the initial call and for any object
    # without an indirect reference, so such nested objects look "already
    # visited" and are skipped — confirm this is intended.
    if _i in call_stack:
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    lst: list[Union[str, list[str]]] = []
    # No usable XObject resources: only the inline images can be reported.
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return [] if self.inline_images is None else list(self.inline_images.keys())

    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        # Only stream objects can be images or forms.
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    assert self.inline_images is not None
    # NOTE(review): this also runs in every recursive call, so the inline
    # image keys appear to be added once per traversed form as well —
    # confirm whether duplicates are expected.
    lst.extend(list(self.inline_images.keys()))
    return lst
def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: Name of an image XObject, key of an inline image (``~n~``), or
            a list/tuple of names leading to an image nested in XObjects.
        obj: Object whose ``/Resources`` are searched; the page itself when
            ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If the XObject resources or the requested entry are
            missing.
    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path of length one is just a plain name.
        id = id[0]

    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Inline images do not need any XObject resources.
        if not (id[0] == "~" and id[-1] == "~"):
            raise

    if not isinstance(id, str):
        # in a subobject: descend one level and look up the remaining path
        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    decoded = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    extension, byte_stream = decoded[:2]
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=decoded[2],
        indirect_reference=xobjs[id].indirect_reference,
    )
@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    The images of the page can be accessed with the following keys:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the following methods:
      `.replace(new_image: PIL.Image.Image, **kwargs)` :
      replace the image in the pdf with the new image
      applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    fetch_image = self._get_image
    return VirtualListImages(list_ids, fetch_image)
def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Values listed in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other ``NameObject`` is looked up in the
    ``/ColorSpace`` resources of the page; all remaining values are
    returned unchanged.

    Raises:
        PdfReadError: If a custom name cannot be found in the resources.
    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources and for the entry itself
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed by ``~n~``, where ``n`` is the zero-based position
        of the image in the content stream; empty when the page has no
        content.

    Raises:
        PdfReadError: If a bare ``BI``, ``ID`` or ``EI`` operator is met.
    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect settings and raw data of every inline image.
    raw_images = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            raw_images.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object and decode it.
    images = {}
    for index, (settings, stream_data) in enumerate(raw_images):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, raw_value in settings.items():
            if short_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [
                        self._translate_value_inline_image(short_key, item)
                        for item in raw_value
                    ]
                )
            else:
                value = self._translate_value_inline_image(short_key, raw_value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key])
            # The first occurrence of a key wins.
            init.setdefault(full_key, value)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        images[f"~{index}~"] = ImageFile(
            name=f"~{index}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images
@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    When setting it, the value is rounded to the nearest multiple of 90 and
    reduced modulo 360. A page without the entry reports 0.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)
def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0; the content is rotated around the
    center of the media box and shifted so that the rotated media box has
    its lower-left corner at the origin. Every box present on the page is
    transformed accordingly.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the other way
    self.rotation = 0

    media = RectangleObject(self.mediabox)
    center_x = float(media.left + media.width / 2)
    center_y = float(media.bottom + media.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)

    # Shift the rotated media box back to the origin.
    corner_a = trsf.apply_on(media.lower_left)
    corner_b = trsf.apply_on(media.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        left = min(corner_a[0], corner_b[0])
        bottom = min(corner_a[1], corner_b[1])
        right = max(corner_a[0], corner_b[0])
        top = max(corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject((left, bottom, right, top))
def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self
def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the entries of one resource category of ``res2`` into ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, work on a fresh dictionary initialised from
            ``res1[resource]``; otherwise update ``res1[resource]`` in place.

    Returns:
        A tuple ``(merged, renames)`` where ``merged`` is the merged
        dictionary and ``renames`` maps each key of ``res2`` that had to be
        stored under another name to that new name.
    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        # The page is not attached to a document.
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        # ``page2res`` and ``new_res`` are bound further down in the
        # enclosing function, before this helper is first called.
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                # Clone the value into the writer and, when the clone is an
                # indirect object, store its reference instead of the object.
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
        # Keep the merged entries sorted by key.
        # NOTE(review): the nesting of these four lines is assumed to be
        # inside the loop (re-sorting after every key) — confirm against the
        # upstream file.
        lst = sorted(new_res.items())
        new_res.clear()
        for el in lst:
            new_res[el[0]] = el[1]
    return new_res, rename_res
911 @staticmethod
912 def _content_stream_rename(
913 stream: ContentStream,
914 rename: dict[Any, Any],
915 pdf: Optional[PdfCommonDocProtocol],
916 ) -> ContentStream:
917 if not rename:
918 return stream
919 stream = ContentStream(stream, pdf)
920 for operands, _operator in stream.operations:
921 if isinstance(operands, list):
922 for i, op in enumerate(operands):
923 if isinstance(op, NameObject):
924 operands[i] = rename.get(op, op)
925 elif isinstance(operands, dict):
926 for i, op in operands.items():
927 if isinstance(op, NameObject):
928 operands[i] = rename.get(op, op)
929 else:
930 raise KeyError(f"Type of operands is {type(operands)}")
931 return stream
@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: The content to wrap in a new ``ContentStream``.
        pdf: Document passed to the ``ContentStream``.
        ctm: The matrix values used as operands of the ``cm`` operator.

    Returns:
        The new content stream starting with the ``cm`` operation.
    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return stream
def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list, the data of all its streams is
    concatenated.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    contents = self[PG.CONTENTS].get_object()
    if isinstance(contents, list):
        return b"".join(part.get_object().get_data() for part in contents)
    return cast(EncodedStreamObject, contents).get_data()
def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped in a ``ContentStream``, or ``None``
        if it does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # The page is not attached to a document.
        pdf = None
    contents = self[PG.CONTENTS]
    if is_null_or_none(contents):
        return None
    return ContentStream(contents.get_object(), pdf)
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without indirect reference the content is simply stored in
    the page. Otherwise the new content is registered in, or written over
    an object of, the document owning the page.

    Args:
        content: new content; if None delete the content field.
    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        # Note that this early return leaves ``inline_images`` untouched.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Try to nullify every stream of the previous content array.
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): ``self._objects`` is not set anywhere in the
                # visible part of this class; if the page has no such
                # attribute, the AttributeError is swallowed and nothing is
                # nullified — confirm whether
                # ``self.indirect_reference.pdf._objects`` was intended.
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # Register each element in the document and keep what it returns.
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion requested: nullify the stored object and drop the entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No previous content object to reuse: add the content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the slot of the previous content object for the new content.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content stream of another page with the one of this page.

    Resource references (e.g. fonts) of both pages are kept.
    The cropbox and the other boundaries of this page are left as they
    are; the mediabox only changes when ``expand`` is set.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: If True (default), the content of ``page2`` is placed after
            the content of this page, i.e. it is drawn "on top" of it;
            otherwise it is placed before it (drawn under).
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    merge_options = {"over": over, "expand": expand}
    self._merge_page(page2, **merge_options)
def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When this page belongs to a document offering ``_add_object`` (used to
    detect a PdfWriter), the work is delegated to ``_merge_page_writer``.
    In every other case (page without indirect reference, or page owned by
    a document without ``_add_object``) the generic merge below is used:
    new ``/Resources`` and ``/Annots`` entries are built from both pages and
    the combined content replaces the content of this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged; it returns the stream
            to use.
        ctm: Transformation matrix handed over to ``_expand_mediabox`` when
            ``expand`` is set.
        over: If True, the content of ``page2`` is placed after the content
            of this page, otherwise before it.
        expand: If True, the mediabox of this page is expanded through
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
        # The owning document is not a writer: do not return here, the
        # generic merge below handles this case. Returning at this point
        # would silently skip the merge.
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename: dict[Any, Any] = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Annotations of both pages are collected, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the selected box of page2: "re" adds
        # the rectangle, "W" and "n" are inserted right after it.
        # The operands are stored as a list: a lazy ``map`` iterator would
        # be exhausted after its first traversal and leave the "re"
        # operator without operands on any later pass.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, the page having an indirect reference.

    The ``/Resources`` of this page are completed in place, the annotations
    of ``page2`` are cloned into the document owning this page (with their
    ``/Rect`` and ``/QuadPoints`` transformed by ``ctm``), and the combined
    content is handed over to ``replace_contents`` as an array.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged; it returns the stream
            to use.
        ctm: Transformation matrix applied to the annotation coordinates
            and handed over to ``_expand_mediabox`` when ``expand`` is set.
        over: If True, the content of ``page2`` is placed after the content
            of this page, otherwise before it.
        expand: If True, the mediabox of this page is expanded through
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename: dict[Any, Any] = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            # Only the renaming table is used here; the first returned
            # value is ignored.
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for annotation_ref in cast(ArrayObject, page2[PG.ANNOTS]):
            source_annot = annotation_ref.get_object()
            cloned_annot = source_annot.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # The two transformed corners may be swapped by the
            # transformation, hence the min/max normalisation.
            r = cast(ArrayObject, source_annot["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            cloned_annot[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in source_annot:
                # /QuadPoints holds 8 numbers per quadrilateral and may
                # describe several of them: transform every coordinate
                # pair instead of the first four pairs only.
                q = cast(ArrayObject, source_annot["/QuadPoints"])
                transformed_points: list[Any] = []
                for index in range(0, len(q) - 1, 2):
                    transformed_points.extend(
                        trsf.apply_on((q[index], q[index + 1]), True)
                    )
                cloned_annot[NameObject("/QuadPoints")] = ArrayObject(
                    transformed_points
                )
            try:
                cloned_annot["/Popup"][NameObject("/Parent")] = (
                    cloned_annot.indirect_reference
                )
            except KeyError:
                pass
            try:
                cloned_annot[NameObject("/P")] = self.indirect_reference
                annots.append(cloned_annot.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the selected box of page2: "re" adds
        # the rectangle, "W" and "n" are inserted right after it.
        # The operands are stored as a list: a lazy ``map`` iterator would
        # be exhausted after its first traversal and leave the "re"
        # operator without operands on any later pass.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
1317 def _expand_mediabox(
1318 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
1319 ) -> None:
1320 corners1 = (
1321 self.mediabox.left.as_numeric(),
1322 self.mediabox.bottom.as_numeric(),
1323 self.mediabox.right.as_numeric(),
1324 self.mediabox.top.as_numeric(),
1325 )
1326 corners2 = (
1327 page2.mediabox.left.as_numeric(),
1328 page2.mediabox.bottom.as_numeric(),
1329 page2.mediabox.left.as_numeric(),
1330 page2.mediabox.top.as_numeric(),
1331 page2.mediabox.right.as_numeric(),
1332 page2.mediabox.top.as_numeric(),
1333 page2.mediabox.right.as_numeric(),
1334 page2.mediabox.bottom.as_numeric(),
1335 )
1336 if ctm is not None:
1337 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1338 new_x = tuple(
1339 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
1340 for i in range(0, 8, 2)
1341 )
1342 new_y = tuple(
1343 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
1344 for i in range(0, 8, 2)
1345 )
1346 else:
1347 new_x = corners2[0:8:2]
1348 new_y = corners2[1:8:2]
1349 lowerleft = (min(new_x), min(new_y))
1350 upperright = (max(new_x), max(new_y))
1351 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))
1352 upperright = (
1353 max(corners1[2], upperright[0]),
1354 max(corners1[3], upperright[1]),
1355 )
1357 self.mediabox.lower_left = lowerleft
1358 self.mediabox.upper_right = upperright
def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, after
    applying a transformation matrix to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` object whose
            ``ctm`` is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def apply_matrix(page2_content: Any) -> ContentStream:
        # Prefix the merged stream with the transformation matrix.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, apply_matrix, matrix, over, expand)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with
    the merged stream scaled by the same factor on both axes.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )
def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with
    the merged stream rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with
    the merged stream translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Transform the four corners of the mediabox and keep their bounding box.
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    factors = tuple(float(value) for value in matrix)
    xs = [factors[0] * x + factors[2] * y + factors[4] for x, y in corner_points]
    ys = [factors[1] * x + factors[3] * y + factors[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the ``/Rect`` of the annotations, the ``/BBox`` of every viewport
    and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # The order of the box updates is kept as is on purpose.
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.mediabox = self.mediabox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # /VP is either a single viewport dictionary or an array of them.
        # Every viewport is rescaled (not only the first one), and an empty
        # array is simply left alone.
        entries = viewport if isinstance(viewport, ArrayObject) else [viewport]
        for entry in entries:
            viewport_dict = entry.get_object()
            if (
                not isinstance(viewport_dict, DictionaryObject)
                or "/BBox" not in viewport_dict
            ):
                # Nothing that can be rescaled in this entry.
                continue
            bbox = viewport_dict["/BBox"]
            viewport_dict[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )
def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly: the same factor is used for the horizontal and
    the vertical axis, for the content as well as for the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)
def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The scaling factors are the requested dimensions divided by the
    current width and height of the mediabox.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value handed over to ``flate_encode`` of the content stream
            (presumably the compression level; -1 by default).

    Raises:
        ValueError: The encoded content can neither be written into the
            object table of its document nor be set through
            ``replace_contents``, because the page is not part of a
            document offering ``_add_object``.

    """
    stream = self.get_contents()
    if stream is None:
        return

    compressed = stream.flate_encode(level)
    try:
        # Overwrite the existing content object in place.
        stream_ref = stream.indirect_reference
        stream_ref.pdf._objects[stream_ref.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_ref = self.indirect_reference
        if page_ref is None or not hasattr(page_ref.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the index of this page in the ``pages`` of
    the document it belongs to.

    Returns:
        Page number; None if the page is not attached to a PDF or is not
        found among the pages of its document.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None
def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump for debugging text extraction.

    Returns:
        One line per operation of the ``/Contents`` stream, a separator,
        then for each font of the page resources its name and
        representation, followed by its ``/Encoding`` and ``/ToUnicode``
        data when they can be read; "No Font" if a lookup raises KeyError.

    """
    chunks: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # Only the string parts of a TJ array are shown in clear text.
        strings = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        chunks.append(op.decode("utf-8") + " " + "".join(strings) + repr(ope) + "\n")
    chunks.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            chunks.append(fo + "\n")
            font_entry = self[PG.RESOURCES]["/Font"][fo]  # type:ignore
            chunks.append(repr(font_entry) + "\n")
            # Both entries are optional: whatever fails is left out.
            try:
                chunks.append(repr(font_entry["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                chunks.append(font_entry["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        chunks.append("No Font\n")
    return "".join(chunks)
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by processing the operations of its content.

    See extract_text for most arguments.

    Args:
        obj: The object to extract the text from. Its resources are looked
            up on the object itself or, failing that, on its ``/Parent``
            chain.
        pdf: Handed over to ``ContentStream`` when the content has to be
            wrapped into one.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources can be found
        or when no content can be read.

    """
    extractor = TextExtraction()
    # Character maps keyed by font name, as returned by build_char_map.
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # Fonts whose character map cannot be built are left out.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Processed as "T*" followed by "Tj" with the same operands.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Processed as "Tw", "Tc", "T*" and "Tj", fed from the operands
            # in that order.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    # Strings are shown through "Tj"; a number whose absolute
                    # value reaches the threshold adds a space, unless the
                    # current text is empty or already ends with one.
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Processed as "TL" with the negated second operand, then "Td".
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline; an empty output
                # raises IndexError and is left untouched.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Images carry no text; any other XObject is extracted
                # through extract_xform_text.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                # Best effort: a failing XObject is reported and skipped.
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Restart with an empty text and the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed by the "layout" mode text extraction.

    The ``/Resources`` of this page and of each of its ``/Parent`` objects
    are visited in turn.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def resolved(value: Any) -> Any:
        # Indirect objects are replaced by their target, arrays by the list
        # of the targets of their entries; anything else is kept as is.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [entry.get_object() for entry in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                font_dict = {
                    key: resolved(value) for key, value in font_dict_obj.items()
                }
                # mypy really sucks at unpacking
                collected[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected
def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned for a page without ``/Contents`` or without any
        text show operation.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(
                fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )

    # /Contents is optional: a page without it (or with a null entry) has
    # no text at all, which is not an error.
    if "/Contents" not in self or is_null_or_none(self["/Contents"]):
        return ""

    ops = iter(
        ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
    )
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
1918 def extract_text(
1919 self,
1920 *args: Any,
1921 orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
1922 space_width: float = 200.0,
1923 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1924 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
1925 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1926 extraction_mode: Literal["plain", "layout"] = "plain",
1927 **kwargs: Any,
1928 ) -> str:
1929 """
1930 Locate all text drawing commands, in the order they are provided in the
1931 content stream, and extract the text.
1933 This works well for some PDF files, but poorly for others, depending on
1934 the generator used. This will be refined in the future.
1936 Do not rely on the order of text coming out of this function, as it
1937 will change if this function is made more sophisticated.
1939 Arabic and Hebrew are extracted in the correct order.
1940 If required a custom RTL range of characters can be defined;
1941 see function set_custom_rtl.
1943 Additionally you can provide visitor methods to get informed on all
1944 operations and all text objects.
1945 For example in some PDF files this can be useful to parse tables.
1947 Args:
1948 orientations: list of orientations extract_text will look for
1949 default = (0, 90, 180, 270)
1950 note: currently only 0 (up),90 (turned left), 180 (upside down),
1951 270 (turned right)
1952 Silently ignored in "layout" mode.
1953 space_width: force default space width
1954 if not extracted from font (default: 200)
1955 Silently ignored in "layout" mode.
1956 visitor_operand_before: function to be called before processing an operation.
1957 It has four arguments: operator, operand-arguments,
1958 current transformation matrix and text matrix.
1959 Ignored with a warning in "layout" mode.
1960 visitor_operand_after: function to be called after processing an operation.
1961 It has four arguments: operator, operand-arguments,
1962 current transformation matrix and text matrix.
1963 Ignored with a warning in "layout" mode.
1964 visitor_text: function to be called when extracting some text at some position.
1965 It has five arguments: text, current transformation matrix,
1966 text matrix, font-dictionary and font-size.
1967 The font-dictionary may be None in case of unknown fonts.
1968 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
1969 Ignored with a warning in "layout" mode.
1970 extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
1971 "layout" for experimental layout mode functionality.
1972 NOTE: orientations, space_width, and visitor_* parameters are NOT respected
1973 in "layout" mode.
1975 kwargs:
1976 layout_mode_space_vertically (bool): include blank lines inferred from
1977 y distance + font height. Defaults to True.
1978 layout_mode_scale_weight (float): multiplier for string length when calculating
1979 weighted average character width. Defaults to 1.25.
1980 layout_mode_strip_rotated (bool): layout mode does not support rotated text.
1981 Set to False to include rotated text anyway. If rotated text is discovered,
1982 layout will be degraded and a warning will result. Defaults to True.
1983 layout_mode_debug_path (Path | None): if supplied, must target a directory.
1984 creates the following files with debug information for layout mode
1985 functions if supplied:
1987 - fonts.json: output of self._layout_mode_fonts
1988 - tjs.json: individual text render ops with corresponding transform matrices
1989 - bts.json: text render ops left justified and grouped by BT/ET operators
1990 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
1991 layout_mode_font_height_weight (float): multiplier for font height when calculating
1992 blank lines. Defaults to 1.
1994 Returns:
1995 The extracted text
1997 """
1998 if extraction_mode not in ["plain", "layout"]:
1999 raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
2000 if extraction_mode == "layout":
2001 for visitor in (
2002 "visitor_operand_before",
2003 "visitor_operand_after",
2004 "visitor_text",
2005 ):
2006 if locals()[visitor]:
2007 logger_warning(
2008 f"Argument {visitor} is ignored in layout mode",
2009 __name__,
2010 )
2011 return self._layout_mode_text(
2012 space_vertically=kwargs.get("layout_mode_space_vertically", True),
2013 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
2014 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
2015 debug_path=kwargs.get("layout_mode_debug_path"),
2016 font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
2017 )
2018 if len(args) >= 1:
2019 if isinstance(args[0], str):
2020 if len(args) >= 3:
2021 if isinstance(args[2], (tuple, int)):
2022 orientations = args[2]
2023 else:
2024 raise TypeError(f"Invalid positional parameter {args[2]}")
2025 if len(args) >= 4:
2026 if isinstance(args[3], (float, int)):
2027 space_width = args[3]
2028 else:
2029 raise TypeError(f"Invalid positional parameter {args[3]}")
2030 elif isinstance(args[0], (tuple, int)):
2031 orientations = args[0]
2032 if len(args) >= 2:
2033 if isinstance(args[1], (float, int)):
2034 space_width = args[1]
2035 else:
2036 raise TypeError(f"Invalid positional parameter {args[1]}")
2037 else:
2038 raise TypeError(f"Invalid positional parameter {args[0]}")
2040 if isinstance(orientations, int):
2041 orientations = (orientations,)
2043 return self._extract_text(
2044 self,
2045 self.pdf,
2046 orientations,
2047 space_width,
2048 PG.CONTENTS,
2049 visitor_operand_before,
2050 visitor_operand_after,
2051 visitor_text,
2052 )
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    This is a thin wrapper: every argument is handed unchanged to
    ``self._extract_text`` together with ``self.pdf``.

    Args:
        xform: the XObject stream whose text is extracted.
        orientations: tuple of text orientations, forwarded as is.
        space_width: force default space width
            if not extracted from font (default: 200)
        visitor_operand_before: optional callable taking four arguments,
            forwarded as is.
        visitor_operand_after: optional callable taking four arguments,
            forwarded as is.
        visitor_text: optional callable taking five arguments,
            forwarded as is.

    Returns:
        The extracted text

    """
    visitors = (visitor_operand_before, visitor_operand_after, visitor_text)
    # The fifth positional argument is None here, whereas page-level extraction
    # passes PG.CONTENTS — presumably the key of the content stream
    # (TODO confirm against _extract_text).
    return self._extract_text(
        xform, self.pdf, orientations, space_width, None, *visitors
    )
def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The font names are gathered by ``_get_fonts_walk`` starting from this
    object; a font counts as unembedded when it was found but is not in the
    embedded set.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    used, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, used - embedded
# Page boundary boxes. Each accessor is built by ``_create_rectangle_accessor``
# from the box's own key plus a tuple of further keys — presumably the
# fallbacks consulted, in order, when the box itself is absent
# (TODO confirm against the helper, which is defined outside this section).

# No further keys are given for the media box.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# The three boxes below all list "/CropBox" and then the media box key.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""
@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under ``/Annots``, or ``None`` when the page has no
        such entry.

    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array, or ``None`` to remove the
            ``/Annots`` entry. Removing it from a page that has no
            annotations is a no-op.

    """
    if value is None:
        # Guard the deletion: clearing annotations on a page without an
        # "/Annots" entry must not raise KeyError.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value
class _VirtualList(Sequence[PageObject]):
    """
    Lazy, list-like view over pages that are fetched on demand.

    The sequence holds no pages itself: its length is whatever
    ``length_function`` returns and item ``i`` is ``get_function(i)``.
    Deleting an item edits the page tree the page belongs to.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page at a non-negative index.

        """
        self.length_function = length_function
        self.get_function = get_function
        # Not read anywhere in this class; kept in case other code uses it.
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new lazy view for a slice.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an integer index is out of range.

        """
        if isinstance(index, slice):
            # The slice is another lazy view that maps its own positions back
            # onto this sequence.
            indices = range(*index.indices(len(self)))
            cls = type(self)
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or a slice of pages, from the page tree.

        The page is removed from its parent's ``/Kids``; a parent left without
        kids is in turn removed from its own parent, and every node that lost
        a kid has its ``/Count`` decremented.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an integer index is out of range.
            PdfReadError: if the page is not listed in its parent's ``/Kids``.

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for p in sorted(range(*index.indices(len(self))), reverse=True):
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        # NOTE(review): once a node keeps at least one kid the loop stops (the
        # next lookup of ``ind`` raises ValueError), so ancestors above that
        # node do not get their "/Count" decremented — confirm whether intended.
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                removing_page = first
                first = False
                if removing_page:
                    # Drop the cached entry only when the page itself is
                    # removed. Pruning an emptied ancestor on a later pass must
                    # not delete a second, unrelated page from the cache.
                    # Deliberately best-effort: the owner may have no such cache.
                    try:
                        assert ind is not None
                        del ind.pdf.flattened_pages[index]  # case of page in a Reader
                    except Exception:  # pragma: no cover
                        pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"
2252def _get_fonts_walk(
2253 obj: DictionaryObject,
2254 fnt: set[str],
2255 emb: set[str],
2256) -> tuple[set[str], set[str]]:
2257 """
2258 Get the set of all fonts and all embedded fonts.
2260 Args:
2261 obj: Page resources dictionary
2262 fnt: font
2263 emb: embedded fonts
2265 Returns:
2266 A tuple (fnt, emb)
2268 If there is a key called 'BaseFont', that is a font that is used in the document.
2269 If there is a key called 'FontName' and another key in the same dictionary object
2270 that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
2271 embedded.
2273 We create and add to two sets, fnt = fonts used and emb = fonts embedded.
2275 """
2276 fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
2278 def process_font(f: DictionaryObject) -> None:
2279 nonlocal fnt, emb
2280 f = cast(DictionaryObject, f.get_object()) # to be sure
2281 if "/BaseFont" in f:
2282 fnt.add(cast(str, f["/BaseFont"]))
2284 if (
2285 ("/CharProcs" in f)
2286 or (
2287 "/FontDescriptor" in f
2288 and any(
2289 x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys
2290 )
2291 )
2292 or (
2293 "/DescendantFonts" in f
2294 and "/FontDescriptor"
2295 in cast(
2296 DictionaryObject,
2297 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2298 )
2299 and any(
2300 x
2301 in cast(
2302 DictionaryObject,
2303 cast(
2304 DictionaryObject,
2305 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2306 )["/FontDescriptor"],
2307 )
2308 for x in fontkeys
2309 )
2310 )
2311 ):
2312 # the list comprehension ensures there is FontFile
2313 try:
2314 emb.add(cast(str, f["/BaseFont"]))
2315 except KeyError:
2316 emb.add("(" + cast(str, f["/Subtype"]) + ")")
2318 if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
2319 for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]):
2320 process_font(f)
2321 if "/Resources" in obj:
2322 if "/Font" in cast(DictionaryObject, obj["/Resources"]):
2323 for f in cast(
2324 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]
2325 ).values():
2326 process_font(f)
2327 if "/XObject" in cast(DictionaryObject, obj["/Resources"]):
2328 for x in cast(
2329 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]
2330 ).values():
2331 _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
2332 if "/Annots" in obj:
2333 for a in cast(ArrayObject, obj["/Annots"]):
2334 _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
2335 if "/AP" in obj:
2336 if (
2337 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get(
2338 "/Type"
2339 )
2340 == "/XObject"
2341 ):
2342 _get_fonts_walk(
2343 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]),
2344 fnt,
2345 emb,
2346 )
2347 else:
2348 for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]):
2349 _get_fonts_walk(cast(DictionaryObject, a), fnt, emb)
2350 return fnt, emb # return the sets for each page