Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
30import math
31from dataclasses import dataclass
32from decimal import Decimal
33from io import BytesIO
34from pathlib import Path
35from typing import (
36 Any,
37 Callable,
38 Dict,
39 Iterable,
40 Iterator,
41 List,
42 Literal,
43 Optional,
44 Sequence,
45 Set,
46 Tuple,
47 Union,
48 cast,
49 overload,
50)
52from ._cmap import (
53 build_char_map,
54)
55from ._protocols import PdfCommonDocProtocol
56from ._text_extraction import (
57 _layout_mode,
58)
59from ._text_extraction._text_extractor import TextExtraction
60from ._utils import (
61 CompressedTransformationMatrix,
62 TransformationMatrixType,
63 _human_readable_bytes,
64 logger_warning,
65 matrix_multiply,
66)
67from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING
68from .constants import AnnotationDictionaryAttributes as ADA
69from .constants import ImageAttributes as IA
70from .constants import PageAttributes as PG
71from .constants import Resources as RES
72from .errors import PageSizeNotDefinedError, PdfReadError
73from .filters import _xobj_to_image
74from .generic import (
75 ArrayObject,
76 ContentStream,
77 DictionaryObject,
78 EncodedStreamObject,
79 FloatObject,
80 IndirectObject,
81 NameObject,
82 NullObject,
83 NumberObject,
84 PdfObject,
85 RectangleObject,
86 StreamObject,
87 is_null_or_none,
88)
try:
    from PIL.Image import Image
except ImportError:
    # Pillow is optional: the module stays importable and an error is raised
    # only when image features are actually used.
    Image = object  # type: ignore
    pil_not_imported = True
else:
    pil_not_imported = False

# Page box name used for merging; pypdf<=3.4.0 used 'trimbox'.
MERGE_CROP_BOX = "cropbox"
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back on ``defaults``.

    When the entry is already a ``RectangleObject`` it is returned untouched.
    Otherwise, if the entry is null/missing, the keys in ``defaults`` are tried
    in order until one yields a value that is not ``None``. Indirect objects
    are resolved through ``self.pdf`` and the result is converted to a
    ``RectangleObject`` which is written back under ``name`` before being
    returned.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the requested box.
        defaults: Keys to try when ``name`` has no usable value.

    Returns:
        The rectangle found (and now cached under ``name``).

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box
    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    self[NameObject(name)] = value
def _delete_rectangle(self: Any, name: str) -> None:
    """Remove the entry stored under ``name`` from ``self``."""
    self.__delitem__(name)
def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
    """
    Build a property giving access to the box stored under ``name``.

    Reading goes through ``_get_rectangle`` (with ``fallback`` as default
    keys), writing through ``_set_rectangle`` and deleting through
    ``_delete_rectangle``.
    """

    # The closures deliberately carry no docstring so that the resulting
    # property keeps an empty ``__doc__``.
    def read_box(self: Any) -> Any:
        return _get_rectangle(self, name, fallback)

    def write_box(self: Any, value: Any) -> None:
        return _set_rectangle(self, name, value)

    def drop_box(self: Any) -> None:
        return _delete_rectangle(self, name)

    return property(read_box, write_box, drop_box)
class Transformation:
    """
    A 2D affine transformation.

    Moving from one coordinate system to another is described by a 3-by-3
    matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six entries can vary, which is why PDF usually writes the matrix as
    the six-element array [ a b c d e f ].

    Points are transformed by matrix multiplication::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1

    Example:
        >>> from pypdf import Transformation
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation as a tuple of three rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f).

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Text of the ``cm`` operation for this matrix, four decimals per entry.
        entries = " ".join(f"{self.ctm[i]:.4f}" for i in range(6))
        return f"{entries} cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import Transformation
            >>> op = Transformation((1, 0, 0, -1, 0, height)) # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d, e, f = (self.ctm[i] for i in range(6))
        return Transformation(ctm=(a, b, c, d, e + tx, f + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            # only sy given: scale uniformly
            sx = sy
        elif sy is None:
            # only sx given: scale uniformly
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
        ...

    @overload
    def apply_on(
        self, pt: Tuple[float, float], as_object: bool = False
    ) -> Tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[Tuple[float, float], List[float]],
        as_object: bool = False,
    ) -> Union[Tuple[float, float], List[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list (same kind as ``pt``) holding the transformed
            point (x', y').

        """
        convert = FloatObject if as_object else float
        x, y = float(pt[0]), float(pt[1])
        ctm = self.ctm
        new_x = convert(x * ctm[0] + y * ctm[2] + ctm[4])
        new_y = convert(x * ctm[1] + y * ctm[3] + ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)
@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _swap_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with everything from its last dot replaced by ``extension``.

        A name without any dot is kept whole and ``extension`` is appended
        (slicing with the ``-1`` returned by ``rfind`` would otherwise drop
        its last character).
        """
        dot = name.rfind(".")
        stem = name if dot == -1 else name[:dot]
        return stem + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let PIL write a one-page PDF holding the new image and read the
        # resulting image object back from it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        # Looked up once: every access to ``images[0]`` decodes the image again.
        new_reference = reader.pages[0].images[0].indirect_reference
        assert new_reference is not None

        # Put the new object in the slot of the old one so that existing
        # references keep pointing at it.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            new_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        self.name = self._swap_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data added before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of image ids is obtained from ``ids_function`` each time it is
    needed, and an image is only built (through ``get_function``) when it is
    requested.
    """

    def __init__(
        self,
        ids_function: Callable[[], List[Union[str, List[str]]]],
        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> List[Union[str, List[str]]]:
        """Return the ids of all images."""
        return self.ids_function()

    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
        """Return ``(id, image)`` pairs for all images."""
        # The ids are computed once; going through ``self[...]`` would
        # recompute them for every single image.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, List[str], Tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image (by position or by id) or a sub-list (by slice).

        Raises:
            TypeError: If ``index`` is of an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        # Access by id does not need the list of ids at all.
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if isinstance(index, slice):
            ids = self.ids_function()
            selected = [ids[pos] for pos in range(*index.indices(len(ids)))]
            return type(self)((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        ids = self.ids_function()
        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Iterate over a single snapshot of the ids instead of recomputing
        # the whole list for each position.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"
482class PageObject(DictionaryObject):
483 """
484 PageObject represents a single page within a PDF file.
486 Typically these objects will be created by accessing the
487 :attr:`pages<pypdf.PdfReader.pages>` property of the
488 :class:`PdfReader<pypdf.PdfReader>` class, but it is
489 also possible to create an empty page with the
490 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
492 Args:
493 pdf: PDF file the page belongs to.
494 indirect_reference: Stores the original indirect reference to
495 this object in its source PDF
497 """
499 original_page: "PageObject" # very local use in writer when appending
def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the entries of the referenced object if any.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Original indirect reference of this page; when
            it is not null, the dictionary it points to is copied into the
            new page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # inline images are loaded on first use
    self.inline_images: Optional[Dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source = not is_null_or_none(indirect_reference)
    if has_source:
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as type marker so that
    the result matches the one of a plain DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))
def hash_value_data(self) -> bytes:
    """Return the parent class data followed by ``id(self)`` in decimal."""
    return super().hash_value_data() + b"%d" % id(self)
@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is 1/72 inch: a value of 1 stands for a user space unit of
    1/72 inch, a value of 3 for a user space unit of 3/72 inch.
    Defaults to 1 when the page has no such entry.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)
@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the media box
    of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()
    if width is None or height is None:
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height
    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page
def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[List[str]] = None,
    call_stack: Optional[List[Any]] = None,
) -> List[Union[str, List[str]]]:
    """
    List the ids of the images reachable from ``obj``.

    Image XObjects found directly in the resources are reported by name.
    Those found inside other stream XObjects (forms) are reported as the list
    of names leading to them. The ids of the page's inline images are added
    at the end of the page-level result only, so they are not repeated for
    each form XObject visited.

    Args:
        obj: Object whose resources are scanned; the page itself if ``None``.
        ancest: Names of the XObjects already traversed to reach ``obj``.
        call_stack: Indirect references already visited, used to stop
            on circular references.

    Returns:
        The list of image ids.

    """
    # Inline images are read from the page contents, hence they belong to
    # the page-level call and not to the nested form XObjects.
    is_page_level = obj is None or obj is self
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # already visited: stop here
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    inline_ids: List[Union[str, List[str]]] = []
    if is_page_level and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: List[Union[str, List[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        candidate = x_object[o]
        if not isinstance(candidate, StreamObject):
            continue
        if candidate[IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(candidate, [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst
def _get_image(
    self,
    id: Union[str, List[str], Tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` matching ``id``.

    Args:
        id: Name of the image, or sequence of names leading to an image
            nested in XObjects. Names starting and ending with ``~``
            designate inline images.
        obj: Object whose resources are searched; the page if ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If the image (or the XObject dictionary) cannot be found.

    """
    container = cast(DictionaryObject, self) if obj is None else obj
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # a path with one step is the same as the plain name
        id = id[0]
    try:
        resources = cast(DictionaryObject, container[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # the XObject dictionary is only optional for inline images
        if id[0] != "~" or id[-1] != "~":
            raise
    if not isinstance(id, str):
        # in a subobject
        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    extension, byte_stream, pil_image = _xobj_to_image(
        cast(DictionaryObject, xobjs[id])
    )
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=xobjs[id].indirect_reference,
    )
@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    It gives access to all images on the page. An image can be addressed by:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the following methods:
      `.replace(new_image: PIL.Image.Image, **kwargs)` :
         replace the image in the pdf with the new image
         applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )
def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image.

    Values listed in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other name is looked up in the ``/ColorSpace`` resources
    of the page; values that are not names are returned unchanged.

    Raises:
        PdfReadError: If a custom name cannot be found in the resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
def _get_inline_images(self) -> Dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        The images keyed by ``~<number>~``, numbered from 0 in the order
        they appear in the content; empty if the page has no content.

    Raises:
        PdfReadError: If a stray ``BI``, ``ID`` or ``EI`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect the settings and data of all inline images.
    raw_images = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            raw_images.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object, then an ImageFile.
    files = {}
    for num, (settings, stream_data) in enumerate(raw_images):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for key, value in settings.items():
            if key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(key, item) for item in value]
                )
            else:
                value = self._translate_value_inline_image(key, value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[key])
            # the first value met for a key wins
            init.setdefault(full_key, value)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    When set, the value is rounded to the nearest multiple of 90 and
    reduced modulo 360.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)
def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0 and the content is rotated around the
    center of the media box, then shifted so that the rotated media box
    starts at the origin. The boxes present on the page are updated with
    the same transformation.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)
    center_x = float(media_box.left + media_box.width / 2)
    center_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    # bring the lower-left corner of the rotated media box back to (0, 0)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)
    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        # the corners may have swapped: order the coordinates again
        x_low, x_high = min(corner_a[0], corner_b[0]), max(corner_a[0], corner_b[0])
        y_low, y_high = min(corner_a[1], corner_b[1]), max(corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject((x_low, y_low, x_high, y_high))
def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    remainder = angle % 90
    if remainder != 0:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self
def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Merge the ``resource`` entry of ``res2`` into the one of ``res1``.

    Entries of ``res2`` whose name is already used in ``res1`` with a
    different value are stored under a new name (``<name>-<index>``).

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, the merge is done in a new dictionary initialized
            from ``res1[resource]``; otherwise ``res1[resource]`` itself is
            modified.

    Returns:
        A tuple (merged resources, renaming) where renaming maps each key of
        ``res2`` that had to be renamed to its new name.

    """
    # Objects are cloned only when the page is referenced from an object
    # exposing ``_add_object``.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    # NOTE: this closure reads ``page2res`` and ``new_res`` which are bound
    # further down, before its first call.
    def compute_unique_key(base_key: str) -> Tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                # store the reference of the clone when it has one
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
    # re-insert the entries in sorted order
    lst = sorted(new_res.items())
    new_res.clear()
    for el in lst:
        new_res[el[0]] = el[1]
    return new_res, rename_res
@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: Dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Return a copy of ``stream`` where the names listed in ``rename`` are replaced.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names; when empty, ``stream``
            itself is returned.
        pdf: Passed to ``ContentStream`` when building the copy.

    Returns:
        The content stream with renamed operands.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return renamed_stream
@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Returns:
        A ``ContentStream`` built from ``contents`` whose first operation is
        ``cm`` with the values of ``ctm`` as operands.

    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return stream
def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list, the data of all its items are concatenated.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    contents = self[PG.CONTENTS].get_object()
    if not isinstance(contents, list):
        return cast(EncodedStreamObject, contents).get_data()
    return b"".join(part.get_object().get_data() for part in contents)
def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object as a ``ContentStream``, or ``None`` if it
        does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # no usable reference on this page
        pdf = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects

    When the page has no indirect reference, the content is stored directly
    in the page and nothing else is done.

    Args:
        content: new content; if None delete the content field.
    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        # (this path returns without resetting ``inline_images``)
        self[NameObject(PG.CONTENTS)] = content
        return

    # Existing contents stored as an array: try to nullify each of its items.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): the other branches below go through
                # ``self.indirect_reference.pdf._objects``; if the page has no
                # ``_objects`` attribute this raises AttributeError, which is
                # silently ignored - confirm ``self._objects`` is intended.
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    # New contents given as an array: each item is added through ``_add_object``.
    if isinstance(content, ArrayObject):
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion: nullify the stored object and remove the entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No current contents with an ``indirect_reference`` attribute:
        # add the new content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the reference of the current contents for the new content.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content stream of another page with the one of this page.

    Resources referenced by either page (fonts, ...) are kept. The boxes of
    this page (mediabox, cropbox, ...) stay untouched unless ``expand`` is
    set. By default the content of ``page2`` is appended after the content of
    this page, i.e. it is painted later and therefore "on top".

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation callback and no matrix: a plain merge.
    self._merge_page(page2, expand=expand, over=over)
def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page (implementation behind the merge_* methods).

    When this page is attached to a document exposing ``_add_object`` (used
    here to detect a PdfWriter) the work is delegated to
    ``_merge_page_writer``. Otherwise new resource, annotation and content
    objects are built and assigned to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged; it returns the stream to use.
        ctm: Transformation matrix forwarded to ``_expand_mediabox`` when
            ``expand`` is set.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: Expand the mediabox of this page to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # NOTE(review): this also swallows an AttributeError raised inside
        # _merge_page_writer and then falls back to the generic merge below.
        pass

    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Annotations of both pages are collected into one new array.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prefix the merged content with "<rect> re W n", i.e. clip it to the
        # box of page2 selected by MERGE_CROP_BOX. The operands are stored as
        # a real list: a ``map`` object is a single-use iterator and would be
        # empty the second time the operation is read.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page when this page belongs to a writer.

    Unlike ``_merge_page`` the resources and annotations of this page are
    updated in place, and the annotations of ``page2`` are cloned into the
    document owning this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged; it returns the stream to use.
        ctm: Transformation matrix applied to the rectangle and quad points
            of the copied annotations, and forwarded to ``_expand_mediabox``.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: Expand the mediabox of this page to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # The transformed rectangle is re-normalised so that it is again
            # expressed as (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # /QuadPoints holds 8 numbers per quadrilateral and may
                # describe several quadrilaterals; transform every (x, y)
                # pair instead of only the first four points.
                q = cast(ArrayObject, a["/QuadPoints"])
                transformed_points = []
                for i in range(0, len(q) - 1, 2):
                    transformed_points.extend(
                        trsf.apply_on((q[i], q[i + 1]), True)
                    )
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                # annotation without popup
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prefix the merged content with "<rect> re W n", i.e. clip it to the
        # box of page2 selected by MERGE_CROP_BOX. The operands are stored as
        # a real list: a ``map`` object is a single-use iterator and would be
        # empty the second time the operation is read.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
1319 def _expand_mediabox(
1320 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
1321 ) -> None:
1322 corners1 = (
1323 self.mediabox.left.as_numeric(),
1324 self.mediabox.bottom.as_numeric(),
1325 self.mediabox.right.as_numeric(),
1326 self.mediabox.top.as_numeric(),
1327 )
1328 corners2 = (
1329 page2.mediabox.left.as_numeric(),
1330 page2.mediabox.bottom.as_numeric(),
1331 page2.mediabox.left.as_numeric(),
1332 page2.mediabox.top.as_numeric(),
1333 page2.mediabox.right.as_numeric(),
1334 page2.mediabox.top.as_numeric(),
1335 page2.mediabox.right.as_numeric(),
1336 page2.mediabox.bottom.as_numeric(),
1337 )
1338 if ctm is not None:
1339 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]
1340 new_x = tuple(
1341 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
1342 for i in range(0, 8, 2)
1343 )
1344 new_y = tuple(
1345 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
1346 for i in range(0, 8, 2)
1347 )
1348 else:
1349 new_x = corners2[0:8:2]
1350 new_y = corners2[1:8:2]
1351 lowerleft = (min(new_x), min(new_y))
1352 upperright = (max(new_x), max(new_y))
1353 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))
1354 upperright = (
1355 max(corners1[2], upperright[0]),
1356 max(corners1[3], upperright[1]),
1357 )
1359 self.mediabox.lower_left = lowerleft
1360 self.mediabox.upper_right = upperright
def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Variant of :meth:`~pypdf._page.PageObject.merge_page` where the merged
    stream is first run through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` instance is accepted
            as well, in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def transform_page2_content(page2_content: Any) -> ContentStream:
        # Prefix the content of page2 with the requested matrix.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, transform_page2_content, matrix, over, expand)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Variant of :meth:`~pypdf._page.PageObject.merge_page` where the merged
    stream is scaled through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)
def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Variant of :meth:`~pypdf._page.PageObject.merge_page` where the merged
    stream is rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    turning = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, turning, over, expand)
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Variant of :meth:`~pypdf._page.PageObject.merge_page` where the merged
    stream is translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shifting = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shifting, over, expand)
def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: When True, the mediabox is replaced by the bounding box of
            its four corners after they went through the matrix.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (mediabox, cropbox, etc.),
    the contents of the page, the ``/Rect`` of its annotations and the
    ``/BBox`` of every viewport listed in ``/VP``.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    self.mediabox = self.mediabox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        # scaled in place: x values by sx, y values by sy
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # /VP is an array of viewport dictionaries; a lone dictionary is
        # tolerated as well. Every viewport gets its /BBox scaled, not only
        # the first one, and an empty array is simply a no-op.
        if isinstance(viewport, ArrayObject):
            viewport_dicts = [entry.get_object() for entry in viewport]
        else:
            viewport_dicts = [viewport]
        for viewport_dict in viewport_dicts:
            bbox = viewport_dict["/BBox"]
            viewport_dict[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )
def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly: the same factor is handed to
    :meth:`scale` for the horizontal and the vertical axis, which transforms
    the content and updates the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)
def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions: the factors are derived from
    the current mediabox size and handed to :meth:`scale`, which transforms
    the content and updates the page size.

    Args:
        width: The new width.
        height: The new height.

    """
    current_width = float(self.mediabox.width)
    horizontal_factor = width / current_width
    current_height = float(self.mediabox.height)
    vertical_factor = height / current_height
    self.scale(horizontal_factor, vertical_factor)
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value handed over to ``flate_encode`` of the content stream.

    Raises:
        ValueError: The compressed stream can neither be stored in place of
            the existing content object nor through ``replace_contents``.

    """
    content = self.get_contents()
    if content is None:
        # nothing to compress
        return

    content_obj = content.flate_encode(level)
    try:
        # Preferred: overwrite the slot of the existing content object.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = content_obj  # type: ignore
    except AttributeError:
        owned_by_writer = self.indirect_reference is not None and hasattr(
            self.indirect_reference.pdf, "_add_object"
        )
        if not owned_by_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(content_obj)
@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the position of this page in the ``pages``
    list of the PDF it belongs to.

    Returns:
        Page number; None if the page is not attached to a PDF or is not
        found among its pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # attached to a document, but not part of its page list
        return None
def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump helping to debug text extraction.

    The dump lists every operation of the page content followed by, for each
    font of the page resources, its name, its representation and (when
    available) its ``/Encoding`` and decoded ``/ToUnicode`` data.

    Returns:
        The dump as one string.

    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # for TJ, the string parts of the array are shown in clear
        shown = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(
            op.decode("utf-8") + " " + "".join(shown) + ope.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            pieces.append(fo + "\n")
            pieces.append(
                self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n"  # type:ignore
            )
            try:
                encoding_text = self[PG.RESOURCES]["/Font"][fo][  # type:ignore
                    "/Encoding"
                ].__repr__()
                pieces.append(encoding_text + "\n")
            except Exception:
                pass
            try:
                to_unicode_text = (
                    self[PG.RESOURCES]["/Font"][fo][  # type:ignore
                        "/ToUnicode"
                    ]
                    .get_data()
                    .decode()
                )
                pieces.append(to_unicode_text + "\n")
            except Exception:
                pass

    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by walking through its content operations.

    See extract_text for most arguments.

    Args:
        obj: object holding the content and (possibly through its
            ``/Parent`` chain) the ``/Resources``.
        pdf: document handed over to ``ContentStream`` when the content has
            to be parsed.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = PG.CONTENTS

    Returns:
        The extracted text; an empty string if no resources or no content
        can be found.

    """
    extractor = TextExtraction()
    # character maps, one entry per font name of the resources
    cmaps: Dict[
        str,
        Tuple[
            str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # fonts for which no character map can be built are skipped
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # handled as T* followed by Tj
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # handled as Tw, Tc, T* and finally Tj with the remaining operands
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # a numeric entry at least as large as the threshold is
                    # turned into a space, unless the text already ends with one
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # handled as TL (with the negated second operand) followed by Td
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # flush the pending text before dealing with the XObject
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                # make sure the output ends with a newline
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                # the output is still empty
                pass
            try:
                xobj = resources_dict["/XObject"]
                # images carry no text; any other XObject is extracted
                # through extract_xform_text
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                # a failing XObject is reported but does not stop the extraction
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # reset the pending text and memorize the current matrices
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output
def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed by the "layout" text extraction mode.

    The ``/Resources`` of this page and of every ancestor reached through
    ``/Parent`` are inspected.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: Dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                # resolve indirect values (also inside arrays) of the font dictionary
                font_dict = {}
                for key, value in font_dict_obj.items():
                    if isinstance(value, IndirectObject):
                        font_dict[key] = value.get_object()
                    elif isinstance(value, ArrayObject):
                        font_dict[key] = [item.get_object() for item in value]
                    else:
                        font_dict[key] = value
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # top of the hierarchy reached
            node = None

    return fonts
def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned for a page without content.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(
                fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )

    try:
        content = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    except (AttributeError, KeyError):
        # no content can be extracted (certainly empty page); handled the
        # same way as in the "plain" extraction mode
        return ""
    ops = iter(content.operations)
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
def extract_text(
    self,
    *args: Any,
    orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Positional arguments are accepted in two forms: either
    ``(orientations, space_width)``, or a leading string followed by one
    ignored value and then ``(orientations, space_width)``.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in supplied_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        # Position of ``orientations`` among the positional arguments:
        # 2 when the call starts with a string, 0 otherwise.
        if isinstance(args[0], str):
            start = 2
        elif isinstance(args[0], (tuple, int)):
            start = 0
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

        if len(args) > start:
            if not isinstance(args[start], (tuple, int)):
                raise TypeError(f"Invalid positional parameter {args[start]}")
            orientations = args[start]
        if len(args) > start + 1:
            if not isinstance(args[start + 1], (float, int)):
                raise TypeError(f"Invalid positional parameter {args[start + 1]}")
            space_width = args[start + 1]

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text`` together
    with ``self.pdf``; no content key is passed (``None``).

    Args:
        xform: The XObject whose text is extracted.
        orientations: Text orientations handed to ``_extract_text``.
            NOTE(review): the default lists 360 and omits 180 -- confirm
            this is intended.
        space_width: Default space width handed to ``_extract_text``
            (default 200).
        visitor_operand_before: Optional callable taking four arguments,
            forwarded to ``_extract_text``.
        visitor_operand_after: Optional callable taking four arguments,
            forwarded to ``_extract_text``.
        visitor_text: Optional callable taking five arguments, forwarded to
            ``_extract_text``.

    Returns:
        The extracted text

    """
    content_key = None  # the XObject itself is processed, not a page entry
    visitors = (visitor_operand_before, visitor_operand_after, visitor_text)
    return self._extract_text(
        xform, self.pdf, orientations, space_width, content_key, *visitors
    )
def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Collect the font names of this page, split by embedding status.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    # The walk fills and returns both sets: every font found, and the
    # subset that is embedded.
    used, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, used - embedded
# Page boundary rectangles. Each attribute is produced by
# _create_rectangle_accessor from a page-dictionary key and a tuple of
# other keys.
# NOTE(review): the tuple presumably lists the keys consulted, in order, when
# the primary key is absent -- confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""
@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under "/Annots", or None when the page has no
        such entry.
    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: The new annotations array, or None to remove the "/Annots"
            entry. Passing None when the page has no annotations is a
            no-op.
    """
    if value is None:
        # Clearing annotations that are already absent must not raise
        # KeyError: the getter reports that very state as None.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value
class _VirtualList(Sequence[PageObject]):
    """
    Lazily evaluated sequence of pages.

    The list holds no pages itself: its length and its items are obtained
    from the two callables given to the constructor each time they are
    needed. Deleting an item removes the page from the page tree it belongs
    to.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: Callable returning the current number of pages.
            get_function: Callable returning the page at a non-negative index.
        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a lazy sub-list for a slice.

        Raises:
            TypeError: If index is neither an int nor a slice.
            IndexError: If the integer index is out of range.
        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The slice is itself virtual: it maps its own positions onto
            # this list through the precomputed range.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page tree.

        The page is removed from the "/Kids" array of its parent, whose
        "/Count" is decremented. A parent left without kids is in turn
        removed from its own parent.

        Raises:
            TypeError: If index is neither an int nor a slice.
            IndexError: If the integer index is out of range.
            PdfReadError: If the page is not listed in its parent's "/Kids".
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for p in sorted(range(*index.indices(len(self))), reverse=True):
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            kids = cast(ArrayObject, parent["/Kids"])
            try:
                i = kids.index(ind)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break
            del kids[i]
            if first:
                # Only the page itself is an entry of the flat page list.
                # Later passes prune emptied intermediate nodes and must not
                # drop another page that has shifted into the same position.
                first = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
            if "/Count" in parent:
                parent[NameObject("/Count")] = NumberObject(
                    cast(int, parent["/Count"]) - 1
                )
            if len(kids) == 0:
                # No more objects in this part of this subtree
                ind = parent.indirect_reference
                parent = parent.get("/Parent", None)
            # Otherwise the same parent is examined again; the reference is
            # no longer found there and the loop ends through ValueError.

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"
2254def _get_fonts_walk(
2255 obj: DictionaryObject,
2256 fnt: Set[str],
2257 emb: Set[str],
2258) -> Tuple[Set[str], Set[str]]:
2259 """
2260 Get the set of all fonts and all embedded fonts.
2262 Args:
2263 obj: Page resources dictionary
2264 fnt: font
2265 emb: embedded fonts
2267 Returns:
2268 A tuple (fnt, emb)
2270 If there is a key called 'BaseFont', that is a font that is used in the document.
2271 If there is a key called 'FontName' and another key in the same dictionary object
2272 that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
2273 embedded.
2275 We create and add to two sets, fnt = fonts used and emb = fonts embedded.
2277 """
2278 fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
2280 def process_font(f: DictionaryObject) -> None:
2281 nonlocal fnt, emb
2282 f = cast(DictionaryObject, f.get_object()) # to be sure
2283 if "/BaseFont" in f:
2284 fnt.add(cast(str, f["/BaseFont"]))
2286 if (
2287 ("/CharProcs" in f)
2288 or (
2289 "/FontDescriptor" in f
2290 and any(
2291 x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys
2292 )
2293 )
2294 or (
2295 "/DescendantFonts" in f
2296 and "/FontDescriptor"
2297 in cast(
2298 DictionaryObject,
2299 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2300 )
2301 and any(
2302 x
2303 in cast(
2304 DictionaryObject,
2305 cast(
2306 DictionaryObject,
2307 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
2308 )["/FontDescriptor"],
2309 )
2310 for x in fontkeys
2311 )
2312 )
2313 ):
2314 # the list comprehension ensures there is FontFile
2315 try:
2316 emb.add(cast(str, f["/BaseFont"]))
2317 except KeyError:
2318 emb.add("(" + cast(str, f["/Subtype"]) + ")")
2320 if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
2321 for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]):
2322 process_font(f)
2323 if "/Resources" in obj:
2324 if "/Font" in cast(DictionaryObject, obj["/Resources"]):
2325 for f in cast(
2326 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]
2327 ).values():
2328 process_font(f)
2329 if "/XObject" in cast(DictionaryObject, obj["/Resources"]):
2330 for x in cast(
2331 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]
2332 ).values():
2333 _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
2334 if "/Annots" in obj:
2335 for a in cast(ArrayObject, obj["/Annots"]):
2336 _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
2337 if "/AP" in obj:
2338 if (
2339 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get(
2340 "/Type"
2341 )
2342 == "/XObject"
2343 ):
2344 _get_fonts_walk(
2345 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]),
2346 fnt,
2347 emb,
2348 )
2349 else:
2350 for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]):
2351 _get_fonts_walk(cast(DictionaryObject, a), fnt, emb)
2352 return fnt, emb # return the sets for each page