Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from copy import deepcopy

33from dataclasses import asdict, dataclass

34from decimal import Decimal

35from io import BytesIO

36from pathlib import Path

37from typing import (

38 Any,

39 Callable,

40 Literal,

41 Optional,

42 Union,

43 cast,

44 overload,

45)

47from ._font import Font

48from ._protocols import PdfCommonDocProtocol

49from ._text_extraction import (

50 _layout_mode,

51)

52from ._text_extraction._text_extractor import TextExtraction

53from ._utils import (

54 CompressedTransformationMatrix,

55 TransformationMatrixType,

56 _human_readable_bytes,

57 deprecate,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .generic import (

68 ArrayObject,

69 ContentStream,

70 DictionaryObject,

71 EncodedStreamObject,

72 FloatObject,

73 IndirectObject,

74 NameObject,

75 NullObject,

76 NumberObject,

77 PdfObject,

78 RectangleObject,

79 StreamObject,

80 is_null_or_none,

81)

83try:

84 from PIL.Image import Image

86 pil_not_imported = False

87except ImportError:

88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10

89 pil_not_imported = True # error will be raised only when using images

91MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to ``defaults``.

    The resolved value is converted to a ``RectangleObject`` and written back
    to ``self`` under ``name`` so that later look-ups return it directly.

    Args:
        self: The dictionary-like object (with a ``pdf`` attribute) holding the boxes.
        name: Key of the requested box.
        defaults: Keys to try, in order, when ``name`` is missing or null.

    Returns:
        The rectangle found for ``name`` or for the first usable default.

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # Apply the same null test as for the primary key: a default that
            # is present but null is no more usable than a missing one.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Over-long arrays are tolerated: warn and keep the first four values.
        logger_warning(f"Expected four values, got {length}: {retval}", __name__)
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore
    _set_rectangle(self, name, retval)
    return retval

112

113

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` on ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

116

117

118def _delete_rectangle(self: Any, name: str) -> None:

119 del self[name]

120

121

122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

123 return property(

124 lambda self: _get_rectangle(self, name, fallback),

125 lambda self, value: _set_rectangle(self, name, value),

126 lambda self: _delete_rectangle(self, name),

127 )

128

129

class Transformation:
    """
    A 2D affine transformation.

    A change of coordinate system is described by a 3-by-3 matrix of the
    form::

        a b 0
        c d 0
        e f 1

    Only six entries can vary, so PDF usually writes the matrix as the
    six-element array ``[ a b c d e f ]``; that compact form is what
    :attr:`ctm` holds.

    Points are row vectors multiplied from the left::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to its full 3-by-3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        a, b, c, d, e, f = (self.ctm[i] for i in range(6))
        return ((a, b, 0), (c, d, 0), (e, f, 1))

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The entries ``(a, b, c, d, e, f)``, i.e. the first two columns of
            each row.

        """
        row_ab, row_cd, row_ef = matrix[0], matrix[1], matrix[2]
        return (row_ab[0], row_ab[1], row_cd[0], row_cd[1], row_ef[0], row_ef[1])

    def _to_cm(self) -> str:
        # Render the six entries with four decimals, followed by the "cm" operator.
        return " ".join(f"{self.ctm[i]:.4f}" for i in range(6)) + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        The result is the matrix product ``self.matrix × m.matrix``.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d, e, f = (self.ctm[i] for i in range(6))
        return Transformation(ctm=(a, b, c, d, e + tx, f + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin is typically the lower-left corner of the page; translate
        the contents / the page boxes to scale around another point. When only
        one factor is given it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)
        turning: TransformationMatrixType = (
            (cos_a, sin_a, 0),
            (-sin_a, cos_a, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` was a list,
            a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        a, b, c, d, e, f = (self.ctm[i] for i in range(6))
        moved = (
            convert(x * a + y * c + e),
            convert(x * b + y * d + f),
        )
        if isinstance(pt, list):
            return list(moved)
        return moved

324

325

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved to an in-memory PDF, its image object is put
        in place of the current one in the owning writer, and ``name``,
        ``data`` and ``image`` of this instance are refreshed accordingly.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Swap the stored object in place so existing references keep working.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Replace the old extension. A name without any "." has no extension
        # to strip; slicing with rfind() == -1 would drop its last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as __str__, with a hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

418

419

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of identifiers is produced on demand by ``ids_function`` and
    each image is built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers are str or list keys, so they go straight to
        # get_function; the identifier list is computed only once.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) designated by ``index``.

        Args:
            index: An identifier (str, list or tuple) passed as is to
                ``get_function``, an integer position (negative values count
                from the end), or a slice.

        Returns:
            One ``ImageFile``, or for a slice a new virtual list restricted
            to the selected identifiers.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Key look-ups do not need the list of identifiers at all.
            return self.get_function(index)
        if isinstance(index, slice):
            lst = self.ids_function()[index]
            cls = type(self)
            return cls((lambda: lst), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        lst = self.ids_function()
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Compute the identifiers once instead of once per element.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

481

482

483class PageObject(DictionaryObject):

484 """

485 PageObject represents a single page within a PDF file.

486

487 Typically these objects will be created by accessing the

488 :attr:`pages<pypdf.PdfReader.pages>` property of the

489 :class:`PdfReader<pypdf.PdfReader>` class, but it is

490 also possible to create an empty page with the

491 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

492

493 Args:

494 pdf: PDF file the page belongs to.

495 indirect_reference: Stores the original indirect reference to

496 this object in its source PDF

497

498 """

499

500 original_page: "PageObject" # very local use in writer when appending

501

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, optionally copying the entries of an existing object.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the source object; when it is not
            null, the entries of the referenced dictionary are copied in.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.indirect_reference = indirect_reference
    # Inline images are loaded lazily; None means "not loaded yet".
    self.inline_images: Optional[dict[str, ImageFile]] = None
    if not is_null_or_none(indirect_reference):
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

515

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is deliberately built with ``DictionaryObject`` as the
    type marker so that a page hashes like a DictionaryObject holding the
    same entries.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

530

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the ``id()`` of this object."""
    identity = str(id(self)).encode()
    return super().hash_value_data() + identity

535

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is expressed in multiples of 1/72 inch: 1 means one user space
    unit is 1/72 inch, 3 means it is 3/72 inch. When the page has no such
    entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

546

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    When ``width`` or ``height`` is ``None``, both dimensions are taken
    from the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        last_page = pdf.pages[len(pdf.pages) - 1]
        width = last_page.mediabox.width
        height = last_page.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

592

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects are reported by name; images nested in form XObjects are
    reported as the list of names leading to them. The identifiers of the
    page's inline images are appended once, by the outermost call only.

    Args:
        obj: Object whose resources are inspected; the page itself if None.
        ancest: Names of the form XObjects traversed to reach ``obj``.
        call_stack: Indirect references already visited, to stop on cycles.

    Returns:
        The identifiers found, XObject images first, inline images last.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: a form referencing itself (directly or not).
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # self.inline_images holds the inline images of the page. Recursive calls
    # (non-empty ``ancest``) must not return them again, otherwise each form
    # XObject would add a duplicate of every inline image to the result.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids.extend(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

630

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: Name of an image XObject, ``~n~`` key of an inline image, or the
            sequence of names leading to an image nested in form XObjects.
        obj: Object whose resources are searched; the page itself if None.

    Returns:
        The requested image.

    Raises:
        KeyError: If the resources or the requested entry cannot be found.

    """

    def is_inline_key(key: Any) -> bool:
        # Inline images are identified by keys wrapped in "~".
        return key[0] == "~" and key[-1] == "~"

    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    # A single-element path is the same as the plain name.
    if isinstance(id, list) and len(id) == 1:
        id = id[0]

    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObject resources are only acceptable for inline images.
        if not is_inline_key(id):
            raise

    if not isinstance(id, str):
        # in a subobject
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if is_inline_key(id):
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415

    extension, byte_stream, pil_image = _xobj_to_image(
        cast(DictionaryObject, xobjs[id])
    )
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=xobjs[id].indirect_reference,
    )

669

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of the images on the page.

    An image can be selected with:
      - A string (name of an image at the top level)
      - A tuple (path to an image within XObject forms)
      - An integer (position in the list)

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    Each returned ImageFile has the following properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the following methods:
      `.replace(new_image: PIL.Image.Image, **kwargs)` :
         replace the image in the pdf with the new image
         applying the saving parameters indicated (such as quality).
         Only images belonging to a PdfWriter can be replaced.

    Example usage:

      writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    build_image = self._get_image
    return VirtualListImages(list_ids, build_image)

709

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Values listed in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other name is looked up in the ``/ColorSpace``
    resources of the page; remaining values are returned unchanged.
    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources and for the entry itself
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

724

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed by ``~0~``, ``~1~``, ... in order of appearance;
        empty if the page has no content.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect the raw settings and data of each inline image.
    raw_images = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            raw_images.append(
                {"settings": operands["settings"], "__streamdata__": operands["data"]}
            )
            continue
        if operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each one into a stream object and decode it.
    extracted = {}
    for index, raw in enumerate(raw_images):
        stream_data = raw["__streamdata__"]
        stream_dict = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, value in raw["settings"].items():
            if short_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(short_key, item) for item in value]
                )
            else:
                value = self._translate_value_inline_image(short_key, value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key])
            # The first value seen for a key wins.
            stream_dict.setdefault(full_key, value)
        raw["object"] = EncodedStreamObject.initialize_from_dictionary(stream_dict)
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(raw["object"])
        key = f"~{index}~"
        extracted[key] = ImageFile(
            name=f"~{index}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return extracted

770

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    Valid values are the multiples of 90 degrees 0, 90, 180 and 270. The
    value is 0 when the page has no rotation entry. Assigning a value
    rounds it to the nearest multiple of 90 and reduces it modulo 360.
    This property does not affect ``/Contents``.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Adding 45 before the floor division rounds to the nearest quarter turn.
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

785

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0; the content and every box present are
    transformed instead.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box...
    centre_x = float(media_box.left + media_box.width / 2)
    centre_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ...then shift so the rotated media box starts at the origin.
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        xs = (corner_a[0], corner_b[0])
        ys = (corner_a[1], corner_b[1])
        # The corners may have swapped roles, so re-order them.
        self[NameObject(box_name)] = RectangleObject(
            (min(xs), min(ys), max(xs), max(ys))
        )

820

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

836

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, merge into a new dictionary initialised from
            ``res1[resource]``; otherwise update ``res1[resource]`` in place.

    Returns:
        A tuple (merged resources sorted by key, mapping of the ``res2``
        keys that had to be renamed to their new names).

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        # expect isinstance(pdf, PdfWriter)
        is_pdf_writer = hasattr(pdf, "_add_object")
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    if new_res1:
        merged = DictionaryObject()
        merged.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        merged = cast(DictionaryObject, res1[resource])
    incoming = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = incoming.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # Try the key as is, then "key-0", "key-1", ... The merged
        # dictionary holds finitely many keys, so the search always ends.
        candidate = base_key
        suffix = 0
        while candidate in merged:
            if merged.raw_get(candidate) == wanted:
                # a resource of this name with the exact same value exists
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    renamed = {}
    for key in incoming:
        unique_key, same_value = compute_unique_key(key)
        new_name = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            renamed[key] = new_name
        if same_value:
            continue
        if not is_pdf_writer:
            merged[new_name] = incoming.raw_get(key)
            continue
        merged[new_name] = incoming.raw_get(key).clone(pdf)
        try:
            # Store the reference of the clone when it has one.
            merged[new_name] = merged[new_name].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries are sorted.
    ordered = sorted(merged.items())
    merged.clear()
    for name, value in ordered:
        merged[name] = value
    return merged, renamed

917

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping of names to their replacement.
        pdf: Document passed to the new ``ContentStream``.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Only existing slots are overwritten, so the container keeps its size.
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return stream

939

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Return a ContentStream built from ``contents`` with a ``cm`` operation
    for ``ctm`` inserted as its first operation.
    """
    content_stream = ContentStream(contents, pdf)
    operations = content_stream.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return content_stream

956

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list, the data of all its items are concatenated.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    contents = self[PG.CONTENTS].get_object()
    if not isinstance(contents, list):
        return cast(EncodedStreamObject, contents).get_data()
    return b"".join(part.get_object().get_data() for part in contents)

971

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object as a ``ContentStream``, or ``None`` if it
        is missing or null. ``/Contents`` is optional, as described in
        §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # A page without a usable indirect reference has no document attached.
    pdf = getattr(getattr(self, "indirect_reference", None), "pdf", None)
    contents = self[PG.CONTENTS]
    if is_null_or_none(contents):
        return None
    return ContentStream(contents.get_object(), pdf)

992

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an ``indirect_reference`` the content is stored
    directly under ``/Contents``. Otherwise the document referenced by
    ``indirect_reference`` is used to replace or register the objects.
    Calling this for a page whose document is not a ``PdfWriter`` emits a
    deprecation notice.

    ``self.inline_images`` is reset to ``None`` at the end, except on the
    two early-return paths (unattached page; ``None`` content on a page
    without ``/Contents``).

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # The page is not attached to a document: store the content directly.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally (the noqa marks this as deliberate).
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # NOTE(review): named "writer", but per the check above this may be a
    # non-writer document; the except clauses below cover that case.
    writer = self.indirect_reference.pdf
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Existing contents are an array: replace every entry by a null object.
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    if isinstance(content, ArrayObject):
        # Register each entry of the new array and keep what _add_object returns.
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nothing to do when the page has no /Contents.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing contents object exposing an indirect reference
        # (self.get() returned None here when /Contents is missing).
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # Reuse the reference of the existing contents for the new content.
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1055

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content streams of this page and ``page2``.

    Resource references (e.g. fonts) from both pages are kept.
    The mediabox, cropbox, etc of this page are left untouched
    unless ``expand`` is set.
    By default the content stream of ``page2`` is appended after this
    page's content stream, so it is drawn afterwards, i.e. "on top".

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: put the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation and no matrix: a plain merge.
    self._merge_page(page2, expand=expand, over=over)

1077

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When this page's ``indirect_reference`` points to a document exposing
    ``_add_object``, the work is delegated to :meth:`_merge_page_writer`.
    Otherwise new ``/Resources`` and ``/Annots`` objects are built from
    both pages and the combined content replaces this page's content.

    Args:
        page2: The page whose content, resources and annotations are merged in.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before resource names are renamed.
        ctm: Matrix forwarded to :meth:`_expand_mediabox` when ``expand`` is set.
        over: Append the ``page2`` content after this page's content if True,
            else insert it before.
        expand: If True, grow the mediabox via :meth:`_expand_mediabox`.

    """
    # NOTE(review): the try block also wraps the delegated call, so an
    # AssertionError/AttributeError raised inside _merge_page_writer falls
    # through to the code below. Kept as-is to preserve existing behavior.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the rectangle: "re" + "W" + "n".
        # The operands are a real list (not a one-shot ``map`` iterator) so
        # the operation stays valid if the operations are read more than once.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1184

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Unlike :meth:`_merge_page`, the existing ``/Resources`` and ``/Annots``
    objects of this page are extended rather than replaced, and annotations
    of ``page2`` are cloned into the document referenced by
    ``self.indirect_reference``.

    Args:
        page2: The page whose content, resources and annotations are merged in.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before resource names are renamed.
        ctm: Matrix applied to annotation ``/Rect`` and ``/QuadPoints`` values
            and forwarded to :meth:`_expand_mediabox` when ``expand`` is set.
        over: Append the ``page2`` content after this page's content if True,
            else insert it before.
        expand: If True, grow the mediabox via :meth:`_expand_mediabox`.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            # Re-normalise: after the transformation the two corners may no
            # longer be lower-left / upper-right.
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 x n numbers (one quadrilateral per
                # 8 values), so transform every coordinate pair instead of
                # only the first quadrilateral.
                transformed_points = []
                for index in range(0, len(q) - 1, 2):
                    transformed_points.extend(
                        trsf.apply_on((q[index], q[index + 1]), True)
                    )
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the rectangle: "re" + "W" + "n".
        # The operands are a real list (not a one-shot ``map`` iterator) so
        # the operation stays valid if the operations are read more than once.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1322

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: Page whose mediabox corners must fit into this page's mediabox.
        ctm: Optional matrix applied to the four corners of the ``page2``
            mediabox before the comparison; ``None`` uses them unchanged.

    """
    own_left = self.mediabox.left.as_numeric()
    own_bottom = self.mediabox.bottom.as_numeric()
    own_right = self.mediabox.right.as_numeric()
    own_top = self.mediabox.top.as_numeric()

    other_left = page2.mediabox.left.as_numeric()
    other_bottom = page2.mediabox.bottom.as_numeric()
    other_top = page2.mediabox.top.as_numeric()
    other_right = page2.mediabox.right.as_numeric()
    # Corners in the order: lower-left, upper-left, upper-right, lower-right.
    other_corners = (
        (other_left, other_bottom),
        (other_left, other_top),
        (other_right, other_top),
        (other_right, other_bottom),
    )

    if ctm is None:
        xs = tuple(x for x, _ in other_corners)
        ys = tuple(y for _, y in other_corners)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in other_corners
        )
        ys = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in other_corners
        )

    # This page's own value is listed first so it is kept on ties.
    self.mediabox.lower_left = (min(own_left, *xs), min(own_bottom, *ys))
    self.mediabox.upper_right = (max(own_right, *xs), max(own_top, *ys))

1365

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that a
    transformation matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` is accepted too,
            in which case its ``ctm`` attribute is used.
        over: put the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content):
        # Puts the matrix in front of the page2 content stream.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1397

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is scaled through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: put the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, scaling, over, expand)

1415

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: put the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    turning = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, turning, over, expand)

1437

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: put the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shifting = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shifting, over, expand)

1461

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, matrix)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # Compute the bounding box of the transformed mediabox corners.
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    top = self.mediabox.top.as_numeric()
    right = self.mediabox.right.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    values = tuple(float(value) for value in matrix)
    xs = [values[0] * x + values[2] * y + values[4] for x, y in corner_points]
    ys = [values[1] * x + values[3] * y + values[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1511

def scale(self, sx: float, sy: float) -> None:
    """
    Scale the page by the given factors: a transformation matrix is applied
    to the content and the page size is updated.

    The page boundaries (bleedbox, trimbox, artbox, cropbox, mediabox),
    annotation rectangles and the viewport ``/BBox`` are scaled as well.
    For an array of viewports only the first entry is updated.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # Keep this order: the boxes are read and written one after another.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # x-coordinates sit at even positions, y-coordinates at odd ones.
    factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                for position, factor in enumerate(factors):
                    rectangle[position] = FloatObject(
                        float(rectangle[position]) * factor
                    )

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            tuple(
                float(bbox[position]) * factor
                for position, factor in enumerate(factors)
            )
        )
        viewport_entry = self[NameObject(PG.VP)]
        if viewport_is_array:
            viewport_entry[NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            viewport_entry[NameObject("/BBox")] = scaled_bbox  # type: ignore

1565

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly: the same factor is handed to :meth:`scale`
    for the horizontal and the vertical axis.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1576

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page to the requested dimensions.

    The factors are the requested size divided by the current mediabox
    size; they are then handed to :meth:`scale`.

    Args:
        width: The new width.
        height: The new height.

    """
    factors = (
        width / float(self.mediabox.width),
        height / float(self.mediabox.height),
    )
    self.scale(*factors)

1590

def compress_content_streams(self, level: int = -1) -> None:
    """
    Shrink this page by joining all content streams and applying a
    FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed on to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the encoded content cannot be stored through the
            content's own reference and the page's document does not
            expose ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    content_obj = content.flate_encode(level)
    try:
        # Overwrite the slot of the content's own reference, if it has one.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = content_obj  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(content_obj)

1613

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the position of this page in the ``pages``
    of the document it belongs to.

    Returns:
        Page number; None if the page has no indirect reference or is not
        found among the pages of its document.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # index() found no matching page.
        return None

1630

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Return a textual dump of the content operations and fonts of the page.

    The first part has one line per content-stream operation, the second
    part (after a separator line) lists every entry of the ``/Font``
    resources with its repr and, when available, its ``/Encoding`` repr and
    decoded ``/ToUnicode`` data. ``"No Font"`` is appended when a lookup
    raises ``KeyError``.
    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # Only TJ operations contribute their string operands.
        strings = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(
            op.decode("utf-8") + " " + "".join(strings) + ope.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        font_dict = self[PG.RESOURCES]["/Font"]  # type:ignore
        for fo in font_dict:
            pieces.append(fo + "\n")
            font = font_dict[fo]
            pieces.append(font.__repr__() + "\n")
            try:
                pieces.append(font["/Encoding"].__repr__() + "\n")
            except Exception:
                pass
            try:
                pieces.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1668

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content-stream operations
    through a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: Object holding the resources and the content to read.
        pdf: Document passed to ``ContentStream`` when the content has to be
            wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources can be found
        or when the content cannot be read.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                # Fonts that cannot be loaded are skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is replayed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is replayed as Tw, Tc, T* and Tj, using operands 0, 1 and the rest.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold is
                    # emitted as one space, unless the text is empty or
                    # already ends with a space.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is replayed as TL (with the negated second operand) followed by Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline; IndexError means
                # the output is still empty.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything that is not an image is passed to extract_xform_text.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Always reset the pending text and remember the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1831

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each ``/Parent`` are visited in turn. A font name
    found closer to the page takes precedence over the same name in an
    ancestor's resources. No fonts are collected when ``self.pdf`` is None.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    objr: Any = self
    fonts: dict[str, Font] = {}
    while objr is not None:
        try:
            resources_dict: Any = objr[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            font_dict = resources_dict["/Font"]
            for font_name in font_dict:
                # The walk goes from the page towards the root, so a name
                # already present belongs to a nearer resource dictionary
                # and must not be overwritten by an ancestor's entry.
                if font_name not in fonts:
                    fonts[font_name] = Font.from_font_resource(font_dict[font_name])
        try:
            objr = objr["/Parent"].get_object()
        except KeyError:
            # No more parents: stop walking.
            objr = None

    return fonts

1857

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Extract the page text while staying close to the layout of the source PDF.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned when no text-show operations are found.

    """
    font_map = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(font_map, indent=2, default=asdict)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(stream.operations), font_map, strip_rotated, debug_path
    )
    if bt_groups:
        ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
        char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
        return _layout_mode.fixed_width_page(
            ty_groups, char_width, space_vertically, font_height_weight
        )
    return ""

1916

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by replaying its text drawing commands.

    Text is emitted in the order in which the drawing commands occur in the
    content stream. The quality of the result depends on the generator of
    the PDF file; do not rely on the order of the returned text, as it may
    change when the extraction gets more sophisticated.

    Arabic and Hebrew are extracted in the correct order. If required, a
    custom RTL range of characters can be defined; see function
    set_custom_rtl.

    Visitor functions can be supplied to get informed about all operations
    and all text objects, which can e.g. be useful to parse tables.

    Args:
        *args: legacy positional parameters. Accepted forms are
            ``(orientations[, space_width])`` and, if the first value is a
            string, ``(str, <any>, orientations[, space_width])``.
            Only evaluated in "plain" mode.
        orientations: orientations extract_text will look for, either a
            single value or a tuple. Default = (0, 90, 180, 270).
            Currently only 0 (up), 90 (turned left), 180 (upside down) and
            270 (turned right) are known.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: a legacy positional parameter has an unsupported type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: tell the caller about
        # every one that was supplied, then go on without them.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor, callback in supplied_visitors:
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # Legacy call starting with a string: the first two positional
            # values are skipped, orientations and space_width follow them.
            orientations_at, space_width_at = 2, 3
        elif isinstance(leading, (tuple, int)):
            orientations_at, space_width_at = 0, 1
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > orientations_at:
            candidate = args[orientations_at]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > space_width_at:
            candidate = args[space_width_at]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    # A single orientation is accepted for convenience; normalize to a tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2052

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The work is delegated to ``self._extract_text`` with ``xform`` as the
    object to read and no content key.

    Args:
        xform: the XObject to extract the text from.
        orientations: orientations to look for.
            Default = (0, 90, 180, 270), i.e. the same default as
            ``extract_text``. The previous default ``(0, 90, 270, 360)``
            left out upside down text (180) and listed 360, which
            ``extract_text`` does not document as a supported orientation.
        space_width: force default space width
            if not extracted from font (default: 200).
        visitor_operand_before: function to be called before processing an operation.
            It takes four arguments.
        visitor_operand_after: function to be called after processing an operation.
            It takes four arguments.
        visitor_text: function to be called when extracting some text.
            It takes five arguments.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no content key: ``xform`` itself is handed over for reading
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2087

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this object, split by embedding state.

    The names are gathered by ``_get_fonts_walk``; every font which was not
    reported as embedded is considered unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2103

# Page boundary accessors, each created by ``_create_rectangle_accessor`` from
# the key of the box and a tuple of further box keys.
# NOTE(review): the tuple presumably lists the boxes to fall back to, in
# order, when the box itself is missing - confirm against
# ``_create_rectangle_accessor``.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2135

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under "/Annots", or None if the page has no such
        entry.

    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array; None removes the "/Annots" entry
            (nothing happens if there is none).

    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2157

2158

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages which is backed by two callbacks instead of a list.

    The length and the individual pages are requested from the callbacks on
    every access, nothing is stored in the sequence itself.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page for a non-negative index.

        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return a single page, or a new virtual list for a slice.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` is outside of the sequence.

        """
        if isinstance(index, slice):
            # The slice is a view as well: it maps its own positions onto
            # the selected positions of this sequence.
            selected = range(*index.indices(len(self)))
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # support negative indexes
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or all pages of a slice, from the page tree.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` is outside of the sequence.
            PdfReadError: the page is not listed in the "/Kids" of its parent.

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if not (0 <= index < size):
            raise IndexError("Index out of range")
        target = self[index].indirect_reference
        assert target is not None
        node: Optional[PdfObject] = cast(DictionaryObject, target.get_object()).get(
            "/Parent", None
        )
        nothing_removed = True
        # Walk up the "/Parent" chain and unlink ``target`` from each node
        # which lists it.
        while node is not None:
            node = cast(DictionaryObject, node.get_object())
            try:
                kids = cast(ArrayObject, node["/Kids"])
                del kids[kids.index(target)]
                nothing_removed = False
                try:
                    assert target is not None
                    del target.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in node:
                    node[NameObject("/Count")] = NumberObject(
                        cast(int, node["/Count"]) - 1
                    )
                if len(kids) == 0:
                    # No more objects in this part of this subtree:
                    # the node itself is looked up in the next round.
                    target = node.indirect_reference
                node = node.get("/Parent", None)
            except ValueError:  # from index
                if nothing_removed:
                    raise PdfReadError(f"Page not found in page tree: {target}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        for position in range(len(self)):
            yield self[position]

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({position})" for position in range(self.length_function())
        )
        return f"[{labels}]"

2251

2252

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The fonts are looked up in the "/DR" and "/Resources" font dictionaries
    of ``obj``; the XObjects of "/Resources", the entries of "/Annots" and
    the normal appearance ("/AP" -> "/N") are walked recursively.

    Args:
        obj: dictionary to inspect, e.g. a page, an annotation or an XObject
        fnt: names of the fonts found so far; updated in place
        emb: names of the embedded fonts found so far; updated in place

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded if it has a key 'CharProcs', or if its font
    descriptor (or the font descriptor of its first descendant font) has a key
    'FontFilex' (where x is null, 2, or 3). Embedded fonts without 'BaseFont'
    are recorded as their 'Subtype' in parentheses.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font_like: DictionaryObject) -> bool:
        """Tell whether the font descriptor of ``font_like`` has a font file key."""
        if "/FontDescriptor" not in font_like:
            return False
        descriptor = cast(DictionaryObject, font_like["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record a single font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = has_font_file(descendant)
        if is_embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate over the font dictionaries (values): iterating over the
        # dictionary itself only yields the font names, from which nothing
        # can be derived.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        appearance = cast(DictionaryObject, obj["/AP"])
        if "/N" in appearance:
            normal = cast(DictionaryObject, appearance["/N"])
            if normal.get("/Type") == "/XObject":
                _get_fonts_walk(normal, fnt, emb)
            else:
                # "/N" is treated as a dictionary of appearance states:
                # walk the appearances (values), not the state names
                # (keys). Entries which are no dictionaries are skipped.
                for state in normal.values():
                    state = state.get_object()
                    if isinstance(state, DictionaryObject):
                        _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

922 statements