Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from copy import deepcopy

33from dataclasses import asdict, dataclass

34from decimal import Decimal

35from io import BytesIO

36from pathlib import Path

37from typing import (

38 Any,

39 Callable,

40 Literal,

41 Optional,

42 Union,

43 cast,

44 overload,

45)

47from ._font import Font

48from ._protocols import PdfCommonDocProtocol

49from ._text_extraction import (

50 _layout_mode,

51)

52from ._text_extraction._text_extractor import TextExtraction

53from ._utils import (

54 CompressedTransformationMatrix,

55 TransformationMatrixType,

56 _human_readable_bytes,

57 deprecate,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .generic import (

68 ArrayObject,

69 ContentStream,

70 DictionaryObject,

71 EncodedStreamObject,

72 FloatObject,

73 IndirectObject,

74 NameObject,

75 NullObject,

76 NumberObject,

77 PdfObject,

78 RectangleObject,

79 StreamObject,

80 is_null_or_none,

81)

# Pillow is an optional dependency: remember whether it could be imported so
# that image-related code can raise a helpful error only when it is used.
try:
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Placeholder so that annotations and isinstance checks referring to
    # ``Image`` still resolve when Pillow is missing.
    Image = object  # type: ignore[assignment,misc,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    pil_not_imported = True  # error will be raised only when using images

# Name of a page box; its consumers are outside this part of the file.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the box stored under ``name`` as a ``RectangleObject``.

    If the entry is missing or null, the entries named in ``defaults`` are
    tried in order and the first usable one is taken. The resulting rectangle
    is written back under ``name`` before being returned.

    Args:
        self: The dictionary-like object holding the boxes. It must expose
            ``get`` and, for indirect values, ``pdf.get_object``.
        name: Key of the requested box.
        defaults: Keys to fall back on when ``name`` is missing or null.

    Returns:
        The rectangle found for ``name`` or for one of the fallbacks.

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # Apply the same "missing or null" test as for the primary entry:
            # an explicit null fallback must not end the search.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Tolerate over-long arrays: warn and keep the first four values.
        logger_warning(f"Expected four values, got {length}: {retval}", __name__)
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore
    # Store the converted value so that the next access returns it directly.
    _set_rectangle(self, name, retval)
    return retval

112

113

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

116

117

118def _delete_rectangle(self: Any, name: str) -> None:

119 del self[name]

120

121

122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

123 return property(

124 lambda self: _get_rectangle(self, name, fallback),

125 lambda self, value: _set_rectangle(self, name, value),

126 lambda self: _delete_rectangle(self, name),

127 )

128

129

class Transformation:
    """
    A 2D transformation of page content.

    Moving from one coordinate system to another is described by a 3-by-3
    matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, which is why PDF usually writes it as
    the six-element array [ a b c d e f ].

    Points are transformed by matrix multiplication::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The full 3-by-3 matrix, as a tuple of rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The elements (a, b, c, d, e, f) of the matrix.

        """
        first_row = matrix[0]
        second_row = matrix[1]
        third_row = matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Text of the ``cm`` operation for this matrix, four decimals per value.
        formatted = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return formatted + " cm"

    def _post_multiply(self, op: TransformationMatrixType) -> "Transformation":
        # Multiply the current matrix by ``op`` and wrap the compressed product.
        product = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(product))

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        return self._post_multiply(m.matrix)

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        base = self.ctm
        shifted = (base[0], base[1], base[2], base[3], base[4] + tx, base[5] + ty)
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin is typically the lower-left corner of the page; translate
        the contents / the page boxes to scale around another point.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        """
        # A single given factor applies to both axes.
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return self._post_multiply(scaling)

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return self._post_multiply(turning)

    def __repr__(self) -> str:
        values = self.ctm
        return f"Transformation(ctm={values})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'), as a list if ``pt`` is a list and
            as a tuple otherwise.

        """
        values = self.ctm
        x = float(pt[0])
        y = float(pt[1])
        wrap = FloatObject if as_object else float
        new_x = wrap(x * values[0] + y * values[2] + values[4])
        new_y = wrap(x * values[1] + y * values[3] + values[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

324

325

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image object
        of that page replaces the stored one, and ``name``, ``data`` and
        ``image`` of this instance are refreshed from the new object.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # ``_id_translated`` is used as the marker of a writer document.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow produce a PDF in memory and take the image object from it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension. Slicing at ``rfind(".")`` would cut the last
        # character of a name without any dot (rfind returns -1), so split
        # explicitly and keep such a name whole.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data added before the ")".
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

416

417

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list is virtual: ``ids_function`` is called to obtain the image
    identifiers and ``get_function`` builds the ``ImageFile`` for one of them.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers are fetched once and resolved directly; going through
        # ``self[x]`` would list the identifiers again for every image.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image, or a new virtual list for a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer index is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Identifier lookup: the list of identifiers is not needed.
            return self.get_function(index)
        lst = self.ids_function()
        if isinstance(index, slice):
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Take the identifiers once instead of once per yielded image.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

479

480

481class PageObject(DictionaryObject):

482 """

483 PageObject represents a single page within a PDF file.

484

485 Typically these objects will be created by accessing the

486 :attr:`pages<pypdf.PdfReader.pages>` property of the

487 :class:`PdfReader<pypdf.PdfReader>` class, but it is

488 also possible to create an empty page with the

489 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

490

491 Args:

492 pdf: PDF file the page belongs to.

493 indirect_reference: Stores the original indirect reference to

494 this object in its source PDF

495

496 """

497

498 original_page: "PageObject" # very local use in writer when appending

499

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page and, if a reference is given, copy the referenced entries.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Original indirect reference of the page; when it
            is neither ``None`` nor null, the entries of the referenced
            dictionary are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled on first use by the image helpers.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source = not is_null_or_none(indirect_reference)
    if has_source:
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

513

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as the type marker so
    that a page gives the same result as a plain DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    entries = []
    for key, value in self.items():
        entries.append((key, value.hash_bin()))
    return hash((DictionaryObject, tuple(entries)))

528

def hash_value_data(self) -> bytes:
    """Return the parent class' hash data followed by ``id(self)`` as text."""
    identity = str(id(self)).encode()
    return super().hash_value_data() + identity

533

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The value counts multiples of 1/72 inch: 1 means a user space unit is
    1/72 inch, 3 means a user space unit is 3/72 inch. When the page has no
    such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

544

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    When ``width`` or ``height`` is ``None``, both are taken from the media
    box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    size_missing = width is None or height is None
    if size_missing:
        if pdf is None or not len(pdf.pages) > 0:
            raise PageSizeNotDefinedError
        last_page = pdf.pages[len(pdf.pages) - 1]
        width = last_page.mediabox.width
        height = last_page.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

590

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Images found directly in the XObject resources are identified by their
    name. Images inside form XObjects are identified by the list of names
    leading to them. The identifiers of the inline images of the page are
    appended once, by the outermost call.

    Args:
        obj: Object whose resources are scanned; the page itself if ``None``.
        ancest: Names of the form XObjects traversed to reach ``obj``.
        call_stack: Indirect references already visited, so that forms
            referencing each other do not recurse forever.

    Returns:
        The identifiers, XObject images first, then inline images.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: stop here to break the cycle.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    # Inline images come from the page content, not from the form being
    # scanned. Recursive calls always pass a non-empty ``ancest``, so adding
    # the inline identifiers there would repeat them once per form XObject.
    inline_ids: list[Union[str, list[str]]] = (
        [] if ancest else list(self.inline_images.keys())
    )
    lst: list[Union[str, list[str]]] = []
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

628

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` for one image identifier.

    Args:
        id: Name of an image XObject, identifier of an inline image
            (starting and ending with ``~``), or a sequence of names leading
            to an image nested in form XObjects.
        obj: Object whose resources are searched; the page itself if ``None``.

    Returns:
        The image designated by ``id``.

    Raises:
        KeyError: If the XObject resources or the requested entry are missing.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    # Normalise the identifier: tuples become lists and a one-element list
    # becomes the plain name.
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
        )
    except KeyError:
        # Missing XObject resources are acceptable only for inline image
        # identifiers, which are not looked up there.
        if not (id[0] == "~" and id[-1] == "~"):
            raise
    if isinstance(id, str):
        if id[0] == "~" and id[-1] == "~":
            # Inline image: load the inline images on first use.
            if self.inline_images is None:
                self.inline_images = self._get_inline_images()
            if self.inline_images is None:  # pragma: no cover
                raise KeyError("No inline image can be found")
            return self.inline_images[id]

        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
        extension, byte_stream = imgd[:2]
        return ImageFile(
            # The first character of the identifier is dropped from the name.
            name=f"{id[1:]}{extension}",
            data=byte_stream,
            image=imgd[2],
            indirect_reference=xobjs[id].indirect_reference,
        )
    # in a subobject
    # Descend into the XObject named by the first element and resolve the
    # remaining elements from there.
    ids = id[1:]
    return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

667

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    It gives access to all images of the page. An image can be requested with:
        - A string (for the top object)
        - A tuple (for images within XObject forms)
        - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    Each returned ImageFile offers the following properties:

        * `.name` : name of the object
        * `.data` : bytes of the object
        * `.image` : PIL Image Object
        * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

707

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Values listed in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other name is looked up in the ``/ColorSpace`` resources
    of the page. All remaining values are returned unchanged.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The translated value.

    Raises:
        PdfReadError: If ``v`` is a custom name without a matching
            ``/ColorSpace`` resource entry.

    """
    try:
        v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if isinstance(v, NameObject):
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                v = cast(DictionaryObject, res)[v]
            except KeyError as exc:  # for res and v
                # Chain explicitly so the failed lookup is reported as the cause.
                raise PdfReadError(f"Cannot find resource entry {v} for {k}") from exc
    return v

722

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed by ``~0~``, ``~1~``, ... in order of appearance;
        empty if the page has no content or no inline image.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect settings and raw data of every inline image.
    raw_images = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            raw_images.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )
    if not raw_images:
        return {}

    from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

    # Second pass: turn each of them into a stream object, then into an image.
    length_keys = {"/Length", "/L"}
    images = {}
    for index, (settings, stream_data) in enumerate(raw_images):
        stream_dict = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, raw_value in settings.items():
            if short_key in length_keys:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [
                        self._translate_value_inline_image(short_key, item)
                        for item in raw_value
                    ]
                )
            else:
                value = self._translate_value_inline_image(short_key, raw_value)
            # The first value seen for a key is kept.
            stream_dict.setdefault(
                NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key]), value
            )
        stream = EncodedStreamObject.initialize_from_dictionary(stream_dict)
        extension, byte_stream, img = _xobj_to_image(stream)
        images[f"~{index}~"] = ImageFile(
            name=f"~{index}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images

768

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    Valid values are the multiples of 90 degrees 0, 90, 180 and 270.
    Changing this property does not touch ``/Contents``.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest multiple of 90 and bring the result into [0, 360).
    nearest_quarter_turn = ((int(r) + 45) // 90) * 90
    self[NameObject(PG.ROTATE)] = NumberObject(nearest_quarter_turn % 360)

783

def transfer_rotation_to_content(self) -> None:
    """
    Move the page rotation into the content and the media/crop/... boxes.

    The rotation of the page is reset to 0, the content is transformed
    accordingly and every box present on the page is recomputed.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box ...
    centre_x = float(media_box.left + media_box.width / 2)
    centre_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ... then shift so that the rotated media box starts at the origin.
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        xs = (corner_a[0], corner_b[0])
        ys = (corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject(
            (min(xs), min(ys), max(xs), max(ys))
        )

818

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject (the page itself).

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

834

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, work on a new dictionary filled from
            ``res1[resource]``; otherwise modify ``res1[resource]`` itself.

    Returns:
        A tuple ``(merged entries, renames)`` where ``renames`` maps each key
        of ``res2`` that had to be stored under another name to that name.

    """
    # Find the document the page belongs to; cloning is only done when it
    # looks like a writer.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        # ``page2res`` and ``new_res`` are bound further down in the enclosing
        # function, before this helper is first called.
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                # Keep the reference to the clone when it has one.
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
        # Rebuild the dictionary with its keys in sorted order.
        # NOTE(review): the nesting of this re-sort inside the loop was
        # reconstructed from a listing without indentation - confirm.
        lst = sorted(new_res.items())
        new_res.clear()
        for el in lst:
            new_res[el[0]] = el[1]
    return new_res, rename_res

915

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Apply the name replacements of ``rename`` to the operands of ``stream``.

    Args:
        stream: Content stream to process.
        rename: Mapping from old names to new names.
        pdf: Document passed on to the new ``ContentStream``.

    Returns:
        ``stream`` itself if ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return stream

937

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Put a ``cm`` operation with ``ctm`` in front of the given contents.

    Returns:
        A new ``ContentStream`` built from ``contents``, starting with the
        added operation.

    """
    contents = ContentStream(contents, pdf)
    operations = contents.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return contents

954

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The data of the ``/Contents`` object (all parts joined when it is an
        array), or ``None`` if the entry doesn't exist or is null.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    # A null entry carries no data; treat it like a missing one, as
    # ``get_contents`` does, instead of failing on ``get_data``.
    if is_null_or_none(obj):
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

969

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object as a ``ContentStream``, or ``None`` if the
        entry does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # The owning document is passed to the content stream when it is known.
    try:
        owner = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        owner = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), owner)

990

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with new content and nullify the old objects.

    Args:
        content: New content; if ``None``, the content field is deleted
            (for a page without an indirect reference the value is stored
            as given).

    """
    if getattr(self, "indirect_reference", None) is None:
        # The page is not attached: the content is stored directly on it.
        self[NameObject(PG.CONTENTS)] = content
        return

    from pypdf._writer import PdfWriter  # noqa: PLC0415

    writer = self.indirect_reference.pdf
    if not isinstance(writer, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # Replace every stream of an existing content array with a null object.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for old_stream in cast(ArrayObject, self[PG.CONTENTS]):
            try:
                writer._replace_object(
                    indirect_reference=old_stream.indirect_reference,
                    obj=NullObject(),
                )
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # A new array is registered member by member.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(member) for member in content)

    if is_null_or_none(content):
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(
            indirect_reference=self[PG.CONTENTS].indirect_reference,
            obj=NullObject(),
        )
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # Applies at least for a page not in a writer. As a fallback the
            # content is stored as a direct object, although this is not in
            # accordance with the PDF reference.
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # Reuse the reference of the current contents for the new content.
        # TODO: in the future may require generation management
        content.indirect_reference = self[PG.CONTENTS].indirect_reference
        try:
            writer._replace_object(
                indirect_reference=content.indirect_reference, obj=content
            )
        except AttributeError:
            # Same fallback as above for a page not in a writer.
            self[NameObject(PG.CONTENTS)] = content

    # Forces recalculation of inline_images.
    self.inline_images = None

1053

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc. of this page are not altered unless
    ``expand`` is set. By default the content stream of ``page2`` is added
    to the end of this page's content stream, meaning that it is drawn
    after, or "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: Set the page2 content over page1 if True (default), else under.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # Plain merge: no transformation callback and no matrix are supplied.
    self._merge_page(page2, expand=expand, over=over)

1075

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When the page's owning document exposes ``_add_object`` (used here to
    detect a PdfWriter), the work is delegated to ``_merge_page_writer``.
    Otherwise resources, annotations and content streams are merged here.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is renamed and appended.
        ctm: Optional matrix, forwarded to ``_expand_mediabox`` and to the
            writer variant.
        over: Append the content of ``page2`` if True, else put it first.
        expand: If True, grow the mediabox through ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Collect the annotation arrays of both pages into a single array.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re", "W", "n").
        # The operands are stored as a real list rather than a one-shot
        # ``map`` iterator, so the operation can be iterated more than once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1182

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources and ``/ProcSet`` of ``page2`` are merged into this page's
    resource dictionary, annotations of ``page2`` are cloned into the owning
    document with their geometry transformed by ``ctm``, and the content
    streams are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is renamed and appended.
        ctm: Optional matrix applied to annotation rectangles and quad
            points, and forwarded to ``_expand_mediabox``.
        over: Append the content of ``page2`` if True, else put it first.
        expand: If True, grow the mediabox through ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and rebuild a normalised
            # rectangle from the minima and maxima.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # /QuadPoints holds 8 x n numbers (n quadrilaterals), so
                # transform every coordinate pair, not only the first quad.
                q = cast(ArrayObject, a["/QuadPoints"])
                transformed_points = []
                for index in range(0, len(q) - 1, 2):
                    transformed_points.extend(
                        trsf.apply_on((q[index], q[index + 1]), True)
                    )
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re", "W", "n").
        # The operands are stored as a real list rather than a one-shot
        # ``map`` iterator, so the operation can be iterated more than once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1320

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: The page whose mediabox must fit into this page's mediabox.
        ctm: Optional matrix applied to the four corners of the mediabox of
            ``page2`` before the bounds are compared.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    other_left = other_box.left.as_numeric()
    other_bottom = other_box.bottom.as_numeric()
    other_right = other_box.right.as_numeric()
    other_top = other_box.top.as_numeric()
    # The four corners of page2's mediabox as (x, y) pairs.
    corner_points = (
        (other_left, other_bottom),
        (other_left, other_top),
        (other_right, other_top),
        (other_right, other_bottom),
    )

    if ctm is not None:
        matrix = tuple(float(x) for x in ctm)
        xs = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points
        )
        ys = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points
        )
    else:
        xs = tuple(x for x, _ in corner_points)
        ys = tuple(y for _, y in corner_points)

    # Union of this page's box with the bounding box of the corners.
    self.mediabox.lower_left = (
        min(own_left, min(xs)),
        min(own_bottom, min(ys)),
    )
    self.mediabox.upper_right = (
        max(own_right, max(xs)),
        max(own_top, max(ys)),
    )

1363

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: A 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: Set the page2 content over page1 if True (default), else under.
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content: Any) -> ContentStream:
        # Put the matrix in front of the content of page2.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1395

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: Set the page2 content over page1 if True (default), else under.
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1413

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees.
        over: Set the page2 content over page1 if True (default), else under.
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1435

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        tx: The translation on the X axis.
        ty: The translation on the Y axis.
        over: Set the page2 content over page1 if True (default), else under.
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1459

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, matrix)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # Transform the four mediabox corners and take their bounding box.
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = [(left, bottom), (left, top), (right, top), (right, bottom)]

    values = tuple(float(x) for x in matrix)
    new_x = [values[0] * x + values[2] * y + values[4] for x, y in corner_points]
    new_y = [values[1] * x + values[3] * y + values[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(new_x), min(new_y))
    self.mediabox.upper_right = (max(new_x), max(new_y))

1509

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the page boundaries (bleedbox, trimbox, artbox, cropbox
    and mediabox), the ``/Rect`` of array-based annotations, the ``/BBox``
    of the viewport entry, and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # The boxes are rescaled one after another, in this fixed order.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if isinstance(rectangle, ArrayObject):
                    # x coordinates use sx, y coordinates use sy.
                    for index, factor in enumerate((sx, sy, sx, sy)):
                        rectangle[index] = FloatObject(
                            float(rectangle[index]) * factor
                        )

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        # For an array, only the first entry is read and updated.
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if viewport_is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1563

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)

1574

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested size divided by the current mediabox size.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1588

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: The value passed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream cannot be stored because the
            page does not belong to a document offering ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the slot of the existing content object in its document.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1611

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        The index of this page in the ``pages`` of its document; None if
        the page has no indirect reference or is not found in that list.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1628

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page's operations and fonts for debugging.

    Returns:
        One line per content stream operation, a separator, then for every
        font its name, its representation and, when available, its
        ``/Encoding`` and ``/ToUnicode`` data. ``No Font`` is appended when
        the font lookup raises a ``KeyError``.

    """
    lines: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        # Only TJ operations contribute their string operands.
        strings = (
            [x for x in operands[0] if isinstance(x, str)]
            if operator == b"TJ"
            else []
        )
        lines.append(
            operator.decode("utf-8") + " " + "".join(strings) + repr(operands) + "\n"
        )
    lines.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            font = self[PG.RESOURCES]["/Font"][font_name]  # type:ignore
            lines.append(font_name + "\n")
            lines.append(repr(font) + "\n")
            try:
                lines.append(repr(font["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                lines.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        lines.append("No Font\n")
    return "".join(lines)

1666

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    See extract_text for most arguments.

    Args:
        obj: The object whose resources and content are read.
        pdf: The document passed to the ``ContentStream`` constructor.
        content_key: The key under which the content is looked up in ``obj``.
            None = ``obj`` itself is the content; this allows reusing the
            function on an XObject.
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string if no resources or no content
        can be found.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    def notify_text(text: Any) -> None:
        # Report a piece of text to the visitor together with the memorised
        # matrices and the current font information of the extractor.
        if visitor_text is not None:
            visitor_text(
                text,
                extractor.memo_cm,
                extractor.memo_tm,
                extractor.font_resource,
                extractor.font_size,
            )

    try:
        holder = obj
        while NameObject(PG.RESOURCES) not in holder:
            # /Resources can be inherited so we look to parents
            holder = holder["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, holder[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_name in font_resources_dict:
            try:
                font_object = cast(DictionaryObject, font_resources_dict[font_name].get_object())
                font_resources[font_name] = font_object
                font = Font.from_font_resource(font_object)
                fonts[font_name] = font
                # Override space width, if applicable
                if font.character_widths.get(" ", 0) == 0:
                    font.space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        if isinstance(content_key, str):
            content = obj[content_key].get_object()
        else:
            content = obj
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Operators that expand to several basic operations are handled here
        if operator == b"'":
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            space_threshold = extractor._space_width * 0.95
            if operands:
                for item in operands[0]:
                    if isinstance(item, (str, bytes)):
                        extractor.process_operation(b"Tj", [item])
                    # A large enough numeric adjustment is rendered as one
                    # space, unless the text already ends with a space.
                    if isinstance(item, (int, float, NumberObject, FloatObject)) and (
                        abs(float(item)) >= space_threshold
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before descending into the XObject.
            extractor.output += extractor.text
            notify_text(extractor.text)
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    notify_text("\n")
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    notify_text(text)
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "":
        notify_text(extractor.text)
    return extractor.output

1829

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each ``/Parent`` are visited in turn; a font found on
    a later object replaces one with the same name found earlier.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources_dict: Any = node[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            font_dict = resources_dict["/Font"]
            # Entries are read through subscription, as in the original code.
            for font_name in font_dict:
                fonts[font_name] = Font.from_font_resource(font_dict[font_name])
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # No parent left: stop walking up.
            node = None

    return fonts

1855

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(fonts, indent=2, default=asdict)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(stream.operations), fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        # No text show operations were collected.
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1914

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page.

    The text drawing commands are located in the order in which they are
    provided in the content stream. This works well for some PDF files and
    poorly for others, depending on the generator used, and will be refined
    in the future. Do not rely on the order of the text returned here: it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order. If required, a
    custom RTL range of characters can be defined; see function
    set_custom_rtl.

    Visitor functions can be supplied to get informed about all operations
    and all text objects, which can for example help to parse tables in
    some PDF files.

    Positional arguments are still accepted in two shapes:
    ``(orientations[, space_width])``, or two leading arguments of which the
    first is a string, followed by ``[orientations[, space_width]]``. In the
    second shape the two leading arguments are not used.

    Args:
        orientations: orientations extract_text will look for, either a
            single int or a tuple of ints.
            default = (0, 90, 180, 270)
            note: currently only 0 (up), 90 (turned left),
            180 (upside down), 270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an
            operation. It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an
            operation. It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at
            some position. It has five arguments: text, current
            transformation matrix, text matrix, font-dictionary and
            font-size. The font-dictionary may be None in case of unknown
            fonts. If not None it may e.g. contain key "/BaseFont" with
            value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy
            functionality, "layout" for experimental layout mode
            functionality.
            NOTE: orientations, space_width, and visitor_* parameters are
            NOT respected in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred
            from y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when
            calculating weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support
            rotated text. Set to False to include rotated text anyway. If
            rotated text is discovered, layout will be degraded and a
            warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a
            directory. Creates the following files with debug information
            for layout mode functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding
                transform matrices
              - bts.json: text render ops left justified and grouped by
                BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered
                y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height
            when calculating blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if extraction_mode is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: tell the caller about
        # every one that was supplied, then delegate to the layout code.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor_name, visitor_callback in supplied_visitors:
            if visitor_callback:
                logger_warning(
                    f"Argument {visitor_name} is ignored in layout mode",
                    __name__,
                )
        layout_options = {
            "space_vertically": kwargs.get("layout_mode_space_vertically", True),
            "scale_weight": kwargs.get("layout_mode_scale_weight", 1.25),
            "strip_rotated": kwargs.get("layout_mode_strip_rotated", True),
            "debug_path": kwargs.get("layout_mode_debug_path"),
            "font_height_weight": kwargs.get("layout_mode_font_height_weight", 1),
        }
        return self._layout_mode_text(**layout_options)

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # Two unused leading arguments precede orientations/space_width.
            offset = 2
        elif isinstance(leading, (tuple, int)):
            offset = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > offset:
            candidate = args[offset]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > offset + 1:
            candidate = args[offset + 1]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    if isinstance(orientations, int):
        # A single orientation is normalised to a one-element tuple.
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2050

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text``, together
    with ``self.pdf``.

    Args:
        xform: the XObject to extract the text from
        orientations: orientations to look for.
            NOTE(review): the default contains 360 and omits 180, whereas
            extract_text documents 0, 90, 180 and 270 as the supported
            values - confirm whether this default is intended.
        space_width: force default space width
            (if not extracted from font; default 200)
        visitor_operand_before: callback taking four arguments, forwarded
            as is
        visitor_operand_after: callback taking four arguments, forwarded
            as is
        visitor_text: callback taking five arguments, forwarded as is

    Returns:
        The extracted text

    """
    # The fifth argument is None here, where extract_text passes PG.CONTENTS.
    extracted = self._extract_text(
        xform, self.pdf, orientations, space_width, None,
        visitor_operand_before, visitor_operand_after, visitor_text,
    )
    return extracted

2085

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names reachable from this object, split by embedding.

    The object is resolved with ``get_object()`` and handed to
    ``_get_fonts_walk`` with two fresh, empty sets.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the
        second set is every collected font name that is not embedded.

    """
    root = self.get_object()
    assert isinstance(root, DictionaryObject)
    used, embedded = _get_fonts_walk(root, set(), set())
    return embedded, used - embedded

2101

# Page boundary boxes, each built by _create_rectangle_accessor from the key
# of the box and a tuple of further keys.
# NOTE(review): the tuple presumably lists the keys to fall back on, in
# order, when the box itself is absent (the cropbox description below states
# that it defaults to the mediabox) - confirm against
# _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# The three boxes below are all created with the same second argument:
# ("/CropBox", PG.MEDIABOX).
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2133

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under "/Annots", or None when the page has no
        such key.
    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Passing None removes the "/Annots" key if it is present and does
    nothing otherwise.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2155

2156

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of stored items.

    ``length_function`` reports the number of pages and ``get_function``
    returns the page at a given non-negative position; both are called on
    every access, nothing is cached here.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or for a slice a new virtual list of the same type
        that maps its positions onto the selected positions of this one.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if the (possibly negative) index is out of range.
        """
        if isinstance(index, slice):
            selected = range(*index.indices(len(self)))
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # negative indexes count from the end
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page
        tree.

        The reference of the page is removed from the "/Kids" of its parent
        and the "/Count" of that parent, if present, is decremented. When a
        node is left without kids, the same is done for that node in its
        own parent.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if the (possibly negative) index is out of range.
            PdfReadError: if the page is not listed in the "/Kids" of its
                parent.
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                kids = cast(ArrayObject, parent["/Kids"])
                del kids[kids.index(ind)]
                first = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(kids) == 0:
                    # No more objects in this part of this subtree:
                    # continue with the emptied node in its own parent
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                # Only a miss on the very first lookup is an error; a later
                # one just ends the walk.
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return f"[{labels}]"

2249

2250

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Collect the fonts referenced from a dictionary and the embedded ones.

    The fonts listed under "/DR" and under "/Resources" are examined. The
    walk then descends into every XObject of "/Resources", every entry of
    "/Annots" and the normal appearance ("/AP" -> "/N"), which is either a
    single XObject or a sub-dictionary whose values are walked one by one.

    A font dictionary with a "/BaseFont" key is a font that is used. It is
    recorded as embedded when it has "/CharProcs", or when its
    "/FontDescriptor" - or the one of its first descendant font - holds
    "/FontFile", "/FontFile2" or "/FontFile3". An embedded font without
    "/BaseFont" is recorded as its "/Subtype" in parentheses.

    Args:
        obj: dictionary to inspect (a page, an XObject or an annotation)
        fnt: set receiving the names of all fonts used; updated in place
        emb: set receiving the names of the embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb) of the two sets that were passed in

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        # True when the font descriptor of ``font`` holds one of the fontkeys.
        return "/FontDescriptor" in font and any(
            key in cast(DictionaryObject, font["/FontDescriptor"])
            for key in fontkeys
        )

    def process_font(f: DictionaryObject) -> None:
        # Record one font dictionary in fnt and, if embedded, in emb.
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = (
            "/CharProcs" in f
            or has_font_file(f)
            or (
                "/DescendantFonts" in f
                and has_font_file(
                    cast(
                        DictionaryObject,
                        cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
                    )
                )
            )
        )
        if is_embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        dr_fonts = cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"])
        # Iterate the font dictionaries (values): iterating the dictionary
        # itself only yields the resource names.
        for f in dr_fonts.values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" is treated as a sub-dictionary: walk its values, not its
            # keys, and skip any value that does not resolve to a dictionary.
            for state in normal.values():
                appearance = state.get_object()
                if isinstance(appearance, DictionaryObject):
                    _get_fonts_walk(appearance, fnt, emb)
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

922 statements