Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from copy import deepcopy

33from dataclasses import asdict, dataclass

34from decimal import Decimal

35from io import BytesIO

36from pathlib import Path

37from typing import (

38 Any,

39 Callable,

40 Literal,

41 Optional,

42 Union,

43 cast,

44 overload,

45)

47from ._font import Font

48from ._protocols import PdfCommonDocProtocol

49from ._text_extraction import (

50 _layout_mode,

51)

52from ._text_extraction._text_extractor import TextExtraction

53from ._utils import (

54 CompressedTransformationMatrix,

55 TransformationMatrixType,

56 _human_readable_bytes,

57 deprecate,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .generic import (

68 ArrayObject,

69 ContentStream,

70 DictionaryObject,

71 EncodedStreamObject,

72 FloatObject,

73 IndirectObject,

74 NameObject,

75 NullObject,

76 NumberObject,

77 PdfObject,

78 RectangleObject,

79 StreamObject,

80 is_null_or_none,

81)

83try:

84 from PIL.Image import Image

86 pil_not_imported = False

87except ImportError:

88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10

89 pil_not_imported = True # error will be raised only when using images

91MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, using fallbacks if needed.

    When the entry is missing or null, the keys in ``defaults`` are tried in
    order until one of them yields a value. The result is converted to a
    ``RectangleObject`` and stored back under ``name`` before being returned.

    Args:
        self: The dictionary-like object holding the rectangles.
        name: Key of the requested rectangle.
        defaults: Keys to try, in order, when ``name`` has no usable value.

    Returns:
        The rectangle as a ``RectangleObject``.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Already in its final form: nothing to convert or to store.
        return box
    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    # Keep the converted rectangle under the requested name.
    self[NameObject(name)] = rectangle
    return rectangle

108

109

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

112

113

114def _delete_rectangle(self: Any, name: str) -> None:

115 del self[name]

116

117

118def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

119 return property(

120 lambda self: _get_rectangle(self, name, fallback),

121 lambda self, value: _set_rectangle(self, name, value),

122 lambda self: _delete_rectangle(self, name),

123 )

124

125

class Transformation:
    """
    Represent a 2D transformation.

    A transformation between two coordinate systems is described by a 3-by-3
    matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, so PDF usually writes it as the
    six-element array [ a b c d e f ]; this is what ``ctm`` holds.

    Points are transformed by matrix multiplication::

                                  a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                  e f 1

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to a tuple of three rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        first_row, second_row, third_row = matrix[0], matrix[1], matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Render the six elements with four decimals, followed by the cm operator.
        numbers = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return numbers + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The transformation whose matrix is multiplied on the right.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d = self.ctm[0], self.ctm[1], self.ctm[2], self.ctm[3]
        shifted_e = self.ctm[4] + tx
        shifted_f = self.ctm[5] + ty
        return Transformation(ctm=(a, b, c, d, shifted_e, shifted_f))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin is typically the lower-left corner of the page; translate
        the contents or the page boxes to scale around another point.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            # Only sy was given: scale uniformly.
            sx = sy
        elif sy is None:
            # Only sx was given: scale uniformly.
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        radians = math.radians(rotation)
        cosine = math.cos(radians)
        sine = math.sin(radians)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'), as a list if ``pt`` is a list and
            as a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        new_x = convert(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4])
        new_y = convert(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

320

321

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image object
        of that page replaces the stored object, and ``name``, ``data`` and
        ``image`` of this instance are refreshed from the replaced object.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Overwrite the stored object in place so existing references stay valid.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension. A name without any "." is kept whole: slicing
        # with ``rfind`` would return -1 and silently drop the last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

414

415

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of identifiers is obtained by calling ``ids_function`` and each
    image is built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # The identifiers are computed once; each one is resolved directly.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image, or a new virtual list for a slice.

        Args:
            index: An identifier (string, list or tuple), a position, or a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # An identifier is resolved directly; the id list is not needed.
            return self.get_function(index)
        lst = self.ids_function()
        if isinstance(index, slice):
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Compute the identifiers once instead of once per yielded image.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

477

478

479class PageObject(DictionaryObject):

480 """

481 PageObject represents a single page within a PDF file.

482

483 Typically these objects will be created by accessing the

484 :attr:`pages<pypdf.PdfReader.pages>` property of the

485 :class:`PdfReader<pypdf.PdfReader>` class, but it is

486 also possible to create an empty page with the

487 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

488

489 Args:

490 pdf: PDF file the page belongs to.

491 indirect_reference: Stores the original indirect reference to

492 this object in its source PDF

493

494 """

495

496 original_page: "PageObject" # very local use in writer when appending

497

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the referenced dictionary when one is given.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Original indirect reference to this object in
            its source PDF; its entries are copied into the new page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.indirect_reference = indirect_reference
    # Filled lazily the first time inline images are requested.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source)

511

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: ``DictionaryObject`` (not the page class) is used as the type
    marker, so the result matches the one of a DictionaryObject holding the
    same entries.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))

526

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the encoded ``id()`` of this object."""
    identity = str(id(self)).encode()
    return super().hash_value_data() + identity

531

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is a multiple of 1/72 inch: a value of 1 means one user space
    unit is 1/72 inch, a value of 3 means it is 3/72 inch. When the page
    has no such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

542

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        # Borrow the size of the last page of the document.
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

588

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects are reported by name at the top level and as a path
    ``[*ancest, name]`` when nested inside form XObjects. Inline image
    identifiers are appended once, by the top-level call only.

    Args:
        obj: Dictionary whose resources are scanned; the page itself if ``None``.
        ancest: Names of the form XObjects enclosing ``obj``.
        call_stack: Indirect references already visited, used to stop on
            self-referencing forms.

    Returns:
        The identifiers, XObject images first, then inline images.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: avoid endless recursion on cyclic forms.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # ``inline_images`` is built from the page's own content stream, so its
    # keys belong to the top-level listing only. Appending them in every
    # nested call as well made them appear several times.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        candidate = x_object[o]
        if not isinstance(candidate, StreamObject):
            continue
        if candidate[IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(candidate, [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

626

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` identified by ``id``.

    Args:
        id: An XObject name, an inline image name of the form ``~...~``, or
            a list/tuple of names leading through nested XObjects.
        obj: Dictionary whose resources are searched; the page itself if ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If the resources hold no XObject entry and ``id`` is not
            an inline image name, or if the name cannot be found.

    """

    def looks_inline(key: Any) -> bool:
        # Inline image names start and end with "~".
        return key[0] == "~" and key[-1] == "~"

    container = cast(DictionaryObject, self) if obj is None else obj
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject,
            cast(DictionaryObject, container[PG.RESOURCES])[RES.XOBJECT],
        )
    except KeyError:
        # Missing XObject resources are acceptable only for inline images.
        if not looks_inline(id):
            raise

    if not isinstance(id, str):
        # in a subobject: descend one level and resolve the rest of the path
        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

    if looks_inline(id):
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415

    xobject = xobjs[id]
    extension, byte_stream, pil_image = _xobj_to_image(cast(DictionaryObject, xobject))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=xobject.indirect_reference,
    )

665

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    The images can be addressed with:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    Each ImageFile offers the properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the method:
      `.replace(new_image: PIL.Image.Image, **kwargs)` :
          replace the image in the pdf with the new image
          applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

705

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Values found in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other ``NameObject`` is looked up in the
    ``/ColorSpace`` resources of the page; all remaining values are
    returned unchanged.

    Raises:
        PdfReadError: If a custom name is not found in the resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # raised by either of the two lookups above
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

720

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed ``~0~``, ``~1~``, ... in order of appearance; an
        empty dictionary when the page has no content or no inline image.

    Raises:
        PdfReadError: If a bare ``BI``, ``ID`` or ``EI`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect (settings, raw data) of every inline image.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append((param["settings"], param["data"]))
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )
    if not collected:
        return {}

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415

    # Second pass: turn each inline image into a stream and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in settings.items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                translated = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                translated = self._translate_value_inline_image(k, v)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            # The first value seen for a key wins.
            init.setdefault(full_key, translated)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

766

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    Valid values are the multiples of 90 degrees 0, 90, 180 and 270; the
    setter rounds to the nearest of them. This property does not affect
    ``/Contents``.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    # Not a plain integer: resolve the object to get the value.
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest quarter turn, then wrap into [0, 360).
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

781

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The rotation entry is reset to 0, the content is rotated around the
    centre of the media box and shifted so that the rotated media box
    starts at the origin; every box present on the page is then recomputed.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the other direction
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)
    center_x = float(media_box.left + media_box.width / 2)
    center_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    # Bring the lower-left corner of the rotated media box back to (0, 0).
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)
    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        left = min(corner_a[0], corner_b[0])
        bottom = min(corner_a[1], corner_b[1])
        right = max(corner_a[0], corner_b[0])
        top = max(corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject((left, bottom, right, top))

816

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

832

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the entries of one resource category of ``res2`` into ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, the merge is done in a fresh dictionary seeded
            with the entries of ``res1[resource]`` (if any); if False,
            ``res1[resource]`` itself is modified and must exist.

    Returns:
        A tuple ``(merged entries, renames)`` where ``renames`` maps each
        key of ``res2`` that was stored under a different name to that name.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        # No usable indirect reference: treat the page as not owned by a writer.
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        # NOTE: ``page2res`` and ``new_res`` are bound further down in the
        # enclosing function, before this helper is first called.
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                # Clone the raw entry into the owning document and, when the
                # clone carries an indirect reference, store that instead.
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
    # Re-insert the entries in sorted key order.
    lst = sorted(new_res.items())
    new_res.clear()
    for el in lst:
        new_res[el[0]] = el[1]
    return new_res, rename_res

913

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Rename the name operands of a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from current names to their replacements.
        pdf: Document passed to the ``ContentStream`` constructor.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the renamed operands.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Values are replaced in place; no key or position is added or removed.
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return renamed_stream

935

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: The contents to wrap in a ``ContentStream``.
        pdf: Document passed to the ``ContentStream`` constructor.
        ctm: The six matrix elements written as operands of ``cm``.

    Returns:
        The new ``ContentStream`` starting with the ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    matrix_operands = [FloatObject(element) for element in ctm]
    cm_operation = [matrix_operands, b"cm"]
    stream.operations.insert(0, cm_operation)
    return stream

952

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` resolves to a list, the data of all its items is
    concatenated.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    contents = self[PG.CONTENTS].get_object()
    if not isinstance(contents, list):
        return cast(EncodedStreamObject, contents).get_data()
    chunks = (part.get_object().get_data() for part in contents)
    return b"".join(chunks)

967

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped in a ``ContentStream``, or ``None``
        if the entry does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # The owning document is optional: fall back to None when the page has
    # no indirect reference or the reference carries no ``pdf``.
    reference = getattr(self, "indirect_reference", None)
    pdf = getattr(reference, "pdf", None)
    contents = self[PG.CONTENTS]
    if is_null_or_none(contents):
        return None
    return ContentStream(contents.get_object(), pdf)

988

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an indirect reference, ``content`` is stored directly
    under ``/Contents``. Otherwise the owning document's object table
    (``pdf._objects``) is updated so that the page references the new content.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported here rather than at module level (see the noqa marker).
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # Existing /Contents is an array: overwrite the object-table slot of
    # every element with a NullObject. Elements (or owners) lacking the
    # needed attributes are skipped.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                self.indirect_reference.pdf._objects[
                    o.indirect_reference.idnum - 1
                ] = NullObject()
            except AttributeError:
                pass

    # New content is an array: register each element with the owning
    # document and rebuild the array from what ``_add_object`` returns.
    if isinstance(content, ArrayObject):
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion requested: nothing to do when there is no /Contents.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Null the slot of the old content object, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No current /Contents exposing an indirect reference: add the new
        # content to the document and store what ``_add_object`` returns.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the indirect reference of the current /Contents: the new
        # content takes over the old object's slot in the object table.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1057

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The cropbox and other boxes of this page are not altered; the mediabox
    is only changed when ``expand`` is set.
    With ``over`` set, the content of ``page2`` is placed after this page's
    content (drawn "on top"); otherwise it is placed before it.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.
        over: set the page2 content over page1 if True (default) else under

    """
    # Plain merge: no transformation callback and no matrix are supplied.
    self._merge_page(page2, expand=expand, over=over)

1079

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page (resources, annotations and content).

    When the owning document exposes ``_add_object`` (used here to detect a
    writer), the work is delegated to ``_merge_page_writer``. Otherwise new
    resource and annotation containers are built and assigned to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before resource names are renamed.
        ctm: Transformation matrix, forwarded to ``_expand_mediabox`` and
            ``_merge_page_writer``.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, expand this page's mediabox via
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # NOTE(review): this also catches AssertionError/AttributeError raised
        # inside ``_merge_page_writer`` and then falls through to the code
        # below - confirm this fallback is intended.
        pass

    new_resources = DictionaryObject()
    rename: dict[Any, Any] = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Collect the annotation arrays of both pages, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box selected by MERGE_CROP_BOX:
        # "re" (rectangle), "W" (clip) and "n" (end path) are prepended.
        # The operands are a real list, not a ``map`` iterator, so the
        # operation can be traversed more than once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1186

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Unlike ``_merge_page``, the existing ``/Resources`` and ``/Annots`` of
    this page are extended rather than replaced, and annotations of
    ``page2`` are cloned into the owning document.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before resource names are renamed.
        ctm: Transformation matrix applied to the cloned annotations'
            ``/Rect`` and ``/QuadPoints`` and forwarded to
            ``_expand_mediabox``.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, expand this page's mediabox via
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename: dict[Any, Any] = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners of /Rect, then re-normalize so
            # the stored rectangle is (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # NOTE(review): only the first four points (eight numbers)
                # are transformed; any further entries are not carried over.
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box selected by MERGE_CROP_BOX:
        # "re" (rectangle), "W" (clip) and "n" (end path) are prepended.
        # The operands are a real list, not a ``map`` iterator, so the
        # operation can be traversed more than once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1324

1325 def _expand_mediabox(

1326 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]

1327 ) -> None:

1328 corners1 = (

1329 self.mediabox.left.as_numeric(),

1330 self.mediabox.bottom.as_numeric(),

1331 self.mediabox.right.as_numeric(),

1332 self.mediabox.top.as_numeric(),

1333 )

1334 corners2 = (

1335 page2.mediabox.left.as_numeric(),

1336 page2.mediabox.bottom.as_numeric(),

1337 page2.mediabox.left.as_numeric(),

1338 page2.mediabox.top.as_numeric(),

1339 page2.mediabox.right.as_numeric(),

1340 page2.mediabox.top.as_numeric(),

1341 page2.mediabox.right.as_numeric(),

1342 page2.mediabox.bottom.as_numeric(),

1343 )

1344 if ctm is not None:

1345 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]

1346 new_x = tuple(

1347 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]

1348 for i in range(0, 8, 2)

1349 )

1350 new_y = tuple(

1351 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]

1352 for i in range(0, 8, 2)

1353 )

1354 else:

1355 new_x = corners2[0:8:2]

1356 new_y = corners2[1:8:2]

1357 lowerleft = (min(new_x), min(new_y))

1358 upperright = (max(new_x), max(new_y))

1359 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))

1360 upperright = (

1361 max(corners1[2], upperright[0]),

1362 max(corners1[3], upperright[1]),

1363 )

1364

1365 self.mediabox.lower_left = lowerleft

1366 self.mediabox.upper_right = upperright

1367

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after applying a transformation matrix
    to its content; see :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` object whose
            ``ctm`` attribute is used instead.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def _prepend_matrix(page2_content: Any) -> ContentStream:
        # Put the ``cm`` operation in front of page2's content.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, _prepend_matrix, matrix, over, expand)

1399

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its content uniformly;
    see :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, scaling, over=over, expand=expand)

1417

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after rotating its content;
    see :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    turning = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, turning, over=over, expand=expand)

1439

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after translating its content;
    see :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shifting = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shifting, over=over, expand=expand)

1463

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Transform the four mediabox corners and take their bounding box.
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    m = tuple(float(value) for value in matrix)
    xs = [m[0] * x + m[2] * y + m[4] for x, y in corner_points]
    ys = [m[1] * x + m[3] * y + m[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1513

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the ``/Rect`` of the page's annotations, the ``/BBox`` of every
    viewport in ``/VP`` and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.mediabox = self.mediabox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # Scale every viewport, not only the first one; an empty array is
        # simply a no-op. A bare dictionary is treated as a single viewport.
        viewports = viewport if isinstance(viewport, ArrayObject) else [viewport]
        for entry in viewports:
            # Entries may be indirect references; resolve before updating.
            viewport_dict = entry.get_object()
            bbox = viewport_dict["/BBox"]
            viewport_dict[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )

1567

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly: the same factor is handed to ``scale`` for
    the horizontal and the vertical axis.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)

1578

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page so that its mediabox gets the requested dimensions.

    The per-axis factors are the requested size divided by the current
    mediabox size; they are then passed to ``scale``.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1592

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value handed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can be stored neither through
            the content's own indirect reference nor through a page owner
            that exposes ``_add_object``.

    """
    stream = self.get_contents()
    if stream is None:
        return

    compressed = stream.flate_encode(level)
    try:
        # Overwrite the slot of the existing content object.
        reference = stream.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        owner = self.indirect_reference
        if owner is None or not hasattr(owner.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1615

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the index of this page in the ``pages`` of
    the document it references.

    Returns:
        Page number; None if the page has no indirect reference or is not
        found among the document's pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1632

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    The dump lists every content-stream operation, followed by a separator
    and, for each entry of ``/Font`` in the page resources, its name, its
    representation and (when available) its ``/Encoding`` and ``/ToUnicode``
    data. ``"No Font"`` is appended when a ``KeyError`` occurs while reading
    the fonts.
    """
    pieces: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        # For TJ, also show the plain strings of the operand array.
        if operator == b"TJ":
            shown = [item for item in operands[0] if isinstance(item, str)]
        else:
            shown = []
        pieces.append(
            operator.decode("utf-8") + " " + "".join(shown) + operands.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        font_table = self[PG.RESOURCES]["/Font"]  # type:ignore
        for font_name in font_table:
            pieces.append(font_name + "\n")
            pieces.append(font_table[font_name].__repr__() + "\n")
            # Encoding and ToUnicode are optional: skip them on any failure.
            try:
                pieces.append(font_table[font_name]["/Encoding"].__repr__() + "\n")
            except Exception:
                pass
            try:
                pieces.append(
                    font_table[font_name]["/ToUnicode"].get_data().decode() + "\n"
                )
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1670

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content-stream operations
    through a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: Object holding the resources and the content to process.
        pdf: Document passed to ``ContentStream`` when the content has to
            be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no ``/Resources`` can be
        found or no content can be read.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        # Build one ``Font`` per font resource; resources that fail with
        # AttributeError/TypeError are skipped.
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        # With a string key the content is looked up in ``obj``; otherwise
        # ``obj`` itself is the content.
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Replayed as: T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Replayed as: Tw, Tc, T* and Tj on the remaining operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    # Strings are shown through Tj.
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric adjustment at least as large as the threshold
                    # emits a space, unless the text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Replayed as: TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Make sure the output so far ends with a newline; an empty
                # output raises IndexError and is left untouched.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything that is not an image is extracted as a form.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                # Best effort: a failing XObject is logged, not raised.
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and remember the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1833

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Collect the fonts used for "layout" mode text extraction.

    The page and then each ``/Parent`` in turn are visited; fonts found in
    a later (parent) node overwrite same-named fonts found earlier.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while True:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            font_table = resources["/Font"]
            for name in font_table:
                collected[name] = Font.from_font_resource(font_table[name])
        # Climb to the parent node; stop once there is none.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            break
        if node is None:
            break

    return collected

1859

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned for a page without ``/Contents`` or without any
        text show operation.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(fonts, indent=2, default=asdict),
            "utf-8"
        )

    # A page without (or with a null) /Contents has nothing to extract;
    # return "" as ``_extract_text`` does instead of raising KeyError.
    if is_null_or_none(self.get("/Contents")):
        return ""

    ops = iter(
        ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
    )
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

1918

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by visiting the text drawing commands of
    the content stream in the order in which they are stored.

    The quality of the result depends on the program that generated the
    PDF file: it is good for some files and poor for others.

    Do not rely on the order of the returned text; it will change whenever
    this function becomes more sophisticated.

    Arabic and Hebrew are extracted in the correct order. A custom range of
    RTL characters can be defined if required; see function set_custom_rtl.

    Visitor callables may be supplied to get informed about every operation
    and every text object, which can for example help to parse tables.

    Args:
        *args: positional form of ``orientations`` and ``space_width``.
            Either ``(orientations[, space_width])`` or, when the first
            value is a str, ``(str, <anything>, orientations[, space_width])``.
            Only evaluated in "plain" mode.
        orientations: orientations extract_text will look for;
            default = (0, 90, 180, 270).
            note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right). A single int is accepted as well.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode; tell the caller about
        # every one that was passed anyway.
        passed_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor_name, visitor_callable in passed_visitors:
            if visitor_callable:
                logger_warning(
                    f"Argument {visitor_name} is ignored in layout mode",
                    __name__,
                )
        # Each option is read from the "layout_mode_"-prefixed keyword and
        # falls back to the default listed here.
        layout_defaults = {
            "space_vertically": True,
            "scale_weight": 1.25,
            "strip_rotated": True,
            "debug_path": None,
            "font_height_weight": 1,
        }
        return self._layout_mode_text(
            **{
                option: kwargs.get(f"layout_mode_{option}", fallback)
                for option, fallback in layout_defaults.items()
            }
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # A leading str shifts the recognised values to positions 2 and 3;
            # position 1 is not inspected.
            if len(args) >= 3:
                if not isinstance(args[2], (tuple, int)):
                    raise TypeError(f"Invalid positional parameter {args[2]}")
                orientations = args[2]
            if len(args) >= 4:
                if not isinstance(args[3], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[3]}")
                space_width = args[3]
        elif isinstance(leading, (tuple, int)):
            orientations = leading
            if len(args) >= 2:
                if not isinstance(args[1], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[1]}")
                space_width = args[1]
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

    # A lone orientation is normalised to a one-element tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2054

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    Args:
        xform: the XObject whose text is extracted.
        orientations: orientations (in degrees) of the text to keep.
            Defaults to (0, 90, 180, 270), the same set extract_text uses.
        space_width: force default space width if not extracted from font
            (default 200).
        visitor_operand_before: forwarded unchanged to the extraction
            routine; see extract_text for the call signature.
        visitor_operand_after: forwarded unchanged to the extraction
            routine; see extract_text for the call signature.
        visitor_text: forwarded unchanged to the extraction routine;
            see extract_text for the call signature.

    Returns:
        The extracted text

    """
    # The content key is None here, whereas extract_text passes PG.CONTENTS.
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2089

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the
        second set is every font found minus the embedded ones.

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2105

# Page boundary boxes. Every accessor is produced by _create_rectangle_accessor
# from the box's own key and a tuple of further box keys.
# NOTE(review): the tuple presumably lists the keys consulted, in order, when
# the box itself is missing (the cropbox text below says it defaults to the
# mediabox) -- confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2137

@property
def annotations(self) -> Optional[ArrayObject]:
    """Return the page's "/Annots" entry, or None when the page has none."""
    if "/Annots" in self:
        return cast(ArrayObject, self["/Annots"])
    return None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Passing None removes an existing "/Annots" entry and does nothing
    when there is none.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2159

2160

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages whose length and items come from two callables.

    The list stores no pages itself: ``len()`` calls ``length_function`` and
    integer indexing calls ``get_function``. Deleting an item edits the page
    tree that the page belongs to.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page stored at a non-negative index.

        """
        self.length_function = length_function
        self.get_function = get_function
        # Not read anywhere inside this class.
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` lies outside the list.

        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The sub-list stays lazy: it maps its own positions back onto
            # this list through the resolved range.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page tree.

        The page reference is removed from its parent's "/Kids" and the
        parent's "/Count" (when present) is decremented. A parent left without
        kids is removed from its own parent in the same way, and so on upwards.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` lies outside the list.
            PdfReadError: the page is not listed in its parent's "/Kids".

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            kids = cast(ArrayObject, parent["/Kids"])
            try:
                position = kids.index(ind)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                # Reached when the previous pass left the parent populated:
                # the reference was already removed, so there is nothing left
                # to do.
                break
            del kids[position]
            if first:
                # Only the page itself has an entry in the flat page list;
                # unlinking an emptied ancestor must not drop another page.
                try:
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    # Best effort: not every owner has a flat page list.
                    pass
                first = False
            if "/Count" in parent:
                parent[NameObject("/Count")] = NumberObject(
                    cast(int, parent["/Count"]) - 1
                )
            if len(kids) == 0:
                # No more objects in this part of this subtree: continue one
                # level up and unlink the now empty node itself.
                ind = parent.indirect_reference
                parent = parent.get("/Parent", None)

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2253

2254

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Fonts are looked up in "/DR" and "/Resources" of ``obj``; XObjects,
    annotations ("/Annots") and normal appearances ("/AP" -> "/N") are
    visited recursively.

    Args:
        obj: dictionary to inspect. The caller in this file passes a page
            dictionary; the recursion passes XObjects, annotations and
            appearance entries.
        fnt: set receiving the name of every font found; updated in place.
        emb: set receiving the name of every embedded font; updated in place.

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has a '/CharProcs' key, or when its
    '/FontDescriptor' (or the one of its first descendant font) has a key
    'FontFilex' (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    font_file_keys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font_dict: DictionaryObject) -> bool:
        # True when the font's descriptor carries one of the /FontFile* keys.
        if "/FontDescriptor" not in font_dict:
            return False
        descriptor = cast("DictionaryObject", font_dict["/FontDescriptor"])
        return any(key in descriptor for key in font_file_keys)

    def process_font(f: DictionaryObject) -> None:
        # Record one font in fnt and, when embedded, in emb.
        f = cast("DictionaryObject", f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            descendants = cast("ArrayObject", f["/DescendantFonts"])
            # Only the first descendant is inspected; an empty array simply
            # means "not embedded" instead of failing on descendants[0].
            is_embedded = len(descendants) > 0 and has_font_file(
                cast("DictionaryObject", descendants[0].get_object())
            )
        if is_embedded:
            # A font without /BaseFont (possible in the /CharProcs case) is
            # recorded by its /Subtype in parentheses.
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast("DictionaryObject", obj["/DR"])
        if "/Font" in default_resources:
            # Iterate the font dictionaries (values), not their resource names.
            for font in cast("DictionaryObject", default_resources["/Font"]).values():
                process_font(font)
    if "/Resources" in obj:
        resources = cast("DictionaryObject", obj["/Resources"])
        if "/Font" in resources:
            for font in cast("DictionaryObject", resources["/Font"]).values():
                process_font(font)
        if "/XObject" in resources:
            for xobject in cast("DictionaryObject", resources["/XObject"]).values():
                _get_fonts_walk(
                    cast("DictionaryObject", xobject.get_object()), fnt, emb
                )
    if "/Annots" in obj:
        for annotation in cast("ArrayObject", obj["/Annots"]):
            _get_fonts_walk(cast("DictionaryObject", annotation.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast("DictionaryObject", cast("DictionaryObject", obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" maps state names to appearances: walk the appearances
            # (values), not the state names.
            for appearance in normal.values():
                _get_fonts_walk(
                    cast("DictionaryObject", appearance.get_object()), fnt, emb
                )
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

918 statements