Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from copy import deepcopy

33from dataclasses import asdict, dataclass

34from decimal import Decimal

35from io import BytesIO

36from pathlib import Path

37from typing import (

38 Any,

39 Callable,

40 Literal,

41 Optional,

42 Union,

43 cast,

44 overload,

45)

47from ._font import Font

48from ._protocols import PdfCommonDocProtocol

49from ._text_extraction import (

50 _layout_mode,

51)

52from ._text_extraction._text_extractor import TextExtraction

53from ._utils import (

54 CompressedTransformationMatrix,

55 TransformationMatrixType,

56 _human_readable_bytes,

57 deprecate,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .generic import (

68 ArrayObject,

69 ContentStream,

70 DictionaryObject,

71 EncodedStreamObject,

72 FloatObject,

73 IndirectObject,

74 NameObject,

75 NullObject,

76 NumberObject,

77 PdfObject,

78 RectangleObject,

79 StreamObject,

80 is_null_or_none,

81)

83try:

84 from PIL.Image import Image

86 pil_not_imported = False

87except ImportError:

88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10

89 pil_not_imported = True # error will be raised only when using images

91MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Look up the box stored under ``name`` and return it as a RectangleObject.

    Args:
        self: The dictionary-like object (a page) holding the boxes.
        name: Key of the requested box.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The stored rectangle. When the stored value is not already a
        ``RectangleObject``, it is converted and written back under ``name``.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Fast path: nothing to resolve or cache.
        return box
    if is_null_or_none(box):
        # Fall back to the first alternative key that yields a value.
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    # Cache the converted value so later reads take the fast path.
    _set_rectangle(self, name, rectangle)
    return rectangle

108

109

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` on ``self`` under ``name`` wrapped as a ``NameObject``."""
    key = NameObject(name)
    self[key] = value

112

113

114def _delete_rectangle(self: Any, name: str) -> None:

115 del self[name]

116

117

118def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

119 return property(

120 lambda self: _get_rectangle(self, name, fallback),

121 lambda self, value: _set_rectangle(self, name, value),

122 lambda self: _delete_rectangle(self, name),

123 )

124

125

class Transformation:
    """
    Describe a 2D transformation of page content.

    A transformation between two coordinate systems is a 3-by-3 matrix of
    the following form::

        a b 0
        c d 0
        e f 1

    Only six entries of that matrix can vary, so PDF usually writes it as the
    six-element array [ a b c d e f ].

    A point is transformed by multiplying its row vector with the matrix::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand the six stored values into the full 3-by-3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        top_row = (values[0], values[1], 0)
        middle_row = (values[2], values[3], 0)
        bottom_row = (values[4], values[5], 1)
        return (top_row, middle_row, bottom_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        top_row, middle_row, bottom_row = matrix[0], matrix[1], matrix[2]
        return (
            top_row[0],
            top_row[1],
            middle_row[0],
            middle_row[1],
            bottom_row[0],
            bottom_row[1],
        )

    def _to_cm(self) -> str:
        # Render the matrix as the operands of a content-stream "cm" operator,
        # each value with four decimals.
        operands = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return operands + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis. Defaults to ``sy`` when omitted.
            sy: The scale factor along the y-axis. Defaults to ``sx`` when omitted.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        # Both factors are set at this point; the asserts only narrow the types.
        assert sx is not None
        assert sy is not None
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        radians = math.radians(rotation)
        cosine = math.cos(radians)
        sine = math.sin(radians)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list when ``pt`` is a list,
            otherwise a tuple.

        """
        x = float(pt[0])
        y = float(pt[1])
        if as_object:
            wrap = FloatObject
        else:
            wrap = float
        new_x = wrap(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4])
        new_y = wrap(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

320

321

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image object
        of that page replaces the object behind ``indirect_reference``, and
        ``name``, ``data`` and ``image`` of this instance are refreshed.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow produce a PDF holding the image, then read it back.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Overwrite the stored object in place so existing references to it
        # now resolve to the new image.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap only the extension. ``str.rfind`` returns -1 when the name has
        # no dot; slicing with -1 would silently drop the last character.
        dot_position = self.name.rfind(".")
        stem = self.name if dot_position == -1 else self.name[:dot_position]
        self.name = stem + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data added before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

414

415

class VirtualListImages(Sequence[ImageFile]):
    """
    Sequence-like view over the images referenced within a page.

    Image ids are obtained on demand from ``ids_function`` and each image is
    built on access through ``get_function``.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        ids = self.ids_function()
        return len(ids)

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the image ids as given by ``ids_function``."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(id, image)`` pairs for every image id."""
        pairs = []
        for image_id in self.ids_function():
            pairs.append((image_id, self[image_id]))
        return pairs

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        ids = self.ids_function()
        if isinstance(index, slice):
            # A slice yields another lazy view restricted to the selected ids.
            selected = [ids[position] for position in range(*index.indices(len(self)))]
            return type(self)((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            # Image ids are handed to the getter as they are.
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        count = len(ids)
        # support negative indexes
        position = index + count if index < 0 else index
        if position < 0 or position >= count:
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[position])

    def __iter__(self) -> Iterator[ImageFile]:
        yield from map(self.__getitem__, range(len(self)))

    def __str__(self) -> str:
        labels = (
            f"Image_{position}={image_id}"
            for position, image_id in enumerate(self.ids_function())
        )
        return "[" + ", ".join(labels) + "]"

477

478

479class PageObject(DictionaryObject):

480 """

481 PageObject represents a single page within a PDF file.

482

483 Typically these objects will be created by accessing the

484 :attr:`pages<pypdf.PdfReader.pages>` property of the

485 :class:`PdfReader<pypdf.PdfReader>` class, but it is

486 also possible to create an empty page with the

487 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

488

489 Args:

490 pdf: PDF file the page belongs to.

491 indirect_reference: Stores the original indirect reference to

492 this object in its source PDF

493

494 """

495

496 original_page: "PageObject" # very local use in writer when appending

497

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Set up the page and, when a reference is given, copy its dictionary.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object in its source PDF;
            the entries of the referenced dictionary are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled lazily the first time the images of the page are requested.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if not is_null_or_none(indirect_reference):
        assert indirect_reference is not None, "mypy"
        source_page = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source_page)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

511

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is computed with ``DictionaryObject`` as type marker so
    that a page gives the same result as a DictionaryObject holding the same
    entries.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

526

def hash_value_data(self) -> bytes:
    """Return the parent's hash data with the ``id()`` of this page appended."""
    identity_suffix = f"{id(self)}".encode()
    return super().hash_value_data() + identity_suffix

531

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is a multiple of 1/72 inch: a value of 1 means a user space
    unit is 1/72 inch, and a value of 3 means a user space unit is
    3/72 inch. When the page has no such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

542

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

588

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    Collect the ids of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are inspected; the page itself
            when ``None``.
        ancest: Names of the form XObjects traversed to reach ``obj``.
        call_stack: Indirect references already visited, used to stop on
            form XObjects that reference each other.

    Returns:
        The ids of the image XObjects (a plain name at the top level, a list
        ``[*ancest, name]`` inside forms), followed by the inline image ids.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: stop here to avoid infinite recursion.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # Inline images are read from the page's own contents, so only the
    # outermost call (empty ``ancest``) reports them. Adding them again for
    # every nested form XObject would list the same ids several times.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

626

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ImageFile identified by ``id``.

    Args:
        id: Name of an image XObject, a ``~n~`` inline image id, or a
            sequence of names leading through form XObjects to the image.
        obj: Object whose ``/Resources`` are searched; the page itself
            when ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If the resources or the image cannot be found.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path with a single element is just a plain name.
        id = id[0]
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObject resources only matter for non-inline ids.
        if id[0] != "~" or id[-1] != "~":
            raise
    if not isinstance(id, str):
        # in a subobject
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
    imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    extension, byte_stream = imgd[:2]
    # The leading character of the id is dropped to form the file name.
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=imgd[2],
        indirect_reference=xobjs[id].indirect_reference,
    )

665

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    The images can be addressed with:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_image_ids = self._get_ids_image
    load_image = self._get_image
    return VirtualListImages(list_image_ids, load_image)

705

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Values found in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Any other ``NameObject`` is looked up in the
    ``/ColorSpace`` resources of the page; remaining values are returned
    unchanged.

    Raises:
        PdfReadError: If a custom name is not found in the resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

720

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images found in the contents of the page.

    Returns:
        The images keyed by ``~n~``, where ``n`` is the position of the
        image among the inline images of the contents (starting at 0).
        Empty when the page has no contents.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: gather settings and raw data of every inline image.
    inline_operands = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            inline_operands.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(inline_operands):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in settings.items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                v = self._translate_value_inline_image(k, v)
            # Map the key through _INLINE_IMAGE_KEY_MAPPING before storing.
            k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            if k not in init:
                init[k] = v
        stream_object = EncodedStreamObject.initialize_from_dictionary(init)
        # Imported only when an inline image is actually present.
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream_object)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

766

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.
    When the page has no rotation entry, 0 is returned.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    # Not a plain number: resolve it first.
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest multiple of 90 and wrap into the range 0..270.
    quarter_turns = (int(r) + 45) // 90
    normalized = (quarter_turns * 90) % 360
    self[NameObject(PG.ROTATE)] = NumberObject(normalized)

781

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The rotation of the page is reset to 0, the contents are rotated
    around the centre of the media box and moved back so that the rotated
    media box starts at the origin, and every box present on the page is
    recomputed accordingly.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # the rotation to apply goes the other way
    self.rotation = 0
    media = RectangleObject(self.mediabox)
    center_x = float(media.left + media.width / 2)
    center_y = float(media.bottom + media.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)
    # Shift the rotated media box so that its lower-left corner is at (0, 0).
    corner_a = trsf.apply_on(media.lower_left)
    corner_b = trsf.apply_on(media.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)
    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        # The two corners may have swapped roles after the rotation.
        self[NameObject(box_name)] = RectangleObject(
            (
                min(corner_a[0], corner_b[0]),
                min(corner_a[1], corner_b[1]),
                max(corner_a[0], corner_b[0]),
                max(corner_a[1], corner_b[1]),
            )
        )

816

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

832

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: When true, the merge is done on a new dictionary holding
            the entries of ``res1[resource]``; otherwise ``res1[resource]``
            itself is modified.

    Returns:
        A tuple ``(merged resources, renames)`` where ``renames`` maps each
        key of ``res2`` that had to be stored under another name to that
        new name. The merged resources are sorted by key.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        # expect isinstance(pdf, PdfWriter)
        is_pdf_writer = hasattr(pdf, "_add_object")
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # Start with the key itself (e.g. "foo"), then try "foo-0", "foo-1",
        # etc. new_res holds finitely many keys, so the loop always ends.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # a resource of this name with the exact same value exists
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            # Store the reference of the clone when it has one.
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries are ordered by key.
    ordered_entries = sorted(new_res.items())
    new_res.clear()
    for entry_key, entry_value in ordered_entries:
        new_res[entry_key] = entry_value
    return new_res, rename_res

913

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Apply the name replacements of ``rename`` to the operands of ``stream``.

    Args:
        stream: Content stream to process.
        rename: Mapping from current names to their replacement.
        pdf: Document passed on to the new ``ContentStream``.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` built from it with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dict.

    """
    if not rename:
        return stream
    renamed = ContentStream(stream, pdf)
    for operands, _operator in renamed.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for position, operand in entries:
            # Only names can refer to resources; other operands are kept.
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return renamed

935

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: Contents to wrap into a new ``ContentStream``.
        pdf: Document passed on to the new ``ContentStream``.
        ctm: The values of the matrix, used as operands of ``cm``.

    Returns:
        The new ``ContentStream`` starting with the ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    cm_operands = [FloatObject(value) for value in ctm]
    operations.insert(0, [cm_operands, b"cm"])
    return stream

952

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list of streams, their data is concatenated.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if not isinstance(obj, list):
        return cast(EncodedStreamObject, obj).get_data()
    chunks = (part.get_object().get_data() for part in obj)
    return b"".join(chunks)

967

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped into a ``ContentStream``, or
        ``None`` if it does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # No usable reference on this page: build the stream without document.
        pdf = None
    obj = self[PG.CONTENTS]
    if is_null_or_none(obj):
        return None
    return ContentStream(obj.get_object(), pdf)

988

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an ``indirect_reference`` the content is stored
    directly under ``/Contents`` and the method returns immediately.
    Otherwise the object behind ``indirect_reference.pdf`` is used to
    register or replace the content objects; a deprecation is emitted when
    that object is not a ``PdfWriter``.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally (the lint rule for non-top-level imports is silenced).
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # NOTE: despite its name, ``writer`` may be a non-writer object here
    # (see the deprecation above); the except clauses below cover that case.
    writer = self.indirect_reference.pdf
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Existing contents are an array: overwrite every referenced entry
        # with a null object.
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    if isinstance(content, ArrayObject):
        # Register each element and keep what ``_add_object`` returns.
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nothing to do when there is no /Contents entry. Note
        # that this early return also skips the inline_images reset below.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing entry carrying an ``indirect_reference`` attribute
        # (this includes a missing /Contents): register the new content.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the reference of the existing /Contents entry for the new
        # content, then swap the object stored behind that reference.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1051

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered.
    With the default ``over=True`` the content stream of ``page2`` is
    added after this page's content stream, so it is drawn "on top" of
    this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # Plain merge: no content transformation and no matrix are passed on.
    self._merge_page(page2, expand=expand, over=over)

1073

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page (resources, annotations and contents).

    If this page's ``indirect_reference.pdf`` has an ``_add_object``
    attribute, the work is delegated to ``_merge_page_writer``. Otherwise
    new resource and annotation containers are built and assigned to
    this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before renaming and merging.
        ctm: Matrix forwarded to ``_merge_page_writer`` and
            ``_expand_mediabox``.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, ``_expand_mediabox`` is called.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            # NOTE(review): the call sits inside the try, so an
            # AssertionError/AttributeError raised within
            # _merge_page_writer is also swallowed and execution falls
            # through to the code below.
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Collect the annotation arrays of both pages, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    # Merge each resource category and accumulate the renames reported
    # by _merge_resources.
    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Prepend "re", "W", "n" operations built from the box selected by
        # MERGE_CROP_BOX on page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        # NOTE(review): the operands are a one-shot ``map`` iterator, not a
        # list — presumably consumed when the stream is serialised; confirm.
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    # /Annots is assigned even when no annotations were collected.
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1180

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Unlike ``_merge_page``, the existing ``/Resources`` dictionary and
    ``/Annots`` array of this page are modified directly, and annotations
    of ``page2`` are cloned into ``self.indirect_reference.pdf``.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before renaming and merging.
        ctm: Matrix applied to the ``/Rect`` and ``/QuadPoints`` of the
            cloned annotations and forwarded to ``_expand_mediabox``.
        over: Append the ``page2`` content after this page's content if
            True, else insert it before.
        expand: If True, ``_expand_mediabox`` is called.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            # Only the rename mapping of the result is used here.
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            # Clone into the target document, leaving out the listed fields.
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners of /Rect, then re-normalise
            # them with min/max.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # Only the first eight values (four points) are transformed.
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            # Point the popup (if any) back at the cloned annotation.
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            # Attach the clone to this page and register it in /Annots.
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Prepend "re", "W", "n" operations built from the box selected by
        # MERGE_CROP_BOX on page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        # NOTE(review): the operands are a one-shot ``map`` iterator, not a
        # list — presumably consumed when the stream is serialised; confirm.
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    # The array itself is handed over (not wrapped in a ContentStream).
    self.replace_contents(new_content_array)

1318

1319 def _expand_mediabox(

1320 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]

1321 ) -> None:

1322 corners1 = (

1323 self.mediabox.left.as_numeric(),

1324 self.mediabox.bottom.as_numeric(),

1325 self.mediabox.right.as_numeric(),

1326 self.mediabox.top.as_numeric(),

1327 )

1328 corners2 = (

1329 page2.mediabox.left.as_numeric(),

1330 page2.mediabox.bottom.as_numeric(),

1331 page2.mediabox.left.as_numeric(),

1332 page2.mediabox.top.as_numeric(),

1333 page2.mediabox.right.as_numeric(),

1334 page2.mediabox.top.as_numeric(),

1335 page2.mediabox.right.as_numeric(),

1336 page2.mediabox.bottom.as_numeric(),

1337 )

1338 if ctm is not None:

1339 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]

1340 new_x = tuple(

1341 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]

1342 for i in range(0, 8, 2)

1343 )

1344 new_y = tuple(

1345 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]

1346 for i in range(0, 8, 2)

1347 )

1348 else:

1349 new_x = corners2[0:8:2]

1350 new_y = corners2[1:8:2]

1351 lowerleft = (min(new_x), min(new_y))

1352 upperright = (max(new_x), max(new_y))

1353 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))

1354 upperright = (

1355 max(corners1[2], upperright[0]),

1356 max(corners1[3], upperright[1]),

1357 )

1358

1359 self.mediabox.lower_left = lowerleft

1360 self.mediabox.upper_right = upperright

1361

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is
            accepted as well, in which case its ``ctm`` attribute is used
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def _prepend_matrix(page2_content: Any) -> ContentStream:
        # ``page2.pdf`` is looked up when the merge invokes this callback.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, _prepend_matrix, matrix, over, expand)

1393

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, scaling, over, expand)

1411

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    turning = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, turning, over, expand)

1433

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shifting = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shifting, over, expand)

1457

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Transform the four mediabox corners and take their bounding box.
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    m = tuple(float(value) for value in matrix)
    xs = [m[0] * x + m[2] * y + m[4] for x, y in corner_points]
    ys = [m[1] * x + m[3] * y + m[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1507

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the contents, this rescales the bleedbox, trimbox, artbox,
    cropbox and mediabox, the ``/Rect`` array of every annotation and the
    ``/BBox`` of the viewport entry (for an array of viewports: of its
    first element only).

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # The order of the boxes is kept as is.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Factor to apply to each of the four rectangle coordinates.
    factors = (sx, sy, sx, sy)

    annotations = self[PG.ANNOTS] if PG.ANNOTS in self else None
    if isinstance(annotations, ArrayObject):
        for annotation in annotations:
            annotation_obj = annotation.get_object()
            if ADA.Rect not in annotation_obj:
                continue
            rectangle = annotation_obj[ADA.Rect]
            if not isinstance(rectangle, ArrayObject):
                continue
            for index, factor in enumerate(factors):
                rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP not in self:
        return

    viewport = self[PG.VP]
    viewport_is_array = isinstance(viewport, ArrayObject)
    if viewport_is_array:
        bbox = viewport[0]["/BBox"]
    else:
        bbox = viewport["/BBox"]  # type: ignore
    scaled_bbox = RectangleObject(
        tuple(float(bbox[index]) * factor for index, factor in enumerate(factors))
    )
    if viewport_is_array:
        self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
            NameObject("/BBox")
        ] = scaled_bbox
    else:
        self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1561

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1572

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested dimensions divided by the current
    mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    factors = (
        width / float(self.mediabox.width),
        height / float(self.mediabox.height),
    )
    self.scale(*factors)

1586

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed on to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own reference nor through ``replace_contents``.

    """
    stream = self.get_contents()
    if stream is None:
        return

    compressed = stream.flate_encode(level)
    try:
        # Overwrite the slot of the content's reference in its document.
        reference = stream.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1609

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        The index of this page in the ``pages`` of its document; None if
        the page has no indirect reference or is not found there.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1626

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page's operations and fonts for debugging.

    Returns:
        One line per content stream operation, a separator line, then for
        each font its name, its repr and, where available, the repr of
        its ``/Encoding`` and the decoded ``/ToUnicode`` data.
        ``"No Font\\n"`` is appended when a ``KeyError`` occurs while
        listing the fonts.

    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        # Only TJ operations contribute their string operands.
        if operator == b"TJ":
            strings = [item for item in operands[0] if isinstance(item, str)]
        else:
            strings = []
        pieces.append(
            operator.decode("utf-8") + " " + "".join(strings) + operands.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            pieces.append(font_name + "\n")
            pieces.append(self[PG.RESOURCES]["/Font"][font_name].__repr__() + "\n")  # type:ignore
            try:
                encoding_repr = self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                    "/Encoding"
                ].__repr__()
                pieces.append(encoding_repr + "\n")
            except Exception:
                pass
            try:
                to_unicode = self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                    "/ToUnicode"
                ]
                pieces.append(to_unicode.get_data().decode() + "\n")
            except Exception:
                pass

    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1664

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content stream operations.

    See extract_text for most arguments.

    Args:
        obj: The object whose resources and content are read.
        pdf: Passed to :class:`ContentStream` when the content has to be
            wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources can be found
        or no content can be read.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        # Build a Font for every entry; entries raising AttributeError or
        # TypeError are skipped.
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        # A string key selects an entry of ``obj``; otherwise ``obj`` itself
        # is the content.
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as Tw, Tc, T* and Tj with the respective operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold
                    # yields a space, unless the text is empty or already
                    # ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as TL with the negated second operand, followed by Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline; an empty output
                # raises IndexError and is left untouched.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything but an image is handed to extract_xform_text.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and remember the current matrices,
                # whether or not the XObject could be processed.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1827

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The ``/Font`` resources of the page and of each ``/Parent`` up the
    chain are collected. An entry found later in the chain replaces an
    earlier one with the same name.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}

        if "/Font" in resources and self.pdf is not None:
            font_entries = resources["/Font"]
            for name in font_entries:
                collected[name] = Font.from_font_resource(font_entries[name])

        # Move one level up; stop once there is no /Parent entry.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected

1853

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. Empty if
        no text show operations were found.

    """
    page_fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(page_fonts, indent=2, default=asdict)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    text_groups = _layout_mode.text_show_operations(
        iter(stream.operations), page_fonts, strip_rotated, debug_path
    )
    if not text_groups:
        return ""

    line_groups = _layout_mode.y_coordinate_groups(text_groups, debug_path)
    average_char_width = _layout_mode.fixed_char_width(text_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        line_groups, average_char_width, space_vertically, font_height_weight
    )

1912

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page from its text drawing commands.

    The commands are visited in the order in which they appear in the
    content stream. Results are good for some PDF files and poor for
    others, depending on the generator that produced them; this is expected
    to be refined in the future.

    Do not rely on the order of the returned text: it will change if this
    function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order. A custom range of
    RTL characters can be defined if required; see function set_custom_rtl.

    Visitor callbacks can be supplied to be informed about every operation
    and every text object, which can for example help to parse tables.

    Args:
        orientations: orientations extract_text will look for, either a
            single int or a tuple of ints.
            default = (0, 90, 180, 270)
            note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: extraction_mode is neither "plain" nor "layout".
        TypeError: a positional argument has an unexpected type ("plain" mode only).

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Layout mode cannot honour visitors: warn once for each one supplied.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor, callback in supplied_visitors:
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # A leading str selects the positional layout in which
            # orientations and space_width come third and fourth.
            orientations_at = 2
        elif isinstance(leading, (tuple, int)):
            orientations_at = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")
        space_width_at = orientations_at + 1

        if len(args) > orientations_at:
            candidate = args[orientations_at]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > space_width_at:
            candidate = args[space_width_at]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    # A single orientation is accepted as a bare int; normalise it to a tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2048

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text``, with
    ``xform`` as the object to read and ``self.pdf`` as the document.

    Args:
        xform: the XObject whose text is extracted.
        orientations: orientations of text to look for.
            default = (0, 90, 180, 270), the same default as extract_text.
        space_width: force default space width
            if not extracted from font (default: 200)
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        # extract_text passes PG.CONTENTS in this position; None is passed here.
        # NOTE(review): presumably the key under which the content stream is
        # looked up, None meaning "xform is the stream itself" — confirm
        # against _extract_text.
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2083

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Report which of the fonts collected for this object are embedded.

    The collection itself is delegated to ``_get_fonts_walk``.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the second
        set is every collected font name that is absent from the first.

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, all_fonts - embedded

2099

# Page boundary boxes. Each attribute below is built by
# _create_rectangle_accessor from the key of the box and a tuple of other box
# keys (empty for the media box).
# NOTE(review): the tuple presumably names, in order, the boxes to fall back on
# when the box itself is not set — confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2131

@property
def annotations(self) -> Optional[ArrayObject]:
    """The "/Annots" entry of the page, or None when the page has none."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Replace or remove the annotations array of the page.

    Passing None removes the "/Annots" entry (doing nothing when it is
    already absent); any other value is stored under "/Annots".

    You normally want to append to the existing array and not replace it.
    When appending, add the object to the writer first and append only the
    indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
        return
    if "/Annots" in self:
        del self[NameObject("/Annots")]

2153

2154

class _VirtualList(Sequence[PageObject]):
    """
    List-like view whose length and items are obtained through callables.

    Nothing is cached here: every ``len()`` call and every item access is
    answered by the two functions given to the constructor.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Store the callables backing the view.

        Args:
            length_function: returns the current number of items.
            get_function: returns the item at a non-negative position.

        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one item, or a new view of the same type for a slice.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is outside the sequence.

        """
        if isinstance(index, slice):
            # The sliced view maps its own positions onto positions of self.
            selected = range(*index.indices(len(self)))
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # a negative index counts from the end
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one item, or every item selected by a slice, from its "/Kids" tree.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is outside the sequence.
            PdfReadError: the item is not listed in the "/Kids" of its "/Parent".

        """
        if isinstance(index, slice):
            # Work from the highest position down to the lowest, so that the
            # positions still to be handled are not shifted by a removal.
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # handled by the int branch below
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # a negative index counts from the end
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")
        ref = self[index].indirect_reference
        assert ref is not None
        node: Optional[PdfObject] = cast(DictionaryObject, ref.get_object()).get(
            "/Parent", None
        )
        nothing_removed = True
        while node is not None:
            node = cast(DictionaryObject, node.get_object())
            try:
                kids = cast(ArrayObject, node["/Kids"])
                del kids[kids.index(ref)]
                nothing_removed = False
                try:
                    assert ref is not None
                    del ref.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in node:
                    node[NameObject("/Count")] = NumberObject(
                        cast(int, node["/Count"]) - 1
                    )
                if len(kids) == 0:
                    # This node has no kids left: the next round removes the
                    # node itself from its own parent.
                    ref = node.indirect_reference
                    node = node.get("/Parent", None)
            except ValueError:  # raised by kids.index
                if nothing_removed:
                    raise PdfReadError(f"Page not found in page tree: {ref}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        for position in range(len(self)):
            yield self[position]

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return f"[{labels}]"

2247

2248

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk looks at the fonts under "/DR" and "/Resources" of ``obj`` and
    recurses into the XObjects of "/Resources", the entries of "/Annots" and
    the normal appearance ("/N") of "/AP".

    Args:
        obj: dictionary to inspect (a page, an XObject, an annotation or an
            appearance stream).
        fnt: set receiving the names of the fonts used; updated in place.
        emb: set receiving the names of the embedded fonts; updated in place.

    Returns:
        A tuple (fnt, emb), the same two sets that were passed in.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has a '/CharProcs' entry, or when its
    font descriptor (or the font descriptor of its first descendant font)
    has a 'FontFilex' key (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the font descriptor of ``font`` has a font file key."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: Any) -> None:
        """Record one entry of a font resource dictionary in fnt and emb."""
        f = f.get_object()  # to be sure
        if not isinstance(f, DictionaryObject):
            # Malformed resource entry: nothing that can describe a font.
            return
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = has_font_file(descendant)

        if is_embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No base font name: fall back to the bracketed subtype.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate the VALUES: iterating the dictionary itself yields only the
        # resource names, never the font dictionaries.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" is not marked as an XObject: walk each of its values. As
            # above, the values (not the keys) hold the objects; anything that
            # is not a dictionary cannot carry resources and is skipped.
            for a in normal.values():
                state = a.get_object()
                if isinstance(state, DictionaryObject):
                    _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

919 statements