Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from dataclasses import dataclass

33from decimal import Decimal

34from io import BytesIO

35from pathlib import Path

36from typing import (

37 Any,

38 Callable,

39 Literal,

40 Optional,

41 Union,

42 cast,

43 overload,

44)

46from ._cmap import (

47 build_char_map,

48)

49from ._protocols import PdfCommonDocProtocol

50from ._text_extraction import (

51 _layout_mode,

52)

53from ._text_extraction._text_extractor import TextExtraction

54from ._utils import (

55 CompressedTransformationMatrix,

56 TransformationMatrixType,

57 _human_readable_bytes,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .filters import _xobj_to_image

68from .generic import (

69 ArrayObject,

70 ContentStream,

71 DictionaryObject,

72 EncodedStreamObject,

73 FloatObject,

74 IndirectObject,

75 NameObject,

76 NullObject,

77 NumberObject,

78 PdfObject,

79 RectangleObject,

80 StreamObject,

81 is_null_or_none,

82)

try:
    # Pillow is an optional dependency: try to import its ``Image`` class.
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Placeholder so that annotations such as ``Optional[Image]`` still resolve.
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images

# NOTE(review): presumably names the page box used when merging pages; the
# code reading this constant is outside this view - confirm.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, consulting fallbacks if needed.

    Args:
        self: The mapping-like object holding the boxes; ``self.pdf`` is used
            to resolve indirect references.
        name: Key of the requested rectangle.
        defaults: Keys tried in order when ``name`` holds no usable value.

    Returns:
        The value as a ``RectangleObject``. Unless the stored value already
        was a ``RectangleObject``, the result is written back under ``name``.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box
    if is_null_or_none(box):
        # Take the first fallback entry that yields something other than None.
        for fallback_key in defaults:
            box = self.get(fallback_key)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

109

110

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """
    Store ``value`` on ``self`` under ``name``.

    Args:
        self: The mapping-like object receiving the entry.
        name: Key of the entry; it is wrapped in a ``NameObject`` first.
        value: The value to store.

    """
    key = NameObject(name)
    self[key] = value

113

114

def _delete_rectangle(self: Any, name: str) -> None:
    """
    Remove the entry stored under ``name`` from ``self``.

    The deletion is a plain ``del self[name]``; no presence check is done
    here, so whatever ``self`` raises for a missing key propagates.

    Args:
        self: The mapping-like object to delete from.
        name: Key of the entry to remove.

    """
    del self[name]

117

118

119def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

120 return property(

121 lambda self: _get_rectangle(self, name, fallback),

122 lambda self, value: _set_rectangle(self, name, value),

123 lambda self: _delete_rectangle(self, name),

124 )

125

126

class Transformation:
    """
    Represent a 2D transformation.

    A transformation between two coordinate systems is described by a
    3-by-3 matrix of the following form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, which is why PDF usually writes it
    as the six-element array [ a b c d e f ].

    Transforming a point is a matrix multiplication::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand the six stored values to the full 3x3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3x3 transformation matrix to the tuple (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The first two entries of each of the three rows, row by row.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (
            top[0],
            top[1],
            middle[0],
            middle[1],
            bottom[0],
            bottom[1],
        )

    def _to_cm(self) -> str:
        # Render the six values with four decimals, followed by the ``cm`` operator.
        components = [self.ctm[position] for position in range(6)]
        return " ".join(f"{component:.4f}" for component in components) + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        # A missing factor takes the value of the one that was given.
        if sx is None:
            sx = sy
        if sy is None:
            sy = sx
        # Both are set at this point; the asserts narrow the Optional types.
        assert sx is not None
        assert sy is not None
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        combined = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(combined))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        combined = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(combined))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` was a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        ctm = self.ctm
        new_x = convert(float(pt[0]) * ctm[0] + float(pt[1]) * ctm[2] + ctm[4])
        new_y = convert(float(pt[0]) * ctm[1] + float(pt[1]) * ctm[3] + ctm[5])
        transformed = (new_x, new_y)
        if isinstance(pt, list):
            return list(transformed)
        return transformed

321

322

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Save the new image as a one-page PDF in memory and read it back to
        # obtain the image object to store.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        assert reader.pages[0].images[0].indirect_reference is not None
        # Overwrite the object slot of the current image with the new object.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            reader.pages[0].images[0].indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        # Swap the extension. ``rpartition`` leaves a name without any "."
        # intact, whereas slicing with ``rfind`` (which returns -1 in that
        # case) would silently drop the last character of the name.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

413

414

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list of image identifiers is not cached: it is obtained from
    ``ids_function`` whenever it is needed, and images are built on demand
    by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers are resolved directly: going through ``self[x]`` would
        # recompute the complete identifier list once per image.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image (by position or identifier) or a sub-sequence (by slice).

        Raises:
            TypeError: If ``index`` is of an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Slice the identifiers computed above instead of asking for the
            # length (and thus the identifiers) a second time.
            selected = lst[index]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Compute the identifiers once; indexing ``self[i]`` for every
        # position would recompute them on each step.
        for identifier in self.ids_function():
            yield self.get_function(identifier)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

476

477

478class PageObject(DictionaryObject):

479 """

480 PageObject represents a single page within a PDF file.

481

482 Typically these objects will be created by accessing the

483 :attr:`pages<pypdf.PdfReader.pages>` property of the

484 :class:`PdfReader<pypdf.PdfReader>` class, but it is

485 also possible to create an empty page with the

486 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

487

488 Args:

489 pdf: PDF file the page belongs to.

490 indirect_reference: Stores the original indirect reference to

491 this object in its source PDF

492

493 """

494

495 original_page: "PageObject" # very local use in writer when appending

496

497 def __init__(

498 self,

499 pdf: Optional[PdfCommonDocProtocol] = None,

500 indirect_reference: Optional[IndirectObject] = None,

501 ) -> None:

502 DictionaryObject.__init__(self)

503 self.pdf = pdf

504 self.inline_images: Optional[dict[str, ImageFile]] = None

505 self.indirect_reference = indirect_reference

506 if not is_null_or_none(indirect_reference):

507 assert indirect_reference is not None, "mypy"

508 self.update(cast(DictionaryObject, indirect_reference.get_object()))

509 self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

510

511 def hash_bin(self) -> int:

512 """

513 Used to detect modified object.

514

515 Note: this function is overloaded to return the same results

516 as a DictionaryObject.

517

518 Returns:

519 Hash considering type and value.

520

521 """

522 return hash(

523 (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))

524 )

525

526 def hash_value_data(self) -> bytes:

527 data = super().hash_value_data()

528 data += f"{id(self)}".encode()

529 return data

530

531 @property

532 def user_unit(self) -> float:

533 """

534 A read-only positive number giving the size of user space units.

535

536 It is in multiples of 1/72 inch. Hence a value of 1 means a user

537 space unit is 1/72 inch, and a value of 3 means that a user

538 space unit is 3/72 inch.

539 """

540 return self.get(PG.USER_UNIT, 1)

541

542 @staticmethod

543 def create_blank_page(

544 pdf: Optional[PdfCommonDocProtocol] = None,

545 width: Union[float, Decimal, None] = None,

546 height: Union[float, Decimal, None] = None,

547 ) -> "PageObject":

548 """

549 Return a new blank page.

550

551 If ``width`` or ``height`` is ``None``, try to get the page size

552 from the last page of *pdf*.

553

554 Args:

555 pdf: PDF file the page is within.

556 width: The width of the new page expressed in default user

557 space units.

558 height: The height of the new page expressed in default user

559 space units.

560

561 Returns:

562 The new blank page

563

564 Raises:

565 PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains

566 no page

567

568 """

569 page = PageObject(pdf)

570

571 # Creates a new page (cf PDF Reference §7.7.3.3)

572 page.__setitem__(NameObject(PG.TYPE), NameObject("/Page"))

573 page.__setitem__(NameObject(PG.PARENT), NullObject())

574 page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject())

575 if width is None or height is None:

576 if pdf is not None and len(pdf.pages) > 0:

577 lastpage = pdf.pages[len(pdf.pages) - 1]

578 width = lastpage.mediabox.width

579 height = lastpage.mediabox.height

580 else:

581 raise PageSizeNotDefinedError

582 page.__setitem__(

583 NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore

584 )

585

586 return page

587

    def _get_ids_image(
        self,
        obj: Optional[DictionaryObject] = None,
        ancest: Optional[list[str]] = None,
        call_stack: Optional[list[Any]] = None,
    ) -> list[Union[str, list[str]]]:
        """
        Collect the identifiers of the images reachable from ``obj``.

        Args:
            obj: Object whose resources are scanned; defaults to the page itself.
            ancest: Names of the enclosing XObjects leading to ``obj``.
            call_stack: Indirect references already visited, used to stop on
                reference cycles.

        Returns:
            The identifiers found: a plain name when ``ancest`` is empty,
            otherwise the list ``[*ancest, name]``; followed by the keys of
            the inline images.

        """
        if call_stack is None:
            call_stack = []
        # Stop if this object (identified by its indirect reference) is
        # already being processed. When ``obj`` is None, ``_i`` is None.
        _i = getattr(obj, "indirect_reference", None)
        if _i in call_stack:
            return []
        call_stack.append(_i)
        # Inline images are loaded on first use and kept on the page.
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if obj is None:
            obj = self
        if ancest is None:
            ancest = []
        lst: list[Union[str, list[str]]] = []
        # Without XObject resources only the inline images can be reported.
        if (
            PG.RESOURCES not in obj or
            is_null_or_none(resources := obj[PG.RESOURCES]) or
            RES.XOBJECT not in cast(DictionaryObject, resources)
        ):
            return [] if self.inline_images is None else list(self.inline_images.keys())

        x_object = resources[RES.XOBJECT].get_object()  # type: ignore
        for o in x_object:
            # Only stream objects are considered.
            if not isinstance(x_object[o], StreamObject):
                continue
            if x_object[o][IA.SUBTYPE] == "/Image":
                lst.append(o if len(ancest) == 0 else [*ancest, o])
            else:  # is a form with possible images inside
                lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
        assert self.inline_images is not None
        # NOTE(review): this line also runs in the recursive calls made for
        # forms, so the inline keys are appended at every level - confirm
        # this is intended.
        lst.extend(list(self.inline_images.keys()))
        return lst

625

    def _get_image(
        self,
        id: Union[str, list[str], tuple[str]],
        obj: Optional[DictionaryObject] = None,
    ) -> ImageFile:
        """
        Return the image identified by ``id``.

        Args:
            id: A name, or a list/tuple of names forming a path through
                nested XObjects. Names that start and end with ``~`` are
                looked up among the inline images.
            obj: Object whose resources are searched; defaults to the page.

        Returns:
            The matching ``ImageFile``.

        Raises:
            KeyError: If the resources/XObject entries are missing and ``id``
                does not start and end with ``~``; lookups of unknown names
                also fail with the error of the underlying mapping.

        """
        if obj is None:
            obj = cast(DictionaryObject, self)
        # Normalise: tuples become lists, one-element lists become the element.
        if isinstance(id, tuple):
            id = list(id)
        if isinstance(id, list) and len(id) == 1:
            id = id[0]
        try:
            xobjs = cast(
                DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
            )
        except KeyError:
            # Missing XObjects are tolerated only for ``~...~`` identifiers;
            # in that case ``xobjs`` stays unbound and must not be used below.
            if not (id[0] == "~" and id[-1] == "~"):
                raise
        if isinstance(id, str):
            if id[0] == "~" and id[-1] == "~":
                # Inline image: load the inline images on first use.
                if self.inline_images is None:
                    self.inline_images = self._get_inline_images()
                if self.inline_images is None:  # pragma: no cover
                    raise KeyError("No inline image can be found")
                return self.inline_images[id]

            imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
            extension, byte_stream = imgd[:2]
            return ImageFile(
                # ``id[1:]`` drops the first character of the name.
                name=f"{id[1:]}{extension}",
                data=byte_stream,
                image=imgd[2],
                indirect_reference=xobjs[id].indirect_reference,
            )
        # in a subobject
        # Descend into the XObject named by the first element and look up the rest.
        ids = id[1:]
        return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

663

664 @property

665 def images(self) -> VirtualListImages:

666 """

667 Read-only property emulating a list of images on a page.

668

669 Get a list of all images on the page. The key can be:

670 - A string (for the top object)

671 - A tuple (for images within XObject forms)

672 - An integer

673

674 Examples:

675 * `reader.pages[0].images[0]` # return first image

676 * `reader.pages[0].images['/I0']` # return image '/I0'

677 * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form

678 * `for img in reader.pages[0].images:` # loops through all objects

679

680 images.keys() and images.items() can be used.

681

682 The ImageFile has the following properties:

683

684 * `.name` : name of the object

685 * `.data` : bytes of the object

686 * `.image` : PIL Image Object

687 * `.indirect_reference` : object reference

688

689 and the following methods:

690 `.replace(new_image: PIL.Image.Image, **kwargs)` :

691 replace the image in the pdf with the new image

692 applying the saving parameters indicated (such as quality)

693

694 Example usage:

695

696 reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

697

698 Inline images are extracted and named ~0~, ~1~, ..., with the

699 indirect_reference set to None.

700

701 """

702 return VirtualListImages(self._get_ids_image, self._get_image)

703

704 def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:

705 """Translate values used in inline image"""

706 try:

707 v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])

708 except (TypeError, KeyError):

709 if isinstance(v, NameObject):

710 # It is a custom name, thus we have to look in resources.

711 # The only applicable case is for ColorSpace.

712 try:

713 res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]

714 v = cast(DictionaryObject, res)[v]

715 except KeyError: # for res and v

716 raise PdfReadError(f"Cannot find resource entry {v} for {k}")

717 return v

718

719 def _get_inline_images(self) -> dict[str, ImageFile]:

720 """Load inline images. Entries will be identified as `~1~`."""

721 content = self.get_contents()

722 if is_null_or_none(content):

723 return {}

724 imgs_data = []

725 assert content is not None, "mypy"

726 for param, ope in content.operations:

727 if ope == b"INLINE IMAGE":

728 imgs_data.append(

729 {"settings": param["settings"], "__streamdata__": param["data"]}

730 )

731 elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover

732 raise PdfReadError(

733 f"{ope!r} operator met whereas not expected, "

734 "please share use case with pypdf dev team"

735 )

736 files = {}

737 for num, ii in enumerate(imgs_data):

738 init = {

739 "__streamdata__": ii["__streamdata__"],

740 "/Length": len(ii["__streamdata__"]),

741 }

742 for k, v in ii["settings"].items():

743 if k in {"/Length", "/L"}: # no length is expected

744 continue

745 if isinstance(v, list):

746 v = ArrayObject(

747 [self._translate_value_inline_image(k, x) for x in v]

748 )

749 else:

750 v = self._translate_value_inline_image(k, v)

751 k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])

752 if k not in init:

753 init[k] = v

754 ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)

755 extension, byte_stream, img = _xobj_to_image(ii["object"])

756 files[f"~{num}~"] = ImageFile(

757 name=f"~{num}~{extension}",

758 data=byte_stream,

759 image=img,

760 indirect_reference=None,

761 )

762 return files

763

764 @property

765 def rotation(self) -> int:

766 """

767 The visual rotation of the page.

768

769 This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are

770 valid values. This property does not affect ``/Contents``.

771 """

772 rotate_obj = self.get(PG.ROTATE, 0)

773 return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object()

774

775 @rotation.setter

776 def rotation(self, r: float) -> None:

777 self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360)

778

779 def transfer_rotation_to_content(self) -> None:

780 """

781 Apply the rotation of the page to the content and the media/crop/...

782 boxes.

783

784 It is recommended to apply this function before page merging.

785 """

786 r = -self.rotation # rotation to apply is in the otherway

787 self.rotation = 0

788 mb = RectangleObject(self.mediabox)

789 trsf = (

790 Transformation()

791 .translate(

792 -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2)

793 )

794 .rotate(r)

795 )

796 pt1 = trsf.apply_on(mb.lower_left)

797 pt2 = trsf.apply_on(mb.upper_right)

798 trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))

799 self.add_transformation(trsf, False)

800 for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:

801 if b in self:

802 rr = RectangleObject(self[b]) # type: ignore

803 pt1 = trsf.apply_on(rr.lower_left)

804 pt2 = trsf.apply_on(rr.upper_right)

805 self[NameObject(b)] = RectangleObject(

806 (

807 min(pt1[0], pt2[0]),

808 min(pt1[1], pt2[1]),

809 max(pt1[0], pt2[0]),

810 max(pt1[1], pt2[1]),

811 )

812 )

813

814 def rotate(self, angle: int) -> "PageObject":

815 """

816 Rotate a page clockwise by increments of 90 degrees.

817

818 Args:

819 angle: Angle to rotate the page. Must be an increment of 90 deg.

820

821 Returns:

822 The rotated PageObject

823

824 """

825 if angle % 90 != 0:

826 raise ValueError("Rotation angle must be a multiple of 90")

827 self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle)

828 return self

829

    def _merge_resources(
        self,
        res1: DictionaryObject,
        res2: DictionaryObject,
        resource: Any,
        new_res1: bool = True,
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """
        Merge the ``resource`` entries of ``res2`` into those of ``res1``.

        Args:
            res1: Resources receiving the entries.
            res2: Resources whose entries are added.
            resource: Key of the resource category to merge.
            new_res1: If True, work on a fresh dictionary initialised from
                ``res1[resource]``; otherwise ``res1[resource]`` itself is
                modified.

        Returns:
            A tuple ``(merged, renames)``: the merged dictionary with its
            entries re-inserted in sorted order, and a mapping from the keys
            of ``res2`` that had to be renamed to their new names.

        """
        # Detect whether the page belongs to a writer-like document, i.e. one
        # exposing ``_add_object``; without a reference there is none.
        try:
            assert isinstance(self.indirect_reference, IndirectObject)
            pdf = self.indirect_reference.pdf
            is_pdf_writer = hasattr(
                pdf, "_add_object"
            )  # expect isinstance(pdf, PdfWriter)
        except (AssertionError, AttributeError):
            pdf = None
            is_pdf_writer = False

        def compute_unique_key(base_key: str) -> tuple[str, bool]:
            """
            Find a key that either doesn't already exist or has the same value
            (indicated by the bool)

            Args:
                base_key: An index is added to this to get the computed key

            Returns:
                A tuple (computed key, bool) where the boolean indicates
                if there is a resource of the given computed_key with the same
                value.

            """
            # ``page2res`` and ``new_res`` are locals of the enclosing
            # function that are assigned before this helper is called.
            value = page2res.raw_get(base_key)
            # TODO: a possible improvement for writer, the indirect_reference
            # cannot be found because translated

            # try the current key first (e.g. "foo"), but otherwise iterate
            # through "foo-0", "foo-1", etc. new_res can contain only finitely
            # many keys, thus this'll eventually end, even if it's been crafted
            # to be maximally annoying.
            computed_key = base_key
            idx = 0
            while computed_key in new_res:
                if new_res.raw_get(computed_key) == value:
                    # there's already a resource of this name, with the exact
                    # same value
                    return computed_key, True
                computed_key = f"{base_key}-{idx}"
                idx += 1
            return computed_key, False

        if new_res1:
            new_res = DictionaryObject()
            new_res.update(res1.get(resource, DictionaryObject()).get_object())
        else:
            new_res = cast(DictionaryObject, res1[resource])
        page2res = cast(
            DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
        )
        rename_res = {}
        for key in page2res:
            unique_key, same_value = compute_unique_key(key)
            newname = NameObject(unique_key)
            if key != unique_key:
                # we have to use a different name for this
                rename_res[key] = newname

            if not same_value:
                if is_pdf_writer:
                    # Clone the entry into the target document and store its
                    # indirect reference when the clone has one.
                    new_res[newname] = page2res.raw_get(key).clone(pdf)
                    try:
                        new_res[newname] = new_res[newname].indirect_reference
                    except AttributeError:
                        pass
                else:
                    new_res[newname] = page2res.raw_get(key)
        # Re-insert all entries in sorted order.
        lst = sorted(new_res.items())
        new_res.clear()
        for el in lst:
            new_res[el[0]] = el[1]
        return new_res, rename_res

910

911 @staticmethod

912 def _content_stream_rename(

913 stream: ContentStream,

914 rename: dict[Any, Any],

915 pdf: Optional[PdfCommonDocProtocol],

916 ) -> ContentStream:

917 if not rename:

918 return stream

919 stream = ContentStream(stream, pdf)

920 for operands, _operator in stream.operations:

921 if isinstance(operands, list):

922 for i, op in enumerate(operands):

923 if isinstance(op, NameObject):

924 operands[i] = rename.get(op, op)

925 elif isinstance(operands, dict):

926 for i, op in operands.items():

927 if isinstance(op, NameObject):

928 operands[i] = rename.get(op, op)

929 else:

930 raise KeyError(f"Type of operands is {type(operands)}")

931 return stream

932

933 @staticmethod

934 def _add_transformation_matrix(

935 contents: Any,

936 pdf: Optional[PdfCommonDocProtocol],

937 ctm: CompressedTransformationMatrix,

938 ) -> ContentStream:

939 """Add transformation matrix at the beginning of the given contents stream."""

940 contents = ContentStream(contents, pdf)

941 contents.operations.insert(

942 0,

943 [

944 [FloatObject(x) for x in ctm],

945 b"cm",

946 ],

947 )

948 return contents

949

950 def _get_contents_as_bytes(self) -> Optional[bytes]:

951 """

952 Return the page contents as bytes.

953

954 Returns:

955 The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

956

957 """

958 if PG.CONTENTS in self:

959 obj = self[PG.CONTENTS].get_object()

960 if isinstance(obj, list):

961 return b"".join(x.get_object().get_data() for x in obj)

962 return cast(EncodedStreamObject, obj).get_data()

963 return None

964

965 def get_contents(self) -> Optional[ContentStream]:

966 """

967 Access the page contents.

968

969 Returns:

970 The ``/Contents`` object, or ``None`` if it does not exist.

971 ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

972

973 """

974 if PG.CONTENTS in self:

975 try:

976 pdf = cast(IndirectObject, self.indirect_reference).pdf

977 except AttributeError:

978 pdf = None

979 obj = self[PG.CONTENTS]

980 if is_null_or_none(obj):

981 return None

982 resolved_object = obj.get_object()

983 return ContentStream(resolved_object, pdf)

984 return None

985

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Depending on the state of the page, the new content is either stored
    directly under ``/Contents``, registered as a new object of the owning
    document, or written over the object currently referenced by
    ``/Contents``. Unless the method returns early, ``inline_images`` is
    reset to ``None`` afterwards.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): ``_objects`` is looked up on the page itself,
                # whereas the branches below use
                # ``self.indirect_reference.pdf._objects``. If the page has no
                # such attribute, the AttributeError is swallowed and nothing
                # is nullified. Confirm the intent before changing this:
                # streams shared with other pages would be affected.
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # Register every element with the owning document and keep the
        # values returned by ``_add_object`` in a new array.
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nothing to do if there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Overwrite the referenced object with a null object, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # The current /Contents value (if any) exposes no indirect reference:
        # register the new content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the slot of the existing /Contents object for the new content.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1044

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content stream of another page with the one of this page.

    Resource references (e.g. fonts) of both pages are kept. The mediabox,
    cropbox, etc. of this page are left untouched. By default the content
    of ``page2`` is appended after the content of this page, so it is
    drawn later, i.e. "on top" of this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    self._merge_page(page2, expand=expand, over=over)

1066

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    If the page's indirect reference points to a document exposing
    ``_add_object`` (used here to detect a PdfWriter), the work is delegated
    to ``_merge_page_writer``. Every other page (no indirect reference, or
    an owner without ``_add_object``) is merged in place below: resources
    and annotations of both pages are combined into new objects and the
    contents are replaced by one combined content stream.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix, only forwarded to ``_expand_mediabox``
            and ``_merge_page_writer``.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: If True, call ``_expand_mediabox`` to grow the mediabox.

    """
    # NOTE(review): an AssertionError/AttributeError raised inside
    # ``_merge_page_writer`` is caught here as well and leads to the
    # in-place merge below.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the box of page2 selected by
        # MERGE_CROP_BOX: rectangle path (re), clip (W), end path (n).
        rect = getattr(page2, MERGE_CROP_BOX)
        # The operands are a real list: a lazy ``map`` object would be
        # empty as soon as the operations are serialized a second time.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1180

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, which must have an indirect reference.

    Resources of ``page2`` are merged into the resources of this page,
    annotations of ``page2`` are cloned into the owning document (with
    ``ctm`` applied to ``/Rect`` and ``/QuadPoints``) and the contents are
    replaced by an array holding the content streams of both pages.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix applied to the annotation coordinates
            and forwarded to ``_expand_mediabox``; ``None`` means a default
            ``Transformation()``.
        over: Put the content of ``page2`` after (True) or before (False)
            the content of this page.
        expand: If True, call ``_expand_mediabox`` to grow the mediabox.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            # Clone into the owning document without the links back to the
            # source page; /P is set again further down.
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            # Re-normalize: after the transformation the first point is not
            # necessarily the lower-left one anymore.
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 x n numbers (n quadrilaterals):
                # transform every (x, y) pair, not only the first four.
                transformed: list[Any] = []
                for i in range(0, len(q) - 1, 2):
                    transformed.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the box of page2 selected by
        # MERGE_CROP_BOX: rectangle path (re), clip (W), end path (n).
        rect = getattr(page2, MERGE_CROP_BOX)
        # The operands are a real list: a lazy ``map`` object would be
        # empty as soon as the operations are serialized a second time.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1316

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow the mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: Page whose mediabox corners have to fit into this page.
        ctm: Optional matrix applied to the four corners of the mediabox
            of ``page2`` before the comparison; ``None`` leaves them as is.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    left = other_box.left.as_numeric()
    bottom = other_box.bottom.as_numeric()
    right = other_box.right.as_numeric()
    top = other_box.top.as_numeric()
    # lower-left, upper-left, upper-right, lower-right
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    if ctm is None:
        xs = tuple(x for x, _ in corners)
        ys = tuple(y for _, y in corners)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners)
        ys = tuple(matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners)

    # Bounding box of the (transformed) corners, merged with the own box.
    self.mediabox.lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    self.mediabox.upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

1359

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with a
    transformation matrix applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is accepted
            as well, in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    def transform_page2(page2_content: Any) -> ContentStream:
        # Puts the matrix in front of the content stream of page2.
        return PageObject._add_transformation_matrix(page2_content, page2.pdf, ctm)

    self._merge_page(page2, transform_page2, ctm, over, expand)

1391

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with the
    merged stream scaled through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1409

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with the
    merged stream rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1431

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, with the
    merged stream translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1455

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four corners after applying the matrix to them.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, matrix)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # Transform the four corners of the mediabox and use their bounding box.
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    values = tuple(float(value) for value in matrix)
    xs = [values[0] * x + values[2] * y + values[4] for x, y in corners]
    ys = [values[1] * x + values[3] * y + values[5] for x, y in corners]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1505

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the ``/Rect`` of the annotations, the ``/BBox`` of every viewport
    dictionary in ``/VP`` and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # NOTE(review): keep this order (mediabox last). Presumably boxes that
    # are not set fall back to other boxes, so scaling the mediabox first
    # could scale a fallback value twice - confirm before reordering.
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.mediabox = self.mediabox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        # Scaled in place: x values by sx, y values by sy.
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # /VP is an array of viewport dictionaries; a single dictionary is
        # tolerated as well. Every viewport is scaled, not only the first.
        entries = viewport if isinstance(viewport, ArrayObject) else [viewport]
        handled: set[int] = set()
        for entry in entries:
            entry = entry.get_object()
            if not isinstance(entry, DictionaryObject) or "/BBox" not in entry:
                continue
            # The same dictionary may be referenced more than once; scaling
            # it again would apply the factors twice.
            if id(entry) in handled:
                continue
            handled.add(id(entry))
            bbox = entry["/BBox"]
            entry[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )

1559

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly: the same factor is handed to ``scale`` for
    the horizontal and the vertical axis, which transforms the content and
    updates the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(factor, factor)

1570

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page to the given size. The factors handed to ``scale`` are
    the requested dimensions divided by the current mediabox dimensions.

    Args:
        width: The new width.
        height: The new height.

    """
    current = self.mediabox
    horizontal = width / float(current.width)
    vertical = height / float(current.height)
    self.scale(horizontal, vertical)

1584

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Handed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own indirect reference nor through a page owner
            exposing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    compressed = content.flate_encode(level)
    try:
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1607

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the position of this page in the ``pages``
    of the document its indirect reference points to.

    Returns:
        Page number; None if the page is not attached to a PDF or is not
        found among the pages of that document.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1624

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a debugging dump for text extraction.

    The dump lists every operation of ``/Contents`` (operator, the plain
    strings of a ``TJ`` operand and the repr of the operands), followed by
    a separator and, for each font of the page resources, its name, its
    repr and - when available - its ``/Encoding`` and ``/ToUnicode`` data.

    Returns:
        The dump as a single string.

    """
    parts: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        shown = (
            [item for item in operands[0] if isinstance(item, str)]
            if operator == b"TJ"
            else []
        )
        parts.append(
            operator.decode("utf-8") + " " + "".join(shown) + operands.__repr__() + "\n"
        )
    parts.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            parts.append(font_name + "\n")
            parts.append(
                self[PG.RESOURCES]["/Font"][font_name].__repr__() + "\n"  # type:ignore
            )
            try:
                encoding = self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                    "/Encoding"
                ].__repr__()
                parts.append(encoding + "\n")
            except Exception:
                pass
            try:
                to_unicode = (
                    self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                        "/ToUnicode"
                    ]
                    .get_data()
                    .decode()
                )
                parts.append(to_unicode + "\n")
            except Exception:
                pass
    except KeyError:
        # Raised when /Resources or /Font (or a font entry) is missing.
        parts.append("No Font\n")
    return "".join(parts)

1662

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by running its content stream operations
    through a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: object whose resources (own or found via ``/Parent``) and
            content are read.
        pdf: handed to ``ContentStream`` when the content has to be
            wrapped into one.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS`` (the ``/Contents`` entry)

    Returns:
        The extracted text; an empty string if no resources can be found
        or no content can be read.

    """
    extractor = TextExtraction()
    # One entry per font name, filled with the result of build_char_map.
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # Fonts for which no character map can be built are skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is processed as T* followed by Tj
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is processed as Tw, Tc, T* and Tj on the remaining operands
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold adds
                    # one space, unless the text is empty or already ends
                    # with a space.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is processed as TL with the negated second operand, then Td
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty: nothing to terminate.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Only XObjects that are not images are searched for text.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and memorize the current matrices,
                # whether or not the XObject could be processed.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output

1820

def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed for "layout" mode text extraction.

    The ``/Font`` resources of this page and of every ``/Parent`` above it
    are visited; fonts are only collected while ``self.pdf`` is not None.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def _resolved(value: Any) -> Any:
        # Indirect objects are resolved directly, arrays element by element.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [item.get_object() for item in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                font_dict = {
                    key: _resolved(value) for key, value in font_dict_obj.items()
                }
                # mypy really sucks at unpacking
                collected[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected

1856

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned for a page without (or with a null) ``/Contents``
        and for a page without any text show operation.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(
                fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )

    # /Contents is optional: a page without content has no text, which is
    # reported as "" (as plain mode does) instead of raising.
    if "/Contents" not in self:
        return ""
    contents = self["/Contents"].get_object()
    if is_null_or_none(contents):
        return ""

    ops = iter(ContentStream(contents, self.pdf, "bytes").operations)
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

1917

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Args:
        *args: optional positional values kept for backward compatibility.
            If the first value is a tuple or an int, the first two values
            are read as ``orientations`` and ``space_width``. If the first
            value is a str, the first two values are skipped and the third
            and fourth are read as ``orientations`` and ``space_width``.
            Ignored in "layout" mode.
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional value has an unsupported type.

    """
    if extraction_mode not in ["plain", "layout"]:
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Explicit name -> value mapping instead of a locals() lookup, so a
        # renamed parameter cannot silently break the warning loop.
        ignored_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor_name, visitor in ignored_visitors.items():
            if visitor:
                logger_warning(
                    f"Argument {visitor_name} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        if isinstance(args[0], str):
            # A leading str shifts the meaningful values to positions 2 and 3;
            # the first two positional values are not used.
            positional = args[2:4]
        elif isinstance(args[0], (tuple, int)):
            positional = args[0:2]
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

        if len(positional) >= 1:
            if not isinstance(positional[0], (tuple, int)):
                raise TypeError(f"Invalid positional parameter {positional[0]}")
            orientations = positional[0]
        if len(positional) >= 2:
            if not isinstance(positional[1], (float, int)):
                raise TypeError(f"Invalid positional parameter {positional[1]}")
            space_width = positional[1]

    if isinstance(orientations, int):
        # A single orientation is normalised to a one-element tuple.
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2053

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The call is forwarded to ``self._extract_text`` with ``xform`` as the
    object to read and ``None`` as the content key.

    Args:
        xform: the XObject stream to extract text from.
        orientations: orientations to look for. The default is
            (0, 90, 180, 270), the same set ``extract_text`` documents as
            supported; the previous default (0, 90, 270, 360) omitted 180
            and listed 360, which is not among the documented values.
        space_width: force default space width (if not extracted from font)
            (default 200)
        visitor_operand_before: forwarded unchanged to ``_extract_text``.
        visitor_operand_after: forwarded unchanged to ``_extract_text``.
        visitor_text: forwarded unchanged to ``_extract_text``.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2088

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the names of the fonts reachable from this object.

    The object returned by ``self.get_object()`` is handed to
    ``_get_fonts_walk`` together with two empty sets, which the walk fills.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts), where the
        unembedded set is every font found minus the embedded ones.

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2104

mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed.
"""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment.
"""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming.
"""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator.
"""

2136

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    Return the ``/Annots`` entry of the page.

    Returns:
        The value stored under ``/Annots``, or None when the page has no
        such entry.

    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])


@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new ``/Annots`` value, or None to remove the entry.
            Passing None when the page has no ``/Annots`` entry is a no-op
            (it previously raised ``KeyError``).

    """
    if value is None:
        # Only delete when the key exists, so clearing is idempotent.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value

2156

2157

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of a stored list.

    ``length_function`` reports the current number of items and
    ``get_function`` returns the item at a non-negative index; both are
    called on every access.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """Return one item, or for a slice a new virtual list over this one."""
        if isinstance(index, slice):
            selected = range(*index.indices(len(self)))
            # The new list maps its own positions back through ``selected``.
            return type(self)(selected.__len__, lambda idx: self[selected[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """Remove the item(s) at ``index`` from the ``/Kids`` arrays above them."""
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for page_number in sorted(range(*index.indices(len(self))), reverse=True):
                del self[page_number]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")

        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        nothing_removed_yet = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                position = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[position]
                nothing_removed_yet = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree:
                    # continue one level up and remove the emptied node.
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if nothing_removed_yet:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        # The length is read once, when iteration starts.
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return f"[{labels}]"

2250

2251

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The ``/DR``, ``/Resources``, ``/Annots`` and ``/AP`` entries of ``obj``
    are examined; XObjects, annotations and appearance entries are walked
    recursively. Both sets are updated in place.

    Args:
        obj: dictionary to inspect
        fnt: set collecting the names of all fonts found
        emb: set collecting the names of the embedded fonts found

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether ``font`` has a /FontDescriptor holding any font-file key."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast("DictionaryObject", font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast("DictionaryObject", f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = "/CharProcs" in f or has_font_file(f)
        if not embedded and "/DescendantFonts" in f:
            # Only the first descendant font is inspected.
            descendant = cast(
                "DictionaryObject",
                cast("ArrayObject", f["/DescendantFonts"])[0].get_object(),
            )
            embedded = has_font_file(descendant)

        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No /BaseFont: fall back to the subtype in parentheses.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast("DictionaryObject", obj["/DR"]):
        dr_fonts = cast(
            "DictionaryObject", cast("DictionaryObject", obj["/DR"])["/Font"]
        )
        # Iterate the font dictionaries (values); iterating the mapping
        # itself would yield only the resource names.
        for f in dr_fonts.values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast("DictionaryObject", obj["/Resources"])
        if "/Font" in resources:
            for f in cast("DictionaryObject", resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast("DictionaryObject", resources["/XObject"]).values():
                _get_fonts_walk(cast("DictionaryObject", x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast("ArrayObject", obj["/Annots"]):
            _get_fonts_walk(cast("DictionaryObject", a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast("DictionaryObject", cast("DictionaryObject", obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # /N is treated as a mapping of entries to walk. Walk the values
            # (not the keys) and skip anything that is not a dictionary so
            # scalar entries cannot break the membership tests above.
            for entry in normal.values():
                target = entry.get_object()
                if isinstance(target, dict):
                    _get_fonts_walk(cast("DictionaryObject", target), fnt, emb)
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

913 statements