Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from dataclasses import dataclass

33from decimal import Decimal

34from io import BytesIO

35from pathlib import Path

36from typing import (

37 Any,

38 Callable,

39 Literal,

40 Optional,

41 Union,

42 cast,

43 overload,

44)

46from ._cmap import (

47 build_char_map,

48)

49from ._protocols import PdfCommonDocProtocol

50from ._text_extraction import (

51 _layout_mode,

52)

53from ._text_extraction._text_extractor import TextExtraction

54from ._utils import (

55 CompressedTransformationMatrix,

56 TransformationMatrixType,

57 _human_readable_bytes,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING

62from .constants import AnnotationDictionaryAttributes as ADA

63from .constants import ImageAttributes as IA

64from .constants import PageAttributes as PG

65from .constants import Resources as RES

66from .errors import PageSizeNotDefinedError, PdfReadError

67from .generic import (

68 ArrayObject,

69 ContentStream,

70 DictionaryObject,

71 EncodedStreamObject,

72 FloatObject,

73 IndirectObject,

74 NameObject,

75 NullObject,

76 NumberObject,

77 PdfObject,

78 RectangleObject,

79 StreamObject,

80 is_null_or_none,

81)

# Pillow is an optional dependency. Record whether the import worked so that
# image-related code can raise a helpful error only when it is actually used.
try:
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Placeholder so that annotations and isinstance checks referring to
    # ``Image`` still resolve when Pillow is missing.
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images

# Name of the box used when merging pages.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back on ``defaults``.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the requested box.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The entry as a ``RectangleObject``. Unless the stored entry already
        was a ``RectangleObject``, the converted value is written back
        under ``name`` before being returned.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Already in its final form: nothing to convert or store.
        return box
    if is_null_or_none(box):
        # Take the first fallback key that yields something.
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

108

109

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """
    Store ``value`` in ``self`` under the key ``NameObject(name)``.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the box to set.
        value: The value to store.

    """
    key = NameObject(name)
    self[key] = value

112

113

114def _delete_rectangle(self: Any, name: str) -> None:

115 del self[name]

116

117

118def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

119 return property(

120 lambda self: _get_rectangle(self, name, fallback),

121 lambda self, value: _set_rectangle(self, name, value),

122 lambda self: _delete_rectangle(self, name),

123 )

124

125

class Transformation:
    """
    A 2D transformation between two coordinate systems.

    The transformation is described by a 3-by-3 matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, so PDF usually writes it as the
    six-element array [ a b c d e f ]; this is what ``ctm`` holds.

    Points are transformed by a matrix multiplication::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default value is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to a tuple of three rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Build the "cm" operation string: six values with four decimals each.
        values = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return values + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the two offset components (e, f) change.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin typically is the lower-left corner of the page. That can
        be changed by translating the contents / the page boxes.

        When only one factor is given, it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this transformation.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'), as a list if ``pt`` was a list,
            as a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        ctm = self.ctm
        new_x = convert(x * ctm[0] + y * ctm[2] + ctm[4])
        new_y = convert(x * ctm[1] + y * ctm[3] + ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

320

321

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow write a one-page PDF in memory, then take the image
        # object out of it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Put the new image object into the slot of the old one, so that
        # existing references keep pointing at it.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Replace the old extension, if any. ``rpartition`` leaves a name
        # without a dot untouched, whereas slicing with ``rfind`` (which
        # returns -1 in that case) would drop its last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

414

415

class VirtualListImages(Sequence[ImageFile]):
    """
    Sequence-like view on the images referenced within a page.

    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The identifiers are obtained from ``ids_function`` on every access and
    the images are built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        identifiers = self.ids_function()
        return len(identifiers)

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        pairs = []
        for identifier in self.ids_function():
            pairs.append((identifier, self[identifier]))
        return pairs

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        identifiers = self.ids_function()
        if isinstance(index, slice):
            # A slice yields a new virtual list restricted to the selected ids.
            positions = range(*index.indices(len(self)))
            selected = [identifiers[position] for position in positions]
            return type(self)((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            # Access by identifier: delegate directly.
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        count = len(identifiers)
        # support negative indexes
        position = index + count if index < 0 else index
        if position < 0 or position >= count:
            raise IndexError("Sequence index out of range")
        return self.get_function(identifiers[position])

    def __iter__(self) -> Iterator[ImageFile]:
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        labels = ", ".join(
            f"Image_{number}={identifier}"
            for number, identifier in enumerate(self.ids_function())
        )
        return f"[{labels}]"

477

478

479class PageObject(DictionaryObject):

480 """

481 PageObject represents a single page within a PDF file.

482

483 Typically these objects will be created by accessing the

484 :attr:`pages<pypdf.PdfReader.pages>` property of the

485 :class:`PdfReader<pypdf.PdfReader>` class, but it is

486 also possible to create an empty page with the

487 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

488

489 Args:

490 pdf: PDF file the page belongs to.

491 indirect_reference: Stores the original indirect reference to

492 this object in its source PDF

493

494 """

495

496 original_page: "PageObject" # very local use in writer when appending

497

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the entries of the referenced object if any.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to this object in its source PDF.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled lazily the first time the images of the page are requested.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source_object = not is_null_or_none(indirect_reference)
    if has_source_object:
        assert indirect_reference is not None, "mypy"
        source_object = indirect_reference.get_object()
        self.update(cast(DictionaryObject, source_object))
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

511

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: ``DictionaryObject`` is used as the type marker instead of the
    page class, so that the result is the same as for a DictionaryObject
    with the same entries.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))

526

def hash_value_data(self) -> bytes:
    """
    Return the parent's hash data followed by the encoded ``id()`` of this page.

    Returns:
        The combined data.

    """
    data = super().hash_value_data()
    identity = str(id(self)).encode()
    data += identity
    return data

531

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is 1/72 inch: a value of 1 means that a user space unit is
    1/72 inch, a value of 3 means that it is 3/72 inch.

    When the page has no such entry, 1 is returned.
    """
    default_user_unit = 1
    return self.get(PG.USER_UNIT, default_user_unit)

542

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the
    mediabox of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        # The size has to come from an existing page.
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

588

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects found directly in the resources of the page are
    identified by their name. Images found within form XObjects are
    identified by the list of names leading to them. The identifiers of
    the inline images of the page are added at the end.

    Args:
        obj: The object whose resources are inspected; the page itself
            if ``None``.
        ancest: Names of the form XObjects traversed to reach ``obj``.
        call_stack: Indirect references of the objects already visited,
            used to stop on circular references.

    Returns:
        The list of identifiers.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: stop here to avoid endless recursion.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # The inline images belong to the page, not to the form XObjects
    # being traversed. Only the outermost call (no ancestors) reports
    # them; otherwise each nested form would list them again.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

626

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image designated by ``id``.

    Args:
        id: Name of an image XObject, an inline image identifier
            (starting and ending with ``~``), or a list/tuple of names
            leading to an image through nested XObjects.
        obj: The object whose resources are searched; the page itself
            if ``None``.

    Returns:
        The matching ``ImageFile``.

    Raises:
        KeyError: If the resources hold no XObject entry and ``id`` does
            not look like an inline image identifier.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path of length one is the same as the plain name.
        id = id[0]
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObjects are acceptable only for inline image ids,
        # which are answered below without using ``xobjs``.
        if id[0] != "~" or id[-1] != "~":
            raise

    if not isinstance(id, str):
        # in a subobject: descend into the first name of the path
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415

    image_object = xobjs[id]
    extension, byte_stream, img = _xobj_to_image(cast(DictionaryObject, image_object))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=img,
        indirect_reference=image_object.indirect_reference,
    )

665

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of the images on the page.

    An image can be accessed with:
        - A string (for the top object)
        - A tuple (for images within XObject forms)
        - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    Each returned ImageFile has the following properties:

        * `.name` : name of the object
        * `.data` : bytes of the object
        * `.image` : PIL Image Object
        * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    fetch_image = self._get_image
    return VirtualListImages(list_ids, fetch_image)

705

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Args:
        k: The key the value belongs to (only used in the error message).
        v: The value to translate.

    Returns:
        The mapped name if ``v`` is in the inline image value mapping,
        else the matching ``/ColorSpace`` resource of the page if ``v``
        is a name, else ``v`` unchanged.

    Raises:
        PdfReadError: If ``v`` is a name without a matching resource.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resource lookup and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

720

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page contents.

    Returns:
        The images keyed by ``~0~``, ``~1~``, ... in order of appearance;
        an empty dictionary if the page has no contents.

    Raises:
        PdfReadError: If a stray ``BI``, ``EI`` or ``ID`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect the settings and data of every inline image.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for raw_key, raw_value in settings.items():
            if raw_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(raw_key, item) for item in raw_value]
                )
            else:
                value = self._translate_value_inline_image(raw_key, raw_value)
            key = NameObject(_INLINE_IMAGE_KEY_MAPPING[raw_key])
            # The first value seen for a key wins.
            init.setdefault(key, value)
        stream_object = EncodedStreamObject.initialize_from_dictionary(init)
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream_object)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

766

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    When setting it, the value is rounded to the nearest multiple of 90
    and reduced modulo 360.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Adding 45 before the floor division rounds to the nearest quarter turn.
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

781

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes, and reset the rotation of the page to 0.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box...
    centre_x = float(media_box.left + media_box.width / 2)
    centre_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ...then shift the result so that its lower-left corner is at the origin.
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    # Transform every box present on the page the same way.
    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        left = min(corner_a[0], corner_b[0])
        bottom = min(corner_a[1], corner_b[1])
        right = max(corner_a[0], corner_b[0])
        top = max(corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject((left, bottom, right, top))

816

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90 != 0:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

832

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries to add.
        resource: Key of the sub-dictionary to merge.
        new_res1: If True, the result is a new dictionary initialised with
            the entries of ``res1[resource]``; otherwise
            ``res1[resource]`` itself is completed.

    Returns:
        A tuple ``(merged, renamed)``: the merged dictionary, with its
        entries re-inserted in sorted order, and a mapping from each key of
        ``res2`` that had to be renamed to its new name.

    """
    # Find out whether the page belongs to a writer: only then are the
    # added entries cloned into it.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        # NOTE: ``page2res`` and ``new_res`` are assigned further down in
        # the enclosing function, before this helper is first called.
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                # Store the reference of the clone when it has one.
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
        # Re-insert all the entries in sorted order.
        lst = sorted(new_res.items())
        new_res.clear()
        for el in lst:
            new_res[el[0]] = el[1]
    return new_res, rename_res

913

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands in a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names.
        pdf: Passed to the ``ContentStream`` created for the result.

    Returns:
        ``stream`` itself if ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dict.

    """
    if not rename:
        return stream
    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        # Both containers are handled as (position or key, operand) pairs.
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return renamed_stream

935

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add a transformation matrix at the beginning of a contents stream.

    Args:
        contents: The contents to wrap in a ``ContentStream``.
        pdf: Passed to the ``ContentStream`` created for the result.
        ctm: The values of the matrix.

    Returns:
        The new ``ContentStream``, starting with a ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    matrix_values = [FloatObject(value) for value in ctm]
    operations.insert(0, [matrix_values, b"cm"])
    return stream

952

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    When ``/Contents`` is a list, the data of all its items is joined.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    contents = self[PG.CONTENTS].get_object()
    if isinstance(contents, list):
        parts = (part.get_object().get_data() for part in contents)
        return b"".join(parts)
    return cast(EncodedStreamObject, contents).get_data()

967

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object wrapped in a ``ContentStream``, or ``None``
        if it does not exist or is null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # A page without a usable indirect reference has no owning document.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None
    contents = self[PG.CONTENTS]
    if is_null_or_none(contents):
        return None
    return ContentStream(contents.get_object(), pdf)

988

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.

    """
    if getattr(self, "indirect_reference", None) is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    current = self.get(PG.CONTENTS, None)
    if isinstance(current, ArrayObject):
        for old_stream in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): ``_objects`` is looked up on the page itself,
                # while the other branches use ``self.indirect_reference.pdf._objects``.
                # If the page has no such attribute, the handler below turns
                # this loop into a no-op -- confirm which one is intended.
                self._objects[old_stream.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # register every element of the new array with the owning document
        content = ArrayObject(
            self.indirect_reference.pdf._add_object(item) for item in content
        )

    if is_null_or_none(content):
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(current, "indirect_reference"):
        try:
            new_reference = self.indirect_reference.pdf._add_object(content)
            self[NameObject(PG.CONTENTS)] = new_reference
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # reuse the reference of the existing /Contents entry
        # TODO: in the future may require generation management
        content.indirect_reference = self[PG.CONTENTS].indirect_reference
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1047

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered.
    By default the content stream of ``page2`` is added to the end of this
    page's content stream, so it is drawn after, or "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # plain merge: no transformation callback and no matrix are supplied
    self._merge_page(page2, expand=expand, over=over)

1069

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    A page whose document exposes ``_add_object`` (used here to detect a
    PdfWriter) is delegated to ``_merge_page_writer``. Every other page --
    not attached, or attached to a document without ``_add_object`` -- is
    merged by the generic code of this method.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix, forwarded to
            ``_expand_mediabox`` when ``expand`` is set.
        over: Put the ``page2`` content after this page's content if True,
            else before it.
        expand: If True, expand the mediabox of this page so that it also
            covers ``page2``.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
        # No early return here: a page attached to a document that is not a
        # writer must still be merged by the generic code below. Returning at
        # this point silently turned the whole merge into a no-op.
    except (AssertionError, AttributeError):
        pass

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()

    # Collect the annotation arrays of both pages (only direct ArrayObjects).
    new_annots = ArrayObject()
    for page in (self, page2):
        if PG.ANNOTS not in page:
            continue
        annots = page[PG.ANNOTS]
        if isinstance(annots, ArrayObject):
            new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    own_proc_set = original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
    other_proc_set = page2resources.get(RES.PROC_SET, ArrayObject()).get_object()
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(set(own_proc_set).union(set(other_proc_set)))
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Start the merged content with ``re`` / ``W`` / ``n`` operations
        # built from the MERGE_CROP_BOX rectangle of page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1183

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, which must have an indirect reference.

    Resources of ``page2`` are merged into this page's resource dictionary
    in place, annotations of ``page2`` are cloned into this page's document
    with their geometry transformed by ``ctm``, and the content streams are
    combined into an array handed to ``replace_contents``.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix applied to the ``/Rect`` and
            ``/QuadPoints`` of the cloned annotations, and forwarded to
            ``_expand_mediabox`` when ``expand`` is set.
        over: Put the ``page2`` content after this page's content if True,
            else before it.
        expand: If True, expand the mediabox of this page so that it also
            covers ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        trsf = Transformation() if ctm is None else Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners, then re-normalise so that the
            # rectangle is stored as (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints may describe several quadrilaterals (8 numbers
                # each). Transform every (x, y) pair rather than only the
                # first four points, so no quadrilateral is dropped.
                quad_points = []
                for i in range(0, len(q) - 1, 2):
                    quad_points.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = ArrayObject(quad_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Start the merged content with ``re`` / ``W`` / ``n`` operations
        # built from the MERGE_CROP_BOX rectangle of page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1319

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers ``page2``'s mediabox.

    Args:
        page2: Page whose mediabox corners must fit into this page.
        ctm: Optional matrix applied to the four corners of ``page2``'s
            mediabox before the bounding box is computed.

    """
    own_left = self.mediabox.left.as_numeric()
    own_bottom = self.mediabox.bottom.as_numeric()
    own_right = self.mediabox.right.as_numeric()
    own_top = self.mediabox.top.as_numeric()

    left2 = page2.mediabox.left.as_numeric()
    bottom2 = page2.mediabox.bottom.as_numeric()
    right2 = page2.mediabox.right.as_numeric()
    top2 = page2.mediabox.top.as_numeric()
    # the four corners of page2's mediabox as (x, y) pairs
    corner_points = (
        (left2, bottom2),
        (left2, top2),
        (right2, top2),
        (right2, bottom2),
    )

    if ctm is None:
        new_x = tuple(x for x, _ in corner_points)
        new_y = tuple(y for _, y in corner_points)
    else:
        matrix = tuple(float(value) for value in ctm)
        new_x = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points
        )
        new_y = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points
        )

    # union of this page's box with the bounding box of the corners
    self.mediabox.lower_left = (
        min(own_left, min(new_x)),
        min(own_bottom, min(new_y)),
    )
    self.mediabox.upper_right = (
        max(own_right, max(new_x)),
        max(own_top, max(new_y)),
    )

1362

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is also
            accepted, in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content: Any) -> ContentStream:
        # put the matrix in front of page2's content stream
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1394

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1412

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1434

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1458

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, ctm)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corner_points = [(left, bottom), (left, top), (right, top), (right, bottom)]

    matrix = tuple(float(value) for value in ctm)
    new_x = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points]
    new_y = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(new_x), min(new_y))
    self.mediabox.upper_right = (max(new_x), max(new_y))

1508

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the rectangles of the page annotations, the ``/BBox`` of the viewport
    entry, and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # the boxes are rescaled one after the other, in this fixed order
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                # x values are scaled by sx, y values by sy
                for index, factor in enumerate((sx, sy, sx, sy)):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        # for an array, only its first entry is read and updated
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if viewport_is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1562

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1573

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested dimensions divided by the current
    mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1587

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed on to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can be stored neither through
            the content's own reference nor through a document that
            provides ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    compressed = content.flate_encode(level)
    try:
        # overwrite the object behind the content's own reference
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        if self.indirect_reference is None or not hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1610

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page has no indirect reference or is not found among those pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # the document does not list this page
        return None

1627

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    Returns:
        One line per content stream operation, a separator line, then the
        name and representation of every font in the page resources (plus
        its ``/Encoding`` and ``/ToUnicode`` data when they can be read),
        or ``No Font`` when a lookup raises ``KeyError``.

    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # for TJ, also show the plain strings of the first operand
        strings = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(op.decode("utf-8") + " " + "".join(strings) + repr(ope) + "\n")
    pieces.append("\n=============================\n")
    try:
        fonts = self[PG.RESOURCES]["/Font"]  # type:ignore
        for fo in fonts:
            pieces.append(fo + "\n")
            font = fonts[fo]
            pieces.append(repr(font) + "\n")
            try:
                pieces.append(repr(font["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                pieces.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1665

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    See extract_text for most arguments.

    Args:
        obj: Object whose resources and content are processed.
        pdf: Document passed to the ContentStream built from the content.
        content_key: key of ``obj`` under which the content is looked up.
            None = the object itself; this allows reusing the function on
            an XObject. Defaults to ``PG.CONTENTS``.

    Returns:
        The extracted text; an empty string if no resources or no content
        can be obtained.

    """
    extractor = TextExtraction()
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    def report_text(chunk: str) -> None:
        # hand a piece of text to the optional text visitor
        if visitor_text is not None:
            visitor_text(
                chunk,
                extractor.memo_cm,
                extractor.memo_tm,
                extractor.cmap[3],
                extractor.font_size,
            )

    try:
        holder = obj
        while NameObject(PG.RESOURCES) not in holder:
            # /Resources can be inherited so we look to parents
            holder = holder["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, holder[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict:
        font = resources_dict["/Font"]
        if font:
            for font_name in cast(DictionaryObject, font):
                try:
                    cmaps[font_name] = build_char_map(font_name, space_width, obj)
                except TypeError:
                    pass

    try:
        if isinstance(content_key, str):
            content = obj[content_key].get_object()
        else:
            content = obj
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            space_threshold = extractor._space_width * 0.95
            if operands:
                for item in operands[0]:
                    if isinstance(item, (str, bytes)):
                        extractor.process_operation(b"Tj", [item])
                    # a large enough number adds a space, unless one was just written
                    if isinstance(item, (int, float, NumberObject, FloatObject)) and (
                        abs(float(item)) >= space_threshold
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # flush the pending text before looking at the XObject
            extractor.output += extractor.text
            report_text(extractor.text)
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    report_text("\n")
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    report_text(text)
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)

    extractor.output += extractor.text  # just in case
    if extractor.text != "":
        report_text(extractor.text)
    return extractor.output

1823

def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each of its ``/Parent`` objects are inspected in turn
    for a ``/Font`` entry in their resources.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources_dict: Any = node[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            for font_name in resources_dict["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                # resolve indirect values, including those inside arrays
                font_dict = {}
                for key, value in font_dict_obj.items():
                    if isinstance(value, IndirectObject):
                        font_dict[key] = value.get_object()
                    elif isinstance(value, ArrayObject):
                        font_dict[key] = [item.get_object() for item in value]
                    else:
                        font_dict[key] = value
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # no parent left: stop the walk
            node = None

    return fonts

1859

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(
            fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        (debug_path / "fonts.json").write_text(fonts_dump, "utf-8")

    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(stream.operations), fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        # nothing was returned for this page
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1920

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by walking its text drawing commands in
    content stream order.

    Quality depends heavily on the tool that generated the PDF file. The
    order of the returned text is not a stable contract: it may change
    whenever the extraction logic is refined.

    Arabic and Hebrew are extracted in the correct order. A custom range of
    RTL characters can be configured; see function set_custom_rtl.

    Visitor callbacks can be supplied to observe every operation and every
    piece of extracted text, which can help e.g. when parsing tables.

    Positional arguments are accepted for backwards compatibility only:

    - ``(orientations[, space_width])`` when the first positional value is
      a tuple or an int;
    - when the first positional value is a string, the first two values
      are ignored and the third and fourth (if present) are used as
      ``orientations`` and ``space_width``.

    Args:
        orientations: orientations to look for, either a single int or a
            tuple of ints. Default: (0, 90, 180, 270).
            Only 0 (up), 90 (turned left), 180 (upside down) and
            270 (turned right) are currently meaningful.
            Silently ignored in "layout" mode.
        space_width: default space width to assume when it cannot be
            taken from the font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: called before an operation is processed,
            with four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: called after an operation is processed,
            with the same four arguments as ``visitor_operand_before``.
            Ignored with a warning in "layout" mode.
        visitor_text: called when text is extracted at some position, with
            five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None for unknown fonts; otherwise
            it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" selects the
            legacy extraction, "layout" the experimental layout mode.
            NOTE: orientations, space_width, and visitor_* parameters are
            NOT respected in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred
            from y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when
            calculating weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support
            rotated text. Set to False to include rotated text anyway; the
            layout is then degraded and a warning results. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a
            directory, in which these debug files are created:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height
            when calculating blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unsupported type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors cannot be honoured in layout mode; tell the caller.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor, callback in supplied_visitors:
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        # A leading string means two ignored legacy positionals come first,
        # so orientations/space_width sit two slots further right.
        if isinstance(leading, str):
            shift = 2
        elif isinstance(leading, (tuple, int)):
            shift = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > shift:
            candidate = args[shift]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > shift + 1:
            candidate = args[shift + 1]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    # A single orientation is normalised to a one-element tuple.
    wanted = (orientations,) if isinstance(orientations, int) else orientations

    return self._extract_text(
        self,
        self.pdf,
        wanted,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2056

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    Thin wrapper that forwards everything to ``self._extract_text``, using
    ``xform`` as the object to read and ``None`` in the content-key slot.

    Args:
        xform: the XObject to extract the text from.
        orientations: orientations to look for.
            Default: (0, 90, 180, 270), the same set used by
            ``extract_text`` (0 up, 90 turned left, 180 upside down,
            270 turned right).
        space_width: force default space width
            if not extracted from font (default: 200)
        visitor_operand_before: function to be called before processing an
            operation. It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
        visitor_operand_after: function to be called after processing an
            operation. It has the same four arguments as
            ``visitor_operand_before``.
        visitor_text: function to be called when extracting some text at
            some position. It has five arguments: text, current
            transformation matrix, text matrix, font-dictionary and
            font-size.

    Returns:
        The extracted text

    """
    # NOTE: the default used to be (0, 90, 270, 360), which dropped
    # upside-down (180) text and listed 360, a value that is not one of the
    # four orientations documented for extract_text.
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2091

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    The page dictionary is walked with ``_get_fonts_walk``; every font name
    that was found but not reported as embedded counts as unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, all_fonts - embedded

2107

2108 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())

2109 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in

2110 default user space units, defining the boundaries of the physical medium on

2111 which the page is intended to be displayed or printed."""

2112

2113 cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))

2114 """

2115 A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in

2116 default user space units, defining the visible region of default user

2117 space.

2118

2119 When the page is displayed or printed, its contents are to be clipped

2120 (cropped) to this rectangle and then imposed on the output medium in some

2121 implementation-defined manner. Default value: same as

2122 :attr:`mediabox<mediabox>`.

2123 """

2124

2125 bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))

2126 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in

2127 default user space units, defining the region to which the contents of the

2128 page should be clipped when output in a production environment."""

2129

2130 trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))

2131 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in

2132 default user space units, defining the intended dimensions of the finished

2133 page after trimming."""

2134

2135 artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))

2136 """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in

2137 default user space units, defining the extent of the page's meaningful

2138 content as intended by the page's creator."""

2139

@property
def annotations(self) -> Optional[ArrayObject]:
    """The page's "/Annots" array, or None when the page has no such entry."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Assigning None removes the "/Annots" entry (a no-op if it is absent);
    any other value replaces the entry.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2161

2162

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages whose length and items are produced on demand.

    Nothing is stored: ``length_function`` is asked for the current length
    and ``get_function`` for the page at a (non-negative) index each time
    they are needed.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """Return one page, or for a slice a new lazy list over this one."""
        if isinstance(index, slice):
            selected = range(*index.indices(len(self)))
            # The slice is itself virtual: it maps its own positions back
            # onto positions of this list.
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        total = len(self)
        # support negative indexes
        position = index + total if index < 0 else index
        if position < 0 or position >= total:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """Remove one page (or a slice of pages) from the page tree."""
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        total = len(self)
        if index < 0:
            # support negative indexes
            index += total
        if index < 0 or index >= total:
            raise IndexError("Index out of range")
        ref = self[index].indirect_reference
        assert ref is not None
        node: Optional[PdfObject] = cast(DictionaryObject, ref.get_object()).get(
            "/Parent", None
        )
        removed_any = False
        # Walk upwards: drop the reference from its parent's kids and, when
        # a node ends up without kids, continue by removing that node from
        # its own parent.
        while node is not None:
            node = cast(DictionaryObject, node.get_object())
            try:
                kids = cast(ArrayObject, node["/Kids"])
                del kids[kids.index(ref)]
                removed_any = True
                try:
                    assert ref is not None
                    del ref.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in node:
                    node[NameObject("/Count")] = NumberObject(
                        cast(int, node["/Count"]) - 1
                    )
                if len(cast(ArrayObject, node["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ref = node.indirect_reference
                node = node.get("/Parent", None)
            except ValueError:  # from index
                if not removed_any:
                    raise PdfReadError(f"Page not found in page tree: {ref}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        # The length is read once, when iteration starts.
        yield from map(self.__getitem__, range(len(self)))

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({number})" for number in range(self.length_function())
        )
        return f"[{labels}]"

2255

2256

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk looks at these entries of ``obj`` and recurses where needed:

    - "/DR" -> "/Font": every font dictionary is processed;
    - "/Resources" -> "/Font": every font dictionary is processed;
    - "/Resources" -> "/XObject": every XObject is walked recursively;
    - "/Annots": every annotation is walked recursively;
    - "/AP" -> "/N": walked directly when its "/Type" is "/XObject",
      otherwise each dictionary value it holds is walked.

    Args:
        obj: dictionary to inspect (a page, XObject or annotation
            dictionary, judging by the keys looked up here)
        fnt: set collecting the names of all fonts used; updated in place
        emb: set collecting the names of embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has '/CharProcs', or when its
    '/FontDescriptor' (or the '/FontDescriptor' of its first descendant font)
    contains a 'FontFilex' key (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the font's descriptor references an embedded font file."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = "/CharProcs" in f or has_font_file(f)
        if not embedded and "/DescendantFonts" in f:
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            embedded = has_font_file(descendant)

        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # fonts without a base name are recorded by their subtype
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate the font dictionaries (values), not the resource names
        # (keys): a name string never contains the looked-up entries, so
        # these fonts used to be skipped silently.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" holds further entries: walk each value that is a
            # dictionary. Iterating the keys (as before) handed plain name
            # strings to the walk, so nothing was ever found here.
            for state in normal.values():
                state = state.get_object()
                if isinstance(state, DictionaryObject):
                    _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

917 statements