Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import math

31from collections.abc import Iterable, Iterator, Sequence

32from copy import deepcopy

33from dataclasses import asdict, dataclass

34from decimal import Decimal

35from io import BytesIO

36from pathlib import Path

37from typing import (

38 Any,

39 Callable,

40 Literal,

41 Optional,

42 Union,

43 cast,

44 overload,

45)

47from ._font import Font

48from ._protocols import PdfCommonDocProtocol

49from ._text_extraction import (

50 _layout_mode,

51)

52from ._text_extraction._text_extractor import TextExtraction

53from ._utils import (

54 CompressedTransformationMatrix,

55 TransformationMatrixType,

56 _human_readable_bytes,

57 deprecate,

58 logger_warning,

59 matrix_multiply,

60)

61from .constants import (

62 _INLINE_IMAGE_KEY_MAPPING,

63 _INLINE_IMAGE_VALUE_MAPPING,

64 AnnotationDictionaryAttributes,

65 ImageAttributes,

66)

67from .constants import PageAttributes as PG

68from .constants import Resources as RES

69from .errors import PageSizeNotDefinedError, PdfReadError

70from .generic import (

71 ArrayObject,

72 ContentStream,

73 DictionaryObject,

74 EncodedStreamObject,

75 FloatObject,

76 IndirectObject,

77 NameObject,

78 NullObject,

79 NumberObject,

80 PdfObject,

81 RectangleObject,

82 StreamObject,

83 is_null_or_none,

84)

86try:

87 from PIL.Image import Image

89 pil_not_imported = False

90except ImportError:

91 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10

92 pil_not_imported = True # error will be raised only when using images

94MERGE_CROP_BOX = "cropbox" # pypdf <= 3.4.0 used "trimbox"

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the box stored under ``name`` as a ``RectangleObject``.

    If the entry is missing or null, the keys in ``defaults`` are tried in
    order and the first usable one is taken. The resulting rectangle is
    written back under ``name`` before being returned.

    Args:
        self: Dictionary-like object holding the boxes; ``self.pdf`` is used
            to resolve an indirect reference.
        name: Key of the requested box.
        defaults: Keys to fall back to when ``name`` is missing or null.

    Returns:
        The rectangle found for ``name`` (or for the first usable fallback).

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for fallback in defaults:
            retval = self.get(fallback)
            # Apply the same null test as for the primary entry, so that a
            # fallback holding a PDF null is skipped like a missing one.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Over-long arrays are tolerated: warn and keep the first four values.
        logger_warning(
            "Expected four values, got %(length)d: %(retval)s",
            source=__name__,
            length=length,
            retval=retval,
        )
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore[arg-type]
    _set_rectangle(self, name, retval)
    return retval

120

121

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` on ``self`` under ``name``, wrapped as a ``NameObject`` key."""
    key = NameObject(name)
    self[key] = value

124

125

126def _delete_rectangle(self: Any, name: str) -> None:

127 del self[name]

128

129

130def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:

131 return property(

132 lambda self: _get_rectangle(self, name, fallback),

133 lambda self, value: _set_rectangle(self, name, value),

134 lambda self: _delete_rectangle(self, name),

135 )

136

137

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The six-element form (a, b, c, d, e, f); the default is the identity.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        # Only the first two columns carry information; the third is dropped.
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Returns the cm operation string for the given transformation matrix
        values = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return values + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the translation components (e, f) are shifted.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            # A single factor is applied to both axes.
            sx = sy
        elif sy is None:
            sy = sx
        assert sx is not None
        assert sy is not None
        op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        scaled = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(scaled))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        op: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        rotated = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(rotated))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list representing the transformed point in the form (x', y')

        """
        convert = FloatObject if as_object else float
        ctm = self.ctm
        x = float(pt[0])
        y = float(pt[1])
        new_x = convert(x * ctm[0] + y * ctm[2] + ctm[4])
        new_y = convert(x * ctm[1] + y * ctm[3] + ctm[5])
        # The container type of the result mirrors the one of the input.
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

332

333

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let pillow write a one-page PDF in memory and take the image
        # object from it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None

        # Put the new object in the slot of the old one, keeping the
        # original reference valid.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference

        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension after the last dot. A name without any dot is
        # kept whole: slicing with ``rfind`` (-1 when no dot is found) would
        # silently drop its last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with the hash of the data inserted before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

424

425

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    Args:
        ids_function: Callable returning the list of image identifiers.
        get_function: Callable returning the image for a given identifier.

    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the image identifiers as given by ``ids_function``."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all identifiers."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        Args:
            index: An identifier (string, list or tuple) handed to
                ``get_function`` as is, a position (negative values count
                from the end), or a slice of positions.

        Returns:
            One image, or for a slice a new virtual list restricted to the
            selected identifiers.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If a position is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Slice the identifiers fetched above instead of asking
            # ids_function a second time for the length.
            selected = lst[index]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once; going through ``self[i]`` would call
        # ids_function again for every single element.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

487

488

489class PageObject(DictionaryObject):

490 """

491 PageObject represents a single page within a PDF file.

492

493 Typically these objects will be created by accessing the

494 :attr:`pages<pypdf.PdfReader.pages>` property of the

495 :class:`PdfReader<pypdf.PdfReader>` class, but it is

496 also possible to create an empty page with the

497 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.

498

499 Args:

500 pdf: PDF file the page belongs to.

501 indirect_reference: Stores the original indirect reference to

502 this object in its source PDF

503

504 """

505

506 original_page: "PageObject" # very local use in writer when appending

507

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page and, if a reference is given, copy its entries.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object in its source PDF;
            the entries of the referenced dictionary are copied into this
            object.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled on first access to the images of the page.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source)

520

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: this function is overloaded to return the same results
    as a DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    # DictionaryObject, not the actual class, is part of the hashed value.
    return hash((DictionaryObject, hashed_entries))

535

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the encoded ``id()`` of this object."""
    identity = str(id(self)).encode()
    return super().hash_value_data() + identity

540

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    It is in multiples of 1/72 inch. Hence a value of 1 means a user
    space unit is 1/72 inch, and a value of 3 means that a user
    space unit is 3/72 inch.

    The value 1 is returned when the page has no such entry.
    """
    unit = self.get(PG.USER_UNIT, 1)
    return cast(float, unit)

551

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, try to get the page size
    from the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()
    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        # Both dimensions are taken from the last page, even if one was given.
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height
    page[NameObject(PG.MEDIABOX)] = RectangleObject((0, 0, width, height))  # type: ignore[arg-type]

    return page

597

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects found directly in ``obj`` are identified by their name,
    those found inside form XObjects by the list of names leading to them.
    The identifiers of the inline images of the page come last.

    Args:
        obj: Object whose resources are scanned; the page itself if ``None``.
        ancest: Names of the form XObjects traversed to reach ``obj``.
        call_stack: References of the objects already visited, used to
            stop on circular references.

    Returns:
        The list of identifiers.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # already visited: circular reference
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    # The inline images are collected from the page content only. They are
    # reported once, by the call without ancestors; adding them in the
    # calls made for nested form XObjects as well would list each of them
    # once per form.
    inline_ids: list[Union[str, list[str]]] = (
        [] if ancest else list(self.inline_images.keys())
    )
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore[index]
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][ImageAttributes.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

635

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image identified by ``id``.

    Args:
        id: Name of an image XObject, an inline image identifier
            (starting and ending with ``~``), or a sequence of names
            leading through nested XObjects to the image.
        obj: Object whose resources are searched; the page if ``None``.

    Returns:
        The matching image.

    Raises:
        KeyError: If ``obj`` has no XObject resources and ``id`` is not an
            inline image identifier, or if no inline image can be found.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path of a single name is the name itself.
        id = id[0]

    xobjs: Optional[DictionaryObject] = None
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError as exc:
        # Inline images do not need any XObject resources.
        if id[0] != "~" or id[-1] != "~":
            raise KeyError(
                f"Cannot access image object {id} without XObject resources"
            ) from exc

    if not isinstance(id, str):
        # in a subobject
        assert xobjs is not None
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    assert xobjs is not None
    from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
    extension, byte_stream, pil_image = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=xobjs[id].indirect_reference,
    )

679

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`  # return first image
        * `reader.pages[0].images['/I0']`  # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage (the page has to belong to a writer, replacing an
    image of a reader raises a ``TypeError``):

        writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

719

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The name found in the inline image value mapping, else for a name
        its entry in the ``/ColorSpace`` resources of the page, else ``v``
        unchanged.

    Raises:
        PdfReadError: If ``v`` is a name that is neither in the mapping nor
            in the ``/ColorSpace`` resources.

    """
    try:
        v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if isinstance(v, NameObject):
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                v = cast(DictionaryObject, res)[v]
            except KeyError as exc:  # for res and v
                # Chain explicitly, so that the failed lookup is reported
                # as the cause of the read error.
                raise PdfReadError(f"Cannot find resource entry {v} for {k}") from exc
    return v

734

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        The inline images found in the page content, keyed by ``~<n>~``
        where ``<n>`` is the position of the image, starting at 0. Empty
        if the page has no content.

    Raises:
        PdfReadError: If a ``BI``, ``ID`` or ``EI`` operator shows up
            in the operations of the content stream.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: gather settings and data of all inline images, so that
    # an unexpected operator is reported before any image gets decoded.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: build a stream object per image and decode it.
    images = {}
    for position, (settings, stream_data) in enumerate(collected):
        stream_dict = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, raw_value in settings.items():
            if short_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(short_key, item) for item in raw_value]
                )
            else:
                value = self._translate_value_inline_image(short_key, raw_value)
            # The first value given for a key wins.
            stream_dict.setdefault(NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key]), value)
        stream = EncodedStreamObject.initialize_from_dictionary(stream_dict)
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream)
        identifier = f"~{position}~"
        images[identifier] = ImageFile(
            name=f"{identifier}{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images

780

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    On assignment, the value is rounded to a multiple of 90 and reduced
    to the range 0 to 270.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    # Anything else is resolved through get_object().
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Adding 45 before the floor division rounds to the closest quarter turn.
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

795

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media = RectangleObject(self.mediabox)

    # Rotate around the center of the media box ...
    center_x = float(media.left + media.width / 2)
    center_y = float(media.bottom + media.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)

    # ... then shift the rotated box, so that its lower left corner is
    # at the origin.
    x1, y1 = trsf.apply_on(media.lower_left)
    x2, y2 = trsf.apply_on(media.upper_right)
    trsf = trsf.translate(-min(x1, x2), -min(y1, y2))
    self.add_transformation(trsf, False)

    # Apply the same transformation to the boxes present on the page.
    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore[arg-type]
        x1, y1 = trsf.apply_on(box.lower_left)
        x2, y2 = trsf.apply_on(box.upper_right)
        self[NameObject(box_name)] = RectangleObject(
            (min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
        )

830

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

846

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the entries of one resource category of ``res2`` into ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, the merge is done in a new dictionary started
            from the entries of ``res1``; otherwise ``res1[resource]``
            itself is modified.

    Returns:
        A tuple (merged entries sorted by key, mapping of the names of
        ``res2`` that had to be changed to their new name).

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname
        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            # Store the reference instead of the object, where there is one.
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary with its entries sorted.
    ordered = sorted(new_res.items())
    new_res.clear()
    for res_key, res_value in ordered:
        new_res[res_key] = res_value
    return new_res, rename_res

927

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping of the names to replace to their new name.
        pdf: Handed to the ``ContentStream`` created for the result.

    Returns:
        ``stream`` itself if ``rename`` is empty, else a new
        ``ContentStream`` built from it with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dictionary.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Entries are replaced in place; names without a new name are kept.
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return stream

949

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: Contents the new ``ContentStream`` is built from.
        pdf: Handed to the ``ContentStream`` created for the result.
        ctm: Values of the matrix, used as operands of the ``cm`` operator.

    Returns:
        The new content stream starting with the ``cm`` operation.

    """
    content_stream = ContentStream(contents, pdf)
    cm_operation = ([FloatObject(value) for value in ctm], b"cm")
    content_stream.operations.insert(0, cm_operation)
    return content_stream

966

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist
        or is null. The data of the items of an array are concatenated.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    # Treat a null entry like a missing one, as ``get_contents()`` does,
    # instead of failing on the missing ``get_data`` attribute.
    if is_null_or_none(obj):
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

981

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object, or ``None`` if it does not exist or is
        null. ``/Contents`` is optional, as described in §7.7.3.3 of the
        PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # no usable reference: the stream is built without a document
        pdf = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

1002

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an indirect reference the content is stored directly
    under ``/Contents``. Otherwise the objects of a previous ``/Contents``
    array are replaced by null objects in the owning document and the new
    content is registered there.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported here rather than at module level (see the noqa marker).
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    writer = self.indirect_reference.pdf
    # Nullify every stream referenced by a previous /Contents array.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # A new array is stored as an array of the values returned by ``_add_object``.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nullify the old object and drop the /Contents key.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing object to reuse: register the content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the indirect reference of the existing /Contents object.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1065

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered unless
    ``expand`` is set.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.
        over: If True (default), the content of ``page2`` is added after
            this page's content, i.e. drawn "on top" of it; otherwise it is
            placed before it, i.e. drawn underneath.

    """
    self._merge_page(page2, expand=expand, over=over)

1087

def _merge_page(
    self,
    page2: "PageObject",
    page2_transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When this page has an indirect reference whose document exposes
    ``_add_object`` the work is delegated to ``_merge_page_writer``.
    Otherwise resources, annotations and content streams are merged here.

    Args:
        page2: The page to be merged into this one.
        page2_transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix, used when expanding the mediabox.
        over: Append the content of ``page2`` if True, otherwise insert it
            before the content of this page.
        expand: Whether to grow the mediabox to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(self.indirect_reference.pdf, "_add_object"):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2_transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename: dict[str, Any] = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2_resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)
    self[NameObject(PG.ANNOTS)] = new_annots

    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        new, new_resource_name = self._merge_resources(
            original_resources, page2_resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(new_resource_name)

    # Combine /ProcSet sets, making sure there is a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2_resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2_content = page2.get_contents()
    if page2_content is not None:
        # Clip the merged content to the box of page2 selected by
        # MERGE_CROP_BOX ("re" + "W" + "n").
        rect = getattr(page2, MERGE_CROP_BOX)
        page2_content.operations.insert(
            0,
            (
                # A real list rather than a single-use ``map`` iterator, so
                # the operands can be traversed more than once and are
                # recognised by list-based operand handling.
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2_content.operations.insert(1, ([], b"W"))
        page2_content.operations.insert(2, ([], b"n"))
        if page2_transformation is not None:
            page2_content = page2_transformation(page2_content)
        page2_content = PageObject._content_stream_rename(
            page2_content, rename, self.pdf
        )
        page2_content.isolate_graphics_state()
        if over:
            new_content_array.append(page2_content)
        else:
            new_content_array.insert(0, page2_content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources

    return None

1193

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, which must have an indirect reference.

    Resources of ``page2`` are merged into the resources of this page,
    its annotations are cloned into the owning document with their
    geometry transformed by ``ctm``, and its content stream is added to
    the contents of this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Transformation matrix applied to annotation geometry and used
            when expanding the mediabox; ``None`` means no transformation.
        over: Append the content of ``page2`` if True, otherwise insert it
            before the content of this page.
        expand: Whether to grow the mediabox to also cover ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    rename = {}
    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if not is_null_or_none(page2.get(PG.ANNOTS, None)):
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and re-normalise the rectangle.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 x n numbers (n quadrilaterals), so
                # transform every (x, y) pair rather than only the first
                # four points; this also tolerates short arrays.
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    coordinate
                    for i in range(0, len(q) - 1, 2)
                    for coordinate in trsf.apply_on((q[i], q[i + 1]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the box of page2 selected by
        # MERGE_CROP_BOX ("re" + "W" + "n").
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                # A real list rather than a single-use ``map`` iterator, so
                # the operands can be traversed more than once and are
                # recognised by list-based operand handling.
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1331

1332 def _expand_mediabox(

1333 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]

1334 ) -> None:

1335 corners1 = (

1336 self.mediabox.left.as_numeric(),

1337 self.mediabox.bottom.as_numeric(),

1338 self.mediabox.right.as_numeric(),

1339 self.mediabox.top.as_numeric(),

1340 )

1341 corners2 = (

1342 page2.mediabox.left.as_numeric(),

1343 page2.mediabox.bottom.as_numeric(),

1344 page2.mediabox.left.as_numeric(),

1345 page2.mediabox.top.as_numeric(),

1346 page2.mediabox.right.as_numeric(),

1347 page2.mediabox.top.as_numeric(),

1348 page2.mediabox.right.as_numeric(),

1349 page2.mediabox.bottom.as_numeric(),

1350 )

1351 if ctm is not None:

1352 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment]

1353 new_x = tuple(

1354 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]

1355 for i in range(0, 8, 2)

1356 )

1357 new_y = tuple(

1358 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]

1359 for i in range(0, 8, 2)

1360 )

1361 else:

1362 new_x = corners2[0:8:2]

1363 new_y = corners2[1:8:2]

1364 lowerleft = (min(new_x), min(new_y))

1365 upperright = (max(new_x), max(new_y))

1366 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1]))

1367 upperright = (

1368 max(corners1[2], upperright[0]),

1369 max(corners1[3], upperright[1]),

1370 )

1371

1372 self.mediabox.lower_left = lowerleft

1373 self.mediabox.upper_right = upperright

1374

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` object whose
            ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    def _prepend_matrix(page2_content: Any) -> ContentStream:
        # Puts the matrix in front of the content of page2.
        return PageObject._add_transformation_matrix(page2_content, page2.pdf, ctm)

    self._merge_page(page2, _prepend_matrix, ctm, over, expand)

1406

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, applied to both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1424

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1446

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1470

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    current = self.get_contents()
    if current is not None:
        transformed = PageObject._add_transformation_matrix(current, self.pdf, matrix)
        transformed.isolate_graphics_state()
        self.replace_contents(transformed)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    m = tuple(float(value) for value in matrix)
    xs = [m[0] * x + m[2] * y + m[4] for x, y in corners]
    ys = [m[1] * x + m[3] * y + m[5] for x, y in corners]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1520

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the contents of the page, the ``/Rect`` of its annotations and the
    ``/BBox`` of every viewport listed in ``/VP``.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.mediabox = self.mediabox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if AnnotationDictionaryAttributes.Rect in annotation_obj:
                    rectangle = annotation_obj[AnnotationDictionaryAttributes.Rect]
                    if isinstance(rectangle, ArrayObject):
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # Rescale every viewport of the array, not only the first one; a
        # bare (non-array) viewport is handled as a single entry. An empty
        # array is simply left alone.
        viewports = viewport if isinstance(viewport, ArrayObject) else [viewport]
        for entry in viewports:
            # Entries may be references; resolve them before updating.
            viewport_dict = entry.get_object()
            bbox = viewport_dict["/BBox"]
            viewport_dict[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )

1574

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by calling :meth:`scale` with the same factor
    for the horizontal and the vertical axis.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1585

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors passed to :meth:`scale` are the requested dimensions
    divided by the current mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1599

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own reference nor through a document exposing
            ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the stored object behind the content's own reference.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore[union-attr]
    except AttributeError:
        page_reference = self.indirect_reference
        attached_to_writer = page_reference is not None and hasattr(
            page_reference.pdf, "_add_object"
        )
        if not attached_to_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1622

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page has no indirect reference or is not found among those pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return int(reference.pdf.pages.index(self))
    except ValueError:
        return None

1639

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    The dump lists every content stream operation, then a separator, then
    for each font of the page resources its name, its representation and,
    when available, its ``/Encoding`` and decoded ``/ToUnicode`` data.
    ``"No Font"`` is appended when a ``KeyError`` occurs in the font part.

    Returns:
        The assembled dump.

    """
    parts: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        # For TJ, the plain strings of the first operand are shown inline.
        shown = (
            [x for x in operands[0] if isinstance(x, str)]
            if operator == b"TJ"
            else []
        )
        parts.append(
            operator.decode("utf-8") + " " + "".join(shown) + repr(operands) + "\n"
        )
    parts.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            parts.append(font_name + "\n")
            parts.append(repr(self[PG.RESOURCES]["/Font"][font_name]) + "\n")  # type:ignore
            try:
                encoding_repr = repr(
                    self[PG.RESOURCES]["/Font"][font_name]["/Encoding"]  # type:ignore
                )
                parts.append(encoding_repr + "\n")
            except Exception:
                pass
            try:
                to_unicode = (
                    self[PG.RESOURCES]["/Font"][font_name]["/ToUnicode"]  # type:ignore
                    .get_data()
                    .decode()
                )
                parts.append(to_unicode + "\n")
            except Exception:
                pass
    except KeyError:
        parts.append("No Font\n")
    return "".join(parts)

1677

def _extract_text(
    self,
    obj: DictionaryObject,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract the text of ``obj`` by running its content stream operations
    through a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: Dictionary whose resources and content are processed.
        pdf: Document handed to the ``ContentStream`` constructor.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = "/Content"
        known_ids: ``id()`` values of the form XObjects currently being
            processed; used to detect cyclic form XObject references.

    Returns:
        The extracted text; an empty string when there are no resources or
        no content can be read.

    """
    if known_ids is None:
        known_ids = set()

    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    resources_dict = cast(
        Optional[DictionaryObject],
        obj.get_inherited(key=PG.RESOURCES, default=DictionaryObject())
    )
    if is_null_or_none(resources_dict) or not resources_dict:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    # Collect the font resources; fonts that fail with AttributeError or
    # TypeError are skipped.
    if (
        "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(fonts[font_resource].space_char, 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as "T*" followed by "Tj".
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as "Tw", "Tc", "T*" and "Tj" on the respective operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold adds
                    # a space, unless the text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as "TL" with the negated second operand, then "Td".
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before processing the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty.
                pass
            try:
                xobj = cast(DictionaryObject, resources_dict["/XObject"])
                xform = cast(EncodedStreamObject, xobj[operands[0]])
                if xform["/Subtype"] != NameObject("/Image"):
                    xform_id = id(xform)
                    if xform_id in known_ids:
                        logger_warning(
                            "Detected cyclic form XObject reference, skipping %(operand)s.",
                            source=__name__,
                            operand=operands[0]
                        )
                        text = ""
                    else:
                        # Mark the form as in progress only while it is processed.
                        known_ids.add(xform_id)
                        try:
                            text = self.extract_xform_text(
                                xform,
                                orientations,
                                space_width,
                                visitor_operand_before,
                                visitor_operand_after,
                                visitor_text,
                                known_ids=known_ids,
                            )
                        finally:
                            known_ids.discard(xform_id)
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    "Impossible to decode XFormObject %(operand)s: %(exception)s",
                    source=__name__,
                    operand=operands[0],
                    exception=exception,
                )
            finally:
                # Reset the pending text and memorised matrices in every case.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1857

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The page and then each ``/Parent`` are visited in turn; fonts found
    later replace earlier ones stored under the same name. Fonts are only
    collected while ``self.pdf`` is not None.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            collected.update(
                {
                    name: Font.from_font_resource(resources["/Font"][name])
                    for name in resources["/Font"]
                }
            )
        # Continue with the parent; stop when there is none.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected

1883

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. Empty when
        no text show operations are found.

    """
    layout_fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(layout_fonts, indent=2, default=asdict)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    page_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    text_groups = _layout_mode.text_show_operations(
        iter(page_stream.operations), layout_fonts, strip_rotated, debug_path
    )
    if not text_groups:
        return ""

    line_groups = _layout_mode.y_coordinate_groups(text_groups, debug_path)
    average_char_width = _layout_mode.fixed_char_width(text_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        line_groups, average_char_width, space_vertically, font_height_weight
    )

1942

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Positional arguments are still accepted in "plain" mode: either
    ``(orientations, space_width)``, or two leading positions starting with a
    string (which are skipped) followed by ``(orientations, space_width)``.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            A single int is accepted and treated as a one-element tuple.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type.

    """
    if extraction_mode not in ["plain", "layout"]:
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: warn about each one given.
        given_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in given_visitors.items():
            if callback:
                logger_warning(
                    "Argument %(visitor)s is ignored in layout mode",
                    source=__name__,
                    visitor=visitor,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
        )

    if args:
        # Work out where (orientations, space_width) start in ``args``:
        # after two skipped positions when the first one is a string,
        # otherwise right at the beginning.
        if isinstance(args[0], str):
            start = 2
        elif isinstance(args[0], (tuple, int)):
            start = 0
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

        if len(args) > start:
            if not isinstance(args[start], (tuple, int)):
                raise TypeError(f"Invalid positional parameter {args[start]}")
            orientations = args[start]
        if len(args) > start + 1:
            if not isinstance(args[start + 1], (float, int)):
                raise TypeError(f"Invalid positional parameter {args[start + 1]}")
            space_width = args[start + 1]

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2079

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract text from an XObject.

    This is a thin wrapper: every argument is forwarded to
    ``self._extract_text`` together with ``self.pdf``, with no content key
    (``None``) since the XObject itself is the object to read.

    Args:
        xform: the XObject to extract the text from.
        orientations: orientations to look for; same meaning and same
            default, (0, 90, 180, 270), as in ``extract_text``.
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: see ``extract_text``; forwarded unchanged.
        visitor_operand_after: see ``extract_text``; forwarded unchanged.
        visitor_text: see ``extract_text``; forwarded unchanged.
        known_ids: forwarded unchanged to ``self._extract_text``
            (presumably the ids of XObjects already visited - confirm there).

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
        known_ids=known_ids,
    )

2117

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The font names are collected by ``_get_fonts_walk`` starting from this
    page's dictionary; a font counts as unembedded when it is used but not
    reported as embedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    used, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, used - embedded

2133

# Page boundary boxes. Each accessor is built by _create_rectangle_accessor
# from the box's own key (first argument) and a tuple of other box keys
# (second argument).
# NOTE(review): the second argument looks like the ordered fallback keys used
# when the box itself is absent (cropbox documents "Default value: same as
# mediabox") - confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2165

@property
def annotations(self) -> Optional[ArrayObject]:
    """Return the page's ``/Annots`` array, or ``None`` if it has none."""
    if "/Annots" in self:
        return cast(ArrayObject, self["/Annots"])
    return None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Passing ``None`` removes the ``/Annots`` entry (a no-op when the page
    has none); any other value replaces it.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2187

2188

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages whose length and items are provided by two callables.

    Nothing is stored in the list itself: ``len()`` calls ``length_function``
    and integer indexing calls ``get_function``. Deleting an item removes the
    page from the page tree it belongs to.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page at a given non-negative index.
        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if ``index`` is out of range.
        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The slice is itself lazy: it maps its own indexes onto this list.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or a slice of pages, from the page tree.

        The page is removed from its parent's ``/Kids``; the parent's
        ``/Count`` (when present) is decremented, and parents left without
        kids are removed from their own parent in turn.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if ``index`` is out of range.
            PdfReadError: if the page is not listed in its parent's ``/Kids``.
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for p in sorted(range(*index.indices(len(self))), reverse=True):
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        page_ref = ind  # the page itself; ``ind`` moves up the tree below
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            kids = cast(ArrayObject, parent["/Kids"])
            try:
                position = kids.index(ind)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break
            del kids[position]
            if first:
                # Exactly one page is deleted, so exactly one cache entry must
                # go. Doing this again while pruning empty ancestors would
                # drop an unrelated page from the cache.
                try:
                    del page_ref.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    # best effort: no (usable) flattened page cache
                    pass
                first = False
            if "/Count" in parent:
                parent[NameObject("/Count")] = NumberObject(
                    cast(int, parent["/Count"]) - 1
                )
            if len(kids) == 0:
                # No more objects in this part of this subtree
                ind = parent.indirect_reference
                parent = parent.get("/Parent", None)
            # Otherwise the next iteration looks for ``ind`` in the same
            # /Kids again, does not find it and leaves the loop.
            # NOTE(review): /Count of ancestors above a parent that keeps
            # other kids is not updated here - confirm whether callers rely
            # on it.

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2281

2282

2283def _get_fonts_walk(

2284 obj: DictionaryObject,

2285 fnt: set[str],

2286 emb: set[str],

2287) -> tuple[set[str], set[str]]:

2288 """

2289 Get the set of all fonts and all embedded fonts.

2290

2291 Args:

2292 obj: Page resources dictionary

2293 fnt: font

2294 emb: embedded fonts

2295

2296 Returns:

2297 A tuple (fnt, emb)

2298

2299 If there is a key called 'BaseFont', that is a font that is used in the document.

2300 If there is a key called 'FontName' and another key in the same dictionary object

2301 that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is

2302 embedded.

2303

2304 We create and add to two sets, fnt = fonts used and emb = fonts embedded.

2305

2306 """

2307 fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

2308

2309 def process_font(f: DictionaryObject) -> None:

2310 nonlocal fnt, emb

2311 f = cast(DictionaryObject, f.get_object()) # to be sure

2312 if "/BaseFont" in f:

2313 fnt.add(cast(str, f["/BaseFont"]))

2314

2315 if (

2316 ("/CharProcs" in f)

2317 or (

2318 "/FontDescriptor" in f

2319 and any(

2320 x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys

2321 )

2322 )

2323 or (

2324 "/DescendantFonts" in f

2325 and "/FontDescriptor"

2326 in cast(

2327 DictionaryObject,

2328 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),

2329 )

2330 and any(

2331 x

2332 in cast(

2333 DictionaryObject,

2334 cast(

2335 DictionaryObject,

2336 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),

2337 )["/FontDescriptor"],

2338 )

2339 for x in fontkeys

2340 )

2341 )

2342 ):

2343 # the list comprehension ensures there is FontFile

2344 try:

2345 emb.add(cast(str, f["/BaseFont"]))

2346 except KeyError:

2347 emb.add("(" + cast(str, f["/Subtype"]) + ")")

2348

2349 if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):

2350 for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]):

2351 process_font(f)

2352 if "/Resources" in obj:

2353 if "/Font" in cast(DictionaryObject, obj["/Resources"]):

2354 for f in cast(

2355 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]

2356 ).values():

2357 process_font(f)

2358 if "/XObject" in cast(DictionaryObject, obj["/Resources"]):

2359 for x in cast(

2360 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]

2361 ).values():

2362 _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)

2363 if "/Annots" in obj:

2364 for a in cast(ArrayObject, obj["/Annots"]):

2365 _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)

2366 if "/AP" in obj:

2367 if (

2368 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get(

2369 "/Type"

2370 )

2371 == "/XObject"

2372 ):

2373 _get_fonts_walk(

2374 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]),

2375 fnt,

2376 emb,

2377 )

2378 else:

2379 for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]):

2380 _get_fonts_walk(cast(DictionaryObject, a), fnt, emb)

2381 return fnt, emb # return the sets for each page

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

930 statements