Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

922 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from copy import deepcopy 

33from dataclasses import asdict, dataclass 

34from decimal import Decimal 

35from io import BytesIO 

36from pathlib import Path 

37from typing import ( 

38 Any, 

39 Callable, 

40 Literal, 

41 Optional, 

42 Union, 

43 cast, 

44 overload, 

45) 

46 

47from ._font import Font 

48from ._protocols import PdfCommonDocProtocol 

49from ._text_extraction import ( 

50 _layout_mode, 

51) 

52from ._text_extraction._text_extractor import TextExtraction 

53from ._utils import ( 

54 CompressedTransformationMatrix, 

55 TransformationMatrixType, 

56 _human_readable_bytes, 

57 deprecate, 

58 logger_warning, 

59 matrix_multiply, 

60) 

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

62from .constants import AnnotationDictionaryAttributes as ADA 

63from .constants import ImageAttributes as IA 

64from .constants import PageAttributes as PG 

65from .constants import Resources as RES 

66from .errors import PageSizeNotDefinedError, PdfReadError 

67from .generic import ( 

68 ArrayObject, 

69 ContentStream, 

70 DictionaryObject, 

71 EncodedStreamObject, 

72 FloatObject, 

73 IndirectObject, 

74 NameObject, 

75 NullObject, 

76 NumberObject, 

77 PdfObject, 

78 RectangleObject, 

79 StreamObject, 

80 is_null_or_none, 

81) 

82 

# Pillow is an optional dependency.  Record whether it could be imported so
# that image-related features can raise a helpful error only when used.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore[assignment,misc,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

92 

93 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to other keys.

    Args:
        self: The dictionary-like object (providing ``get`` and ``pdf``) to read.
        name: Key of the requested rectangle.
        defaults: Keys to try, in order, when ``name`` is missing or null.

    Returns:
        The rectangle as a ``RectangleObject``.  Unless the stored value was
        already a ``RectangleObject``, the result is also written back
        under ``name``.

    """
    retval: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # A null fallback entry is as unusable as a missing one, so keep
            # looking (consistent with the check applied to ``name`` above).
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    if isinstance(retval, ArrayObject) and (length := len(retval)) > 4:
        # Tolerate over-long arrays by keeping only the first four values.
        logger_warning(f"Expected four values, got {length}: {retval}", __name__)
        retval = RectangleObject(tuple(retval[:4]))
    else:
        retval = RectangleObject(retval)  # type: ignore
    _set_rectangle(self, name, retval)
    return retval

112 

113 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

116 

117 

118def _delete_rectangle(self: Any, name: str) -> None: 

119 del self[name] 

120 

121 

122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

123 return property( 

124 lambda self: _get_rectangle(self, name, fallback), 

125 lambda self, value: _set_rectangle(self, name, value), 

126 lambda self: _delete_rectangle(self, name), 

127 ) 

128 

129 

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                     a b 0
        [ x′ y′ 1 ] = [ x y 1 ]  ×   c d 0
                                     e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The six-element form (a, b, c, d, e, f); defaults to the identity.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)

        """
        first_row = matrix[0]
        second_row = matrix[1]
        third_row = matrix[2]
        # Only the first two columns carry information; the third is constant.
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Returns the cm operation string for the given transformation matrix
        formatted = " ".join(f"{self.ctm[position]:.4f}" for position in range(6))
        return formatted + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the translation components (e, f) change.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        # A single given factor is used for both axes.
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        radians = math.radians(rotation)
        cosine = math.cos(radians)
        sine = math.sin(radians)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list representing the transformed point in the form (x', y')

        """
        convert = FloatObject if as_object else float
        values = self.ctm
        new_x = convert(float(pt[0]) * values[0] + float(pt[1]) * values[2] + values[4])
        new_y = convert(float(pt[0]) * values[1] + float(pt[1]) * values[3] + values[5])
        transformed = (new_x, new_y)
        # Mirror the container type of the input point.
        if isinstance(pt, list):
            return list(transformed)
        return transformed

324 

325 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow write a one-page PDF, then take the image object from it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Swap the stored object in place so existing references stay valid.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Replace only an existing extension; a name without "." is kept whole
        # (slicing with ``rfind`` would drop its last character).
        stem, dot, _previous_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

418 

419 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    Args:
        ids_function: Callable returning the identifiers of the available images.
        get_function: Callable returning the ``ImageFile`` for one identifier.

    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image (by position or identifier) or a sub-sequence (by slice).

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Reuse the identifiers already fetched instead of calling
            # ``ids_function`` a second time through ``len(self)``.
            indices = range(*index.indices(len(lst)))
            lst = [lst[x] for x in indices]
            cls = type(self)
            return cls((lambda: lst), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once; going through ``self[i]`` would call
        # ``ids_function`` again for every single element.
        for identifier in self.ids_function():
            yield self.get_function(identifier)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

481 

482 

483class PageObject(DictionaryObject): 

484 """ 

485 PageObject represents a single page within a PDF file. 

486 

487 Typically these objects will be created by accessing the 

488 :attr:`pages<pypdf.PdfReader.pages>` property of the 

489 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

490 also possible to create an empty page with the 

491 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

492 

493 Args: 

494 pdf: PDF file the page belongs to. 

495 indirect_reference: Stores the original indirect reference to 

496 this object in its source PDF 

497 

498 """ 

499 

500 original_page: "PageObject" # very local use in writer when appending 

501 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Initialise the page, copying the referenced dictionary when one is given.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to this object in its source PDF; when
            it is neither ``None`` nor null, the entries of the referenced
            object are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source = not is_null_or_none(indirect_reference)
    if has_source:
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

515 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: this function is overloaded to return the same results
    as a DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    # Hash as a plain DictionaryObject: the type marker is DictionaryObject,
    # followed by every key with the ``hash_bin()`` of its value.
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

530 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data with the ``id()`` of this instance appended."""
    data = super().hash_value_data()
    identity = str(id(self)).encode()
    data += identity
    return data

535 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    It is in multiples of 1/72 inch. Hence a value of 1 means a user
    space unit is 1/72 inch, and a value of 3 means that a user
    space unit is 3/72 inch.

    Falls back to 1 when the page has no such entry.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

546 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken
    from the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    size_missing = width is None or height is None
    if size_missing:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        last_page = pdf.pages[len(pdf.pages) - 1]
        width = last_page.mediabox.width
        height = last_page.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

592 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    Collect the identifiers of all images reachable from ``obj``.

    Args:
        obj: Object whose resources are scanned; defaults to the page itself.
        ancest: Names of the enclosing XObjects leading to ``obj``.
        call_stack: Indirect references already being visited; an object
            whose reference is already listed yields no identifiers.

    Returns:
        Identifiers of the XObject images (a name at the top level, a list
        of names for nested ones) followed by the inline image keys.

    """
    if call_stack is None:
        call_stack = []
    reference = getattr(obj, "indirect_reference", None)
    if reference in call_stack:
        # Already being visited: stop here to avoid endless recursion.
        return []
    call_stack.append(reference)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    lst: list[Union[str, list[str]]] = []

    resources = obj[PG.RESOURCES] if PG.RESOURCES in obj else None
    if is_null_or_none(resources) or RES.XOBJECT not in cast(DictionaryObject, resources):
        # No XObjects at this level: only inline images can be reported.
        return [] if self.inline_images is None else list(self.inline_images.keys())

    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        candidate = x_object[o]
        if not isinstance(candidate, StreamObject):
            continue
        if candidate[IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(candidate, [*ancest, o], call_stack))
    assert self.inline_images is not None
    lst.extend(list(self.inline_images.keys()))
    return lst

630 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image identified by ``id``.

    Args:
        id: An XObject name, an inline image key of the form ``~n~``, or a
            list/tuple of names leading through nested XObjects to the image.
        obj: Object whose resources are searched; defaults to the page itself.

    Returns:
        The matching ``ImageFile``.

    Raises:
        KeyError: If the resources hold no XObject entry and ``id`` is not
            an inline image key, or if a name cannot be found.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObjects are only acceptable when an inline image is requested.
        if id[0] != "~" or id[-1] != "~":
            raise
    if not isinstance(id, str):
        # in a subobject
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
    extension, byte_stream, image = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=image,
        indirect_reference=xobjs[id].indirect_reference,
    )

669 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`  # return first image
        * `reader.pages[0].images['/I0']`  # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    # The list is virtual: identifiers and images are looked up on demand.
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

709 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The mapped name when ``v`` is listed in the inline image value
        mapping, the ``/ColorSpace`` resource entry when ``v`` is another
        name, and ``v`` unchanged otherwise.

    Raises:
        PdfReadError: If ``v`` is a name found neither in the mapping nor
            in the page's ``/ColorSpace`` resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources lookup and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

724 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A mapping from ``~<index>~`` to the decoded ``ImageFile``; empty
        when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect every inline image before decoding any of them,
    # so unexpected operators are reported ahead of decoding work.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append(
                {"settings": param["settings"], "__streamdata__": param["data"]}
            )
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: build a stream object per image and decode it.
    files = {}
    for num, entry in enumerate(collected):
        stream_data = entry["__streamdata__"]
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in entry["settings"].items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                v = self._translate_value_inline_image(k, v)
            k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            # The first value seen for a key wins.
            init.setdefault(k, v)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        entry["object"] = stream
        # Imported only once an inline image actually has to be decoded.
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

770 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Reading returns 0 when the page has no rotation entry.  Assigning
    truncates the value to an integer, rounds it to the nearest multiple
    of 90 and reduces it modulo 360.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    # Anything else is resolved through ``get_object()`` first.
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

785 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0 and every box present on the page is
    replaced by its transformed counterpart.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    mb = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box ...
    centre_x = float(mb.left + mb.width / 2)
    centre_y = float(mb.bottom + mb.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ... then shift so the transformed media box starts at the origin.
    corner_a = trsf.apply_on(mb.lower_left)
    corner_b = trsf.apply_on(mb.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        # Re-normalise: the transformation may have swapped the corners.
        self[NameObject(box_name)] = RectangleObject(
            (
                min(corner_a[0], corner_b[0]),
                min(corner_a[1], corner_b[1]),
                max(corner_a[0], corner_b[0]),
                max(corner_a[1], corner_b[1]),
            )
        )

820 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    # The angle is added to the current rotation and stored as is.
    new_rotation = NumberObject(self.rotation + angle)
    self[NameObject(PG.ROTATE)] = new_rotation
    return self

836 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, merge into a fresh copy of ``res1[resource]``;
            otherwise update ``res1[resource]`` in place.

    Returns:
        A tuple ``(merged, renamed)``: the merged dictionary with its
        entries sorted, and a mapping from original key to new name for
        every ``res2`` entry that had to be stored under a different key.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        # expect isinstance(pdf, PdfWriter)
        is_pdf_writer = hasattr(pdf, "_add_object")
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            # An identical entry already exists; nothing to copy.
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            # Store the reference of the clone when it has one.
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries end up sorted.
    ordered = sorted(new_res.items())
    new_res.clear()
    for entry in ordered:
        new_res[entry[0]] = entry[1]
    return new_res, rename_res

917 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Rename the name operands of a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from existing name to replacement name.
        pdf: Passed on to ``ContentStream`` when wrapping ``stream``.

    Returns:
        ``stream`` itself when ``rename`` is empty; otherwise a new
        ``ContentStream`` built from ``stream`` with every ``NameObject``
        operand found in ``rename`` replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Values are replaced in place; keys and positions never change.
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return stream

939 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: The content to wrap in a ``ContentStream``.
        pdf: Passed on to ``ContentStream``.
        ctm: Values of the matrix, used as operands of the ``cm`` operator.

    Returns:
        The new ``ContentStream`` starting with the ``cm`` operation.

    """
    contents = ContentStream(contents, pdf)
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    contents.operations.insert(0, cm_operation)
    return contents

956 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The data of the ``/Contents`` object (the concatenated data of all
        its elements when it is a list), or ``None`` if the entry does not
        exist or is null.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    # A null entry has no ``get_data()``; treat it like a missing entry,
    # which is also what get_contents() does for the same situation.
    if is_null_or_none(obj):
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

971 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the ``/Contents`` object, or ``None``
        if the entry is missing or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # The page may have no usable indirect reference; then no document is
    # passed to the content stream.
    try:
        owner = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        owner = None
    contents = self[PG.CONTENTS]
    if is_null_or_none(contents):
        return None
    return ContentStream(contents.get_object(), owner)

992 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally rather than at module level.
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # NOTE: despite the name, ``writer`` may be a non-writer document here;
    # the try/except blocks below cover that case.
    writer = self.indirect_reference.pdf
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # The current contents are an array: replace each element's object
        # with a NullObject.
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    if isinstance(content, ArrayObject):
        # Register every element of the new array and keep what
        # ``_add_object`` returns for each of them.
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting the contents: nothing to do if there are none, otherwise
        # nullify the existing object and drop the entry.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing contents object exposing an ``indirect_reference``
        # attribute: add the new content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Existing contents object: the new content takes over its indirect
        # reference and replaces it under that reference.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1055 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) from both pages are kept.
    The mediabox, cropbox, etc. of this page are not altered.
    By default the content stream of ``page2`` is appended after the content
    stream of this page, i.e. it is drawn after, or "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: Put the content of ``page2`` over this page if True (default),
            else under it.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation is involved; only the layering and expansion
    # options are forwarded.
    self._merge_page(page2, expand=expand, over=over)

1077 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When the document owning this page exposes ``_add_object`` the work is
    delegated to ``_merge_page_writer``. Otherwise the resources, /ProcSet,
    annotations and content streams of both pages are combined here and
    written back to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Optional transformation matrix, used when expanding the mediabox.
        over: Put the content of ``page2`` after the content of this page if
            True, else before it.
        expand: If True, expand the mediabox of this page via
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    # NOTE(review): the try also wraps the delegated call, so an
    # AssertionError/AttributeError raised inside _merge_page_writer falls
    # through to the code below.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Annotations of both pages are concatenated, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re", "W", "n").
        # The operands are a real list: a lazy ``map`` object would be
        # exhausted by the first traversal of the operations.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1184 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    The resources and /ProcSet of ``page2`` are merged into the existing
    resource dictionary of this page, the annotations of ``page2`` are
    cloned into the document owning this page (with their rectangles run
    through ``ctm``), and the content streams are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Optional transformation matrix applied to annotation rectangles
            and used when expanding the mediabox.
        over: Put the content of ``page2`` after the content of this page if
            True, else before it.
        expand: If True, expand the mediabox of this page via
            ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners of /Rect and re-normalise them
            # into (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            # Point a /Popup entry, if any, back at the cloned annotation.
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re", "W", "n").
        # The operands are a real list: a lazy ``map`` object would be
        # exhausted by the first traversal of the operations.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1322 

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow the mediabox of this page so that it also covers ``page2``.

    Args:
        page2: The page whose mediabox has to fit into this page.
        ctm: Optional transformation matrix applied to the four corners of
            the mediabox of ``page2`` before the comparison.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    left = other_box.left.as_numeric()
    bottom = other_box.bottom.as_numeric()
    right = other_box.right.as_numeric()
    top = other_box.top.as_numeric()
    # Corners of page2's mediabox: lower-left, upper-left, upper-right,
    # lower-right.
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    if ctm is None:
        xs = tuple(x for x, _ in corners)
        ys = tuple(y for _, y in corners)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners)
        ys = tuple(matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners)

    # Union of this page's box with the bounding box of the corners.
    self.mediabox.lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    self.mediabox.upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

1365 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after applying a transformation matrix.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that a
    transformation matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` object.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def apply_matrix(page2_content: Any) -> ContentStream:
        # The stream is rebuilt against the document of page2.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, apply_matrix, matrix, over, expand)

1397 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1415 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after rotating its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1437 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after translating its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    stream to be merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1461 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    # lower-left, upper-left, upper-right, lower-right
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1511 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the content, this updates the page boundaries (bleedbox,
    trimbox, artbox, cropbox, mediabox), the rectangles of the page
    annotations and the ``/BBox`` of the viewport entry, when present.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # The boxes are rescaled one after the other, in this fixed order.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Factors matching the (x, y, x, y) layout of a rectangle.
    factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                for index, factor in enumerate(factors):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP not in self:
        return
    viewport = self[PG.VP]
    # For an array, only the first element is handled.
    is_array = isinstance(viewport, ArrayObject)
    if is_array:
        bbox = viewport[0]["/BBox"]
    else:
        bbox = viewport["/BBox"]  # type: ignore
    scaled_bbox = RectangleObject(
        tuple(float(bbox[index]) * factor for index, factor in enumerate(factors))
    )
    if is_array:
        self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
            NameObject("/BBox")
        ] = scaled_bbox
    else:
        self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1565 

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)

1576 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are derived from the current mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal = width / float(self.mediabox.width)
    vertical = height / float(self.mediabox.height)
    self.scale(horizontal, vertical)

1590 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: The level handed to ``flate_encode``.

    Raises:
        ValueError: If the compressed content can be stored neither in
            place nor through ``replace_contents``.

    """
    content = self.get_contents()
    if content is None:
        return
    compressed = content.flate_encode(level)
    try:
        # Overwrite the existing object slot of the content stream.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        attached_to_writer = page_reference is not None and hasattr(
            page_reference.pdf, "_add_object"
        )
        if not attached_to_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1613 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        The index of this page in the ``pages`` of its document; None if
        the page is not attached to a PDF or is not found among the pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1630 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    Returns:
        One line per content stream operation, a separator line, then for
        every font of the page resources its name, its representation and,
        when they can be obtained, its ``/Encoding`` and ``/ToUnicode``
        data. ``"No Font\\n"`` is appended if a ``KeyError`` is raised
        while reading the fonts.

    """
    pieces: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # Only TJ operations contribute their string operands.
        shown = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(op.decode("utf-8") + " " + "".join(shown) + ope.__repr__() + "\n")
    pieces.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            pieces.append(fo + "\n")
            pieces.append(self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n")  # type:ignore
            # Encoding and ToUnicode are optional extras: skip them on any
            # failure.
            try:
                encoding = self[PG.RESOURCES]["/Font"][fo]["/Encoding"]  # type:ignore
                pieces.append(encoding.__repr__() + "\n")
            except Exception:
                pass
            try:
                to_unicode = self[PG.RESOURCES]["/Font"][fo]["/ToUnicode"]  # type:ignore
                pieces.append(to_unicode.get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1668 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by running its content stream operations
    through a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: The object to extract from; its resources are looked up on the
            object itself or, failing that, on its ``/Parent`` chain.
        pdf: The document handed to ``ContentStream`` when the content has
            to be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources or no content
        can be obtained.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            # A font raising AttributeError/TypeError here is skipped.
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is processed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is processed as Tw, Tc, T* and Tj on its successive operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric element at least as large as the threshold
                    # is rendered as a space, unless the text is empty or
                    # already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is processed as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty: nothing to terminate.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything that is not an image is extracted as a form.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Whatever happened, restart from an empty text buffer and
                # the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1831 

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The fonts are collected from the resources of this page and of every
    object reachable through the ``/Parent`` chain.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            font_dict = resources["/Font"]
            # Entries found further up the chain replace same-named ones
            # collected earlier.
            collected.update(
                (font_name, Font.from_font_resource(font_dict[font_name]))
                for font_name in font_dict
            )
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected

1857 

1858 def _layout_mode_text( 

1859 self, 

1860 space_vertically: bool = True, 

1861 scale_weight: float = 1.25, 

1862 strip_rotated: bool = True, 

1863 debug_path: Optional[Path] = None, 

1864 font_height_weight: float = 1, 

1865 ) -> str: 

1866 """ 

1867 Get text preserving fidelity to source PDF text layout. 

1868 

1869 Args: 

1870 space_vertically: include blank lines inferred from y distance + font 

1871 height. Defaults to True. 

1872 scale_weight: multiplier for string length when calculating weighted 

1873 average character width. Defaults to 1.25. 

1874 strip_rotated: Removes text that is rotated w.r.t. to the page from 

1875 layout mode output. Defaults to True. 

1876 debug_path (Path | None): if supplied, must target a directory. 

1877 creates the following files with debug information for layout mode 

1878 functions if supplied: 

1879 - fonts.json: output of self._layout_mode_fonts 

1880 - tjs.json: individual text render ops with corresponding transform matrices 

1881 - bts.json: text render ops left justified and grouped by BT/ET operators 

1882 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) 

1883 Defaults to None. 

1884 font_height_weight: multiplier for font height when calculating 

1885 blank lines. Defaults to 1. 

1886 

1887 Returns: 

1888 str: multiline string containing page text in a fixed width format that 

1889 closely adheres to the rendered layout in the source pdf. 

1890 

1891 """ 

1892 fonts = self._layout_mode_fonts() 

1893 if debug_path: # pragma: no cover 

1894 import json # noqa: PLC0415 

1895 

1896 debug_path.joinpath("fonts.json").write_text( 

1897 json.dumps(fonts, indent=2, default=asdict), 

1898 "utf-8" 

1899 ) 

1900 

1901 ops = iter( 

1902 ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations 

1903 ) 

1904 bt_groups = _layout_mode.text_show_operations( 

1905 ops, fonts, strip_rotated, debug_path 

1906 ) 

1907 

1908 if not bt_groups: 

1909 return "" 

1910 

1911 ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path) 

1912 

1913 char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight) 

1914 

1915 return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight) 

1916 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text drawn on the page, following the order of the text
    drawing commands in the content stream.

    The result quality depends on the generator of the PDF file: it is good
    for some files and poor for others. Do not rely on the order of the
    returned text; it may change when this function is refined.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Visitor functions may be supplied to be informed about every operation
    and every text object, which can help e.g. when parsing tables.

    Positional arguments are still accepted: either ``orientations`` followed
    by ``space_width``, or a leading string, in which case the first two
    positional values are skipped and ``orientations`` and ``space_width``
    are read from the third and fourth ones.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: a positional argument has an unsupported type.

    """
    if extraction_mode not in ["plain", "layout"]:
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported in layout mode: warn for each one given.
        given_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in given_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        # Select the positional values holding (orientations, space_width).
        leading = args[0]
        if isinstance(leading, str):
            # A leading string: the first two positional values are not used.
            positional = args[2:4]
        elif isinstance(leading, (tuple, int)):
            positional = args[:2]
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

        if len(positional) >= 1:
            if not isinstance(positional[0], (tuple, int)):
                raise TypeError(f"Invalid positional parameter {positional[0]}")
            orientations = positional[0]
        if len(positional) >= 2:
            if not isinstance(positional[1], (float, int)):
                raise TypeError(f"Invalid positional parameter {positional[1]}")
            space_width = positional[1]

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2052 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    Args:
        xform: the XObject stream whose text is extracted; it is handed to
            ``_extract_text`` as the object to read, with no content key.
        orientations: orientations of the text to keep. Defaults to
            (0, 90, 180, 270), the same set as :meth:`extract_text`.
            The previous default listed 360 in place of 180.
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: function called before processing an operation,
            with four arguments (see :meth:`extract_text`).
        visitor_operand_after: function called after processing an operation,
            with four arguments (see :meth:`extract_text`).
        visitor_text: function called when some text is extracted,
            with five arguments (see :meth:`extract_text`).

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2087 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    The page dictionary is walked with ``_get_fonts_walk``; every font that
    was found but is not reported as embedded counts as unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2103 

# Page boundary boxes, each built by _create_rectangle_accessor from the box's
# dictionary key and a tuple of other box keys.
# NOTE(review): the second argument looks like an ordered list of fallback keys
# used when the box itself is absent (the cropbox docstring states that it
# defaults to the media box) — confirm against _create_rectangle_accessor,
# which is not visible from here.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2135 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The "/Annots" array of the page, or None when the page has no such entry.
    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Assigning None removes the "/Annots" entry (nothing happens when the
    entry is already absent); any other value replaces the entry.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2157 

2158 

class _VirtualList(Sequence[PageObject]):
    """
    Lazy, list-like view over pages.

    No page is stored in the sequence itself: its length and its items are
    obtained on demand from the two callables given at construction.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page at a (non-negative) position.
        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or for a slice a new lazy view over the selected pages.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: the position is outside the sequence.
        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The sub-view resolves its items through this view on access.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page tree.

        The page reference is removed from the "/Kids" of its parent, whose
        "/Count" (when present) is decremented. A parent left without kids is
        removed from its own parent in turn.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: the position is outside the sequence.
            PdfReadError: the page is not listed in the "/Kids" of its parent.
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first, so that the
            # positions still to be processed are not shifted
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        # NOTE(review): once a node still has kids left, the loop stops at the
        # next iteration, so "/Count" of the nodes above it is not decremented
        # here — confirm whether this is handled elsewhere.
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                if first:
                    # Only the page itself has an entry in the flattened list.
                    # Later iterations prune emptied tree nodes and must not
                    # drop a second, unrelated page at the same position.
                    try:
                        assert ind is not None
                        del ind.pdf.flattened_pages[index]  # case of page in a Reader
                    except Exception:  # pragma: no cover
                        pass
                first = False
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2251 

2252 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The fonts declared under "/DR" and "/Resources" of ``obj`` are inspected.
    The walk then recurses into the XObjects of "/Resources", the entries of
    "/Annots" and the normal appearance ("/AP" -> "/N"), which is either a
    single XObject or a dictionary of them.

    Args:
        obj: dictionary to inspect (its "/DR", "/Resources", "/Annots" and
            "/AP" entries are examined)
        fnt: set receiving the names of all fonts found; updated in place
        emb: set receiving the names of the embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        # True when the font's descriptor holds one of the font program keys.
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        # Record one font in fnt and, when it is embedded, in emb.
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            # The descriptor is looked up on the first descendant font.
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = has_font_file(descendant)

        if is_embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No base font name: fall back to the subtype in parentheses.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate the font dictionaries themselves: iterating the mapping
        # directly would only yield the font resource names.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" maps names to appearance objects: walk each object
            # (resolved), not the name it is stored under.
            for a in normal.values():
                _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    return fnt, emb  # return the sets for each page