Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

915 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from copy import deepcopy 

33from dataclasses import dataclass 

34from decimal import Decimal 

35from io import BytesIO 

36from pathlib import Path 

37from typing import ( 

38 Any, 

39 Callable, 

40 Literal, 

41 Optional, 

42 Union, 

43 cast, 

44 overload, 

45) 

46 

47from ._cmap import ( 

48 build_char_map, 

49) 

50from ._protocols import PdfCommonDocProtocol 

51from ._text_extraction import ( 

52 _layout_mode, 

53) 

54from ._text_extraction._text_extractor import TextExtraction 

55from ._utils import ( 

56 CompressedTransformationMatrix, 

57 TransformationMatrixType, 

58 _human_readable_bytes, 

59 deprecate, 

60 logger_warning, 

61 matrix_multiply, 

62) 

63from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

64from .constants import AnnotationDictionaryAttributes as ADA 

65from .constants import ImageAttributes as IA 

66from .constants import PageAttributes as PG 

67from .constants import Resources as RES 

68from .errors import PageSizeNotDefinedError, PdfReadError 

69from .generic import ( 

70 ArrayObject, 

71 ContentStream, 

72 DictionaryObject, 

73 EncodedStreamObject, 

74 FloatObject, 

75 IndirectObject, 

76 NameObject, 

77 NullObject, 

78 NumberObject, 

79 PdfObject, 

80 RectangleObject, 

81 StreamObject, 

82 is_null_or_none, 

83) 

84 

# Pillow is an optional dependency. When it cannot be imported, ``Image`` is
# bound to ``object`` so that annotations using it still resolve, and
# ``pil_not_imported`` remembers the failure for code that needs images.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

94 

95 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the box stored under ``name`` as a ``RectangleObject``.

    When the entry is missing or null, the keys in ``defaults`` are tried in
    order and the first one for which ``self.get`` returns something other
    than ``None`` is used. Indirect references are resolved through
    ``self.pdf``. The resulting rectangle is stored back under ``name``.

    Args:
        self: Dictionary-like owner of the boxes (exposes ``get`` and ``pdf``).
        name: Key of the requested box.
        defaults: Keys to fall back to, in priority order.

    Returns:
        The rectangle found (or built) for ``name``.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Already in its final form: nothing to convert or cache.
        return box
    if is_null_or_none(box):
        for fallback_key in defaults:
            box = self.get(fallback_key)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

110 

111 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under ``name`` wrapped as a ``NameObject`` key."""
    key = NameObject(name)
    self[key] = value

114 

115 

def _delete_rectangle(self: Any, name: str) -> None:
    """
    Remove the entry stored under ``name`` from ``self``.

    Any error raised by the container for an unknown key is propagated.
    """
    del self[name]

118 

119 

def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
    """
    Build a ``property`` giving access to the box stored under ``name``.

    Reading delegates to ``_get_rectangle`` (with ``fallback`` as the default
    keys), assignment to ``_set_rectangle`` and deletion to
    ``_delete_rectangle``.

    Args:
        name: Key of the box the property exposes.
        fallback: Keys consulted when ``name`` is absent.

    Returns:
        A property with getter, setter and deleter.

    """

    def read_box(self: Any) -> Any:
        return _get_rectangle(self, name, fallback)

    def write_box(self: Any, value: Any) -> None:
        return _set_rectangle(self, name, value)

    def drop_box(self: Any) -> None:
        return _delete_rectangle(self, name)

    return property(read_box, write_box, drop_box)

126 

127 

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand the six stored coefficients into the full 3-by-3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        coefficients = self.ctm
        first_row = (coefficients[0], coefficients[1], 0)
        second_row = (coefficients[2], coefficients[3], 0)
        third_row = (coefficients[4], coefficients[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 matrix to its six variable coefficients.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f); the third column is dropped.

        """
        first_row, second_row, third_row = matrix[0], matrix[1], matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Render the six coefficients with four decimals, followed by the
        # ``cm`` operator name.
        rendered = [f"{self.ctm[position]:.4f}" for position in range(6)]
        return " ".join(rendered) + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` whose matrix is this matrix multiplied
            by the matrix of ``m``.

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` with ``tx`` and ``ty`` added to the
            last two coefficients.

        """
        a, b, c, d, e, f = (self.ctm[position] for position in range(6))
        return Transformation(ctm=(a, b, c, d, e + tx, f + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis. Defaults to ``sy``.
            sy: The scale factor along the y-axis. Defaults to ``sx``.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        # A single factor is applied to both axes.
        x_factor = sy if sx is None else sx
        y_factor = sx if sy is None else sy
        assert x_factor is not None
        assert y_factor is not None
        scaling: TransformationMatrixType = (
            (x_factor, 0, 0),
            (0, y_factor, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine, sine = math.cos(angle), math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'), as a list when ``pt`` is a list
            and as a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x, y = float(pt[0]), float(pt[1])
        coefficients = self.ctm
        transformed = (
            convert(x * coefficients[0] + y * coefficients[2] + coefficients[4]),
            convert(x * coefficients[1] + y * coefficients[3] + coefficients[5]),
        )
        if isinstance(pt, list):
            return list(transformed)
        return transformed

322 

323 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _swap_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with its last ``.suffix`` replaced by ``extension``.

        A name without any dot is kept whole and ``extension`` is appended;
        slicing with ``rfind`` would drop its last character in that case.
        """
        stem, dot, _old_suffix = name.rpartition(".")
        return (stem if dot else name) + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image stream
        of that page is stored in place of the current object in the owning
        document, and ``name``, ``data`` and ``image`` are refreshed from it.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # Only objects exposing ``_id_translated`` are accepted as owners.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let pillow produce a PDF and borrow the image stream of its page.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference

        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        self.name = self._swap_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

416 

417 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The sequence is lazy: identifiers come from ``ids_function`` and each
    image is built on demand by ``get_function``. ``ids_function`` is called
    once per operation, because it may be expensive.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        return [
            (image_id, self.get_function(image_id))
            for image_id in self.ids_function()
        ]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        Args:
            index: An identifier (string, list or tuple), a position
                (negative values count from the end) or a slice.

        Returns:
            One image, or a new lazy sequence of the same type for a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If a position is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Lookup by identifier does not need the list of identifiers.
            return self.get_function(index)
        if isinstance(index, slice):
            ids = self.ids_function()
            selected = [ids[pos] for pos in range(*index.indices(len(ids)))]
            return type(self)((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        ids = self.ids_function()
        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once instead of once per yielded element.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

479 

480 

481class PageObject(DictionaryObject): 

482 """ 

483 PageObject represents a single page within a PDF file. 

484 

485 Typically these objects will be created by accessing the 

486 :attr:`pages<pypdf.PdfReader.pages>` property of the 

487 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

488 also possible to create an empty page with the 

489 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

490 

491 Args: 

492 pdf: PDF file the page belongs to. 

493 indirect_reference: Stores the original indirect reference to 

494 this object in its source PDF 

495 

496 """ 

497 

498 original_page: "PageObject" # very local use in writer when appending 

499 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, copying the referenced dictionary when one is given.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page in its source PDF; when it
            is neither ``None`` nor null, the entries of the referenced
            object are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled on first use by the image accessors.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source = not is_null_or_none(indirect_reference)
    if has_source:
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

513 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: this function is overloaded to return the same results
    as a DictionaryObject.

    Returns:
        Hash of ``DictionaryObject`` together with every key and the
        ``hash_bin()`` of its value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

528 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by ``id(self)`` in decimal."""
    identity_suffix = f"{id(self)}".encode()
    return super().hash_value_data() + identity_suffix

533 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    It is in multiples of 1/72 inch. Hence a value of 1 means a user
    space unit is 1/72 inch, and a value of 3 means that a user
    space unit is 3/72 inch.

    Falls back to 1 when the page has no such entry.
    """
    unit = self.get(PG.USER_UNIT, 1)
    return unit

544 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the media
    box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

590 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Image XObjects are reported by name, or by ``[*ancest, name]`` when they
    sit inside form XObjects, which are explored recursively. The keys of
    the page's inline images are appended once, by the outermost call only.

    Args:
        obj: Dictionary whose resources are scanned; defaults to the page.
        ancest: Names of the enclosing form XObjects, outermost first.
        call_stack: Indirect references already being visited; used to
            stop on circular form references.

    Returns:
        The image identifiers, XObject images first and inline images last.

    """
    if call_stack is None:
        call_stack = []
    # A form that (directly or not) contains itself must not recurse forever.
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # Inline images come from the page's own content (``_get_inline_images``),
    # not from the form being scanned. Adding them in every recursive call
    # listed each of them once per visited form.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        candidate = x_object[o]
        if not isinstance(candidate, StreamObject):
            continue
        if candidate[IA.SUBTYPE] == "/Image":
            lst.append(o if not ancest else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(candidate, [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

628 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: Name of an image XObject, ``~N~`` key of an inline image, or a
            list/tuple path through nested form XObjects ending with the
            image name.
        obj: Dictionary whose resources are searched; defaults to the page.

    Returns:
        The requested image.

    Raises:
        KeyError: If the resources or the requested entry cannot be found.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        # A path with a single element is just a name.
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
        )
    except KeyError:
        # Missing XObject resources are tolerated only for inline images.
        if id[0] != "~" or id[-1] != "~":
            raise

    if not isinstance(id, str):
        # in a subobject
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415

    imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    extension, byte_stream = imgd[:2]
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=imgd[2],
        indirect_reference=xobjs[id].indirect_reference,
    )

667 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage (the page has to belong to a writer, `replace` raises
    a `TypeError` otherwise):

        writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    load_image = self._get_image
    return VirtualListImages(list_ids, load_image)

707 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    A value found in ``_INLINE_IMAGE_VALUE_MAPPING`` is replaced by its
    mapped name. Any other ``NameObject`` is looked up in the page's
    ``/Resources`` ``/ColorSpace`` dictionary. Everything else is returned
    unchanged.

    Args:
        k: Key the value belongs to (only used in the error message).
        v: Value to translate.

    Returns:
        The translated value.

    Raises:
        PdfReadError: If a custom name has no matching resource entry.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources lookup and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

722 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A dictionary mapping ``~N~`` (N being the position of the image
        among the inline images of the content) to the decoded image; empty
        when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    collected = []
    assert content is not None, "mypy"
    # First pass: gather every inline image before decoding any of them.
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each image into a stream object and decode it.
    images = {}
    for position, (settings, stream_data) in enumerate(collected):
        stream_dict = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, raw_value in settings.items():
            if short_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [
                        self._translate_value_inline_image(short_key, item)
                        for item in raw_value
                    ]
                )
            else:
                value = self._translate_value_inline_image(short_key, raw_value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key])
            # The first occurrence of a key wins.
            if full_key not in stream_dict:
                stream_dict[full_key] = value
        stream = EncodedStreamObject.initialize_from_dictionary(stream_dict)
        from ._xobj_image_helpers import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream)
        images[f"~{position}~"] = ImageFile(
            name=f"~{position}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images

768 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Reading returns 0 when no rotation is stored. Assigning truncates the
    value to an integer, snaps it to the nearest multiple of 90 and wraps
    the result modulo 360.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

783 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0. The content is rotated around the
    centre of the media box and shifted so that the rotated media box starts
    at the origin. Every box present on the page is recomputed accordingly.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)
    center_x = float(media_box.left + media_box.width / 2)
    center_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)

    # Shift the rotated media box back so its lowest corner is at the origin.
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        # The transformed corners may be swapped; reorder them.
        left = min(corner_a[0], corner_b[0])
        bottom = min(corner_a[1], corner_b[1])
        right = max(corner_a[0], corner_b[0])
        top = max(corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject((left, bottom, right, top))

818 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page and stored as is.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    is_quarter_turn = angle % 90 == 0
    if not is_quarter_turn:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = NumberObject(self.rotation + angle)
    self[NameObject(PG.ROTATE)] = new_rotation
    return self

834 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Entries of ``res2`` whose name is already used with another value are
    stored under a new name (``name-0``, ``name-1``, ...).

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, work on a fresh copy of the category of ``res1``;
            otherwise update ``res1[resource]`` in place.

    Returns:
        A tuple (merged resources sorted by key, mapping from the original
        names in ``res2`` to the new names they were given).

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = incoming.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. merged can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in merged:
            if merged.raw_get(candidate) == wanted:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        merged = DictionaryObject()
        merged.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        merged = cast(DictionaryObject, res1[resource])
    incoming = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    renamed = {}
    for key in incoming:
        unique_key, already_present = compute_unique_key(key)
        target = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            renamed[key] = target
        if already_present:
            continue
        if not is_pdf_writer:
            merged[target] = incoming.raw_get(key)
            continue
        merged[target] = incoming.raw_get(key).clone(pdf)
        try:
            # Prefer the reference of the clone when it has one.
            merged[target] = merged[target].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries are ordered by key.
    ordered = sorted(merged.items())
    merged.clear()
    for name, entry in ordered:
        merged[name] = entry
    return merged, renamed

915 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping from current names to new names.
        pdf: Document passed to the ``ContentStream`` built for the result.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the operands renamed.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a
            dictionary.

    """
    if not rename:
        return stream
    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return renamed_stream

937 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: Content wrapped into a new ``ContentStream``.
        pdf: Document passed to that ``ContentStream``.
        ctm: Coefficients of the matrix, each stored as a ``FloatObject``.

    Returns:
        The new content stream, starting with the ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return stream

954 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the raw bytes of the page's ``/Contents`` entry.

    When the resolved entry is a list, the data of every resolved element
    is concatenated in order.

    Returns:
        The ``/Contents`` data as bytes, or ``None`` if the page has no
        ``/Contents`` entry.

    """
    if PG.CONTENTS not in self:
        return None

    contents = self[PG.CONTENTS].get_object()
    if not isinstance(contents, list):
        return cast(EncodedStreamObject, contents).get_data()

    chunks = [part.get_object().get_data() for part in contents]
    return b"".join(chunks)

969 

def get_contents(self) -> Optional[ContentStream]:
    """
    Return the page contents wrapped in a ``ContentStream``.

    Returns:
        A ``ContentStream`` built from the resolved ``/Contents`` object,
        or ``None`` if the entry is missing or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None

    # A page without a usable indirect reference has no owning document.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None

    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

990 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an ``indirect_reference`` the content is stored
    directly under ``/Contents``. Otherwise the object table of the owning
    document (``self.indirect_reference.pdf._objects``) is updated, and a
    deprecation notice is emitted when that document is not a ``PdfWriter``.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally (see the noqa marker); used only for the isinstance check below.
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    # The current contents are an array: overwrite each referenced element
    # in the document's object table with a NullObject.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                self.indirect_reference.pdf._objects[
                    o.indirect_reference.idnum - 1
                ] = NullObject()
            except AttributeError:
                # Elements without a usable indirect reference are skipped.
                pass

    # A new array of contents: register every element with the document and
    # keep what ``_add_object`` returns for each of them.
    if isinstance(content, ArrayObject):
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Removal: nothing to do when there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Null the old contents object in the object table, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing contents object with an ``indirect_reference`` attribute:
        # add the new content to the document as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the indirect reference of the existing contents so the new
        # content takes over the same slot in the object table.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1059 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content stream of another page with this page's content.

    Resource references (e.g. fonts) of both pages are kept. The boxes of
    this page (mediabox, cropbox, etc.) stay as they are unless ``expand``
    is set. By default the content of ``page2`` is appended after this
    page's content, so it is drawn afterwards, i.e. "on top" of this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation is involved; delegate to the shared implementation.
    self._merge_page(page2, expand=expand, over=over)

1081 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page; shared implementation of the ``merge_*`` methods.

    When the document behind ``self.indirect_reference`` exposes
    ``_add_object`` (used here to detect a ``PdfWriter``), the work is
    delegated to ``_merge_page_writer``. Otherwise new resources, annotations
    and contents are built and assigned to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Transformation matrix forwarded to ``_expand_mediabox`` (and to
            ``_merge_page_writer``).
        over: Append the content of ``page2`` after this page's content if
            True, otherwise insert it before.
        expand: If True, expand this page's mediabox via ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    # NOTE(review): the ``try`` also covers the ``_merge_page_writer`` call, so
    # an AssertionError/AttributeError raised inside it falls through to the
    # code below as well.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    # Collect the annotations of both pages (only when stored as an array).
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" built from that box. The operands are stored as a
        # real list: a ``map`` object would be exhausted after one traversal.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots

1188 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources of ``page2`` are merged into this page's ``/Resources``,
    annotations of ``page2`` are cloned into the document behind
    ``self.indirect_reference`` with ``/Rect`` and ``/QuadPoints`` transformed
    by ``ctm``, and the content streams are combined via ``replace_contents``.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Transformation matrix applied to the cloned annotations and
            forwarded to ``_expand_mediabox``; ``None`` uses a default
            ``Transformation()``.
        over: Append the content of ``page2`` after this page's content if
            True, otherwise insert it before.
        expand: If True, expand this page's mediabox via ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and re-normalise them so the
            # rectangle stays (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints may describe several quadrilaterals (8 numbers
                # each): transform every (x, y) pair instead of only the
                # first four points, so no quadrilateral is dropped.
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    coordinate
                    for i in range(0, len(q) - 1, 2)
                    for coordinate in trsf.apply_on((q[i], q[i + 1]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                # The annotation has no /Popup entry.
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Prepend "re W n" built from that box. The operands are stored as a
        # real list: a ``map`` object would be exhausted after one traversal.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, b"re"))
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1326 

1327 def _expand_mediabox( 

1328 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] 

1329 ) -> None: 

1330 corners1 = ( 

1331 self.mediabox.left.as_numeric(), 

1332 self.mediabox.bottom.as_numeric(), 

1333 self.mediabox.right.as_numeric(), 

1334 self.mediabox.top.as_numeric(), 

1335 ) 

1336 corners2 = ( 

1337 page2.mediabox.left.as_numeric(), 

1338 page2.mediabox.bottom.as_numeric(), 

1339 page2.mediabox.left.as_numeric(), 

1340 page2.mediabox.top.as_numeric(), 

1341 page2.mediabox.right.as_numeric(), 

1342 page2.mediabox.top.as_numeric(), 

1343 page2.mediabox.right.as_numeric(), 

1344 page2.mediabox.bottom.as_numeric(), 

1345 ) 

1346 if ctm is not None: 

1347 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] 

1348 new_x = tuple( 

1349 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] 

1350 for i in range(0, 8, 2) 

1351 ) 

1352 new_y = tuple( 

1353 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] 

1354 for i in range(0, 8, 2) 

1355 ) 

1356 else: 

1357 new_x = corners2[0:8:2] 

1358 new_y = corners2[1:8:2] 

1359 lowerleft = (min(new_x), min(new_y)) 

1360 upperright = (max(new_x), max(new_y)) 

1361 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) 

1362 upperright = ( 

1363 max(corners1[2], upperright[0]), 

1364 max(corners1[3], upperright[1]), 

1365 ) 

1366 

1367 self.mediabox.lower_left = lowerleft 

1368 self.mediabox.upper_right = upperright 

1369 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge another page into this one after applying a transformation matrix
    to the merged stream; otherwise like
    :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` is also accepted,
            in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def transform_page2_content(page2_content: Any) -> ContentStream:
        # Prefix the content of page2 with the "cm" operation for ``matrix``.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, transform_page2_content, matrix, over, expand)

1401 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge another page into this one after scaling its stream uniformly;
    otherwise like :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)

1419 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge another page into this one after rotating its stream;
    otherwise like :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    rotation_matrix = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, rotation_matrix, over, expand)

1441 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge another page into this one after translating its stream;
    otherwise like :meth:`~pypdf._page.PageObject.merge_page`.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    translation_matrix = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, translation_matrix, over, expand)

1465 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Transform the page content with the given matrix.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    # Corners in the order lower-left, upper-left, upper-right, lower-right.
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1515 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale the page by independent horizontal and vertical factors.

    A scaling matrix is applied to the content, then the page boundaries
    (bleedbox, trimbox, artbox, cropbox, mediabox), the ``/Rect`` of
    annotations and the ``/BBox`` of the viewport entry are scaled too.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # The order of the boxes is kept as is.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                # x coordinates use sx, y coordinates use sy.
                for index, factor in enumerate((sx, sy, sx, sy)):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        # For an array only its first entry is considered.
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if viewport_is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1569 

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly: the same factor is passed to :meth:`scale`
    for the horizontal and the vertical axis.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1580 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page so that its mediabox gets the requested dimensions.

    The factors are the requested size divided by the current mediabox
    size; they are then passed to :meth:`scale`.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1594 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Join all content streams of this page and store them flate-encoded.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Passed to ``flate_encode`` of the page content.

    Raises:
        ValueError: If the encoded content can neither be stored through the
            content's own indirect reference nor through ``replace_contents``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the existing object slot of the content, if it has one.
        content_ref = content.indirect_reference
        content_ref.pdf._objects[content_ref.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_ref = self.indirect_reference
        if page_ref is None or not hasattr(page_ref.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1617 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the index of this page in the ``pages`` of
    the document it belongs to.

    Returns:
        Page number; None if the page is not attached to a PDF.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # The page is not listed in the pages of its document.
        return None

1634 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    The dump lists every operation of ``/Contents`` followed, after a
    separator, by each font of the page resources with its representation
    and, when available, its ``/Encoding`` and ``/ToUnicode`` data.

    Returns:
        The dump as a single string; ``"No Font\\n"`` is appended when a
        ``KeyError`` occurs while reading the fonts.

    """
    pieces: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # For TJ the string parts of the first operand are shown as well.
        shown = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(
            op.decode("utf-8") + " " + "".join(shown) + ope.__repr__() + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            pieces.append(fo + "\n")
            pieces.append(self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n")  # type:ignore
            try:
                encoding = self[PG.RESOURCES]["/Font"][fo]["/Encoding"]  # type:ignore
                pieces.append(encoding.__repr__() + "\n")
            except Exception:
                pass
            try:
                to_unicode = self[PG.RESOURCES]["/Font"][fo]["/ToUnicode"]  # type:ignore
                pieces.append(to_unicode.get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1672 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by feeding the operations of its content
    stream to a ``TextExtraction`` instance.

    See extract_text for most arguments.

    Args:
        obj: object searched for resources (following ``/Parent`` links)
            and, when ``content_key`` is a string, indexed with it to get
            the content.
        pdf: passed to ``ContentStream`` when the content has to be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources can be found
        or the content cannot be read.

    """
    extractor = TextExtraction()
    # One entry per font name, as returned by build_char_map.
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # Fonts for which build_char_map raises TypeError are skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as "T*" followed by "Tj".
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as "Tw", "Tc", "T*" and "Tj" on the remaining operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric element at least as large as the threshold
                    # adds a space, unless the text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as "TL" with the negated second operand, then "Td".
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # XObjects whose /Subtype is not "/Image" are extracted as forms.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                # Any failure is only reported as a warning.
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and memorize the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output

1830 

def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Collect the fonts of this page for "layout" mode text extraction.

    The ``/Resources`` of the page and of each object reached through
    ``/Parent`` are inspected.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def _resolve(value: Any) -> Any:
        # Indirect objects are resolved; arrays become lists of resolved items.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [item.get_object() for item in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                font_dict = {
                    key: _resolve(value) for key, value in font_dict_obj.items()
                }
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return fonts

1866 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Extract the page text while keeping close to the layout of the source PDF.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(
            fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    content_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    operations = iter(content_stream.operations)
    bt_groups = _layout_mode.text_show_operations(
        operations, fonts, strip_rotated, debug_path
    )
    # Without any text-showing group there is nothing to lay out.
    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1927 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Positional arguments are accepted in two shapes (plain mode only):
    ``(orientations[, space_width])``, or two leading values starting with a
    string followed by ``[orientations[, space_width]]``; in the latter shape
    the first two values are not used by this method.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            A single int is wrapped into a one-element tuple.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if extraction_mode is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type.

    """
    if extraction_mode not in ["plain", "layout"]:
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors cannot be honoured in layout mode; tell the caller about
        # each one that was supplied.
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in supplied_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        # A leading string means orientations/space_width sit at positions
        # 2 and 3; otherwise they sit at positions 0 and 1.
        if isinstance(args[0], str):
            shift = 2
        elif isinstance(args[0], (tuple, int)):
            shift = 0
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

        if len(args) > shift:
            if not isinstance(args[shift], (tuple, int)):
                raise TypeError(f"Invalid positional parameter {args[shift]}")
            orientations = args[shift]
        if len(args) > shift + 1:
            if not isinstance(args[shift + 1], (float, int)):
                raise TypeError(f"Invalid positional parameter {args[shift + 1]}")
            space_width = args[shift + 1]

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2063 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The work is delegated to ``self._extract_text`` with ``xform`` as the
    object to read, ``self.pdf`` as the document and ``None`` as the content
    key.

    Args:
        xform: the XObject stream to extract text from
        orientations: orientations passed through to the extractor
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: passed through to the extractor
        visitor_operand_after: passed through to the extractor
        visitor_text: passed through to the extractor

    Returns:
        The extracted text

    """
    # NOTE(review): the default orientations are (0, 90, 270, 360) whereas
    # extract_text defaults to (0, 90, 180, 270) - confirm whether 360 is a
    # typo for 180 before relying on upside-down text being extracted here.
    content_key = None
    extracted = self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        content_key,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
    return extracted

2098 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Report which fonts used by this page are embedded and which are not.

    The page dictionary is handed to ``_get_fonts_walk`` together with two
    empty sets; the unembedded fonts are all fonts found minus the embedded
    ones.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    seen: set[str] = set()
    with_font_file: set[str] = set()
    seen, with_font_file = _get_fonts_walk(page_dict, seen, with_font_file)
    return with_font_file, seen - with_font_file

2114 

# Page boundary boxes. Each accessor is created from the box's own key plus a
# tuple of further keys - presumably the fallbacks consulted, in order, when
# the box itself is absent (TODO confirm against _create_rectangle_accessor).

# The media box has no further keys.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

# Further key: media box.
cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# Further keys: crop box, then media box.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

# Further keys: crop box, then media box.
trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

# Further keys: crop box, then media box.
artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2146 

@property
def annotations(self) -> Optional[ArrayObject]:
    """Return the page's "/Annots" array, or None when the page has none."""
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Passing None removes an existing "/Annots" entry and does nothing when
    there is none.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2168 

2169 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of a stored list.

    ``length_function`` reports the current number of pages and
    ``get_function`` fetches the page at a non-negative index. Slicing
    returns another virtual list that maps onto this one.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a virtual list for a slice.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if an int index is out of range.
        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The sliced view resolves its positions through this list.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page (or a slice of pages) from the page tree.

        The page reference is removed from its parent's "/Kids" and the
        parent's "/Count" is decremented. When a node is left without kids it
        is removed from its own parent in turn.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if an int index is out of range.
            PdfReadError: if the page is not listed in its parent's "/Kids".
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                if first:
                    # Drop the page from the flattened list exactly once.
                    # Doing it again while pruning emptied ancestors would
                    # remove an unrelated page at the same position.
                    first = False
                    try:
                        del ind.pdf.flattened_pages[index]  # case of page in a Reader
                    except Exception:  # pragma: no cover
                        # Best effort: the owner may have no flattened list.
                        pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
                # Otherwise the same node is visited again, the lookup below
                # fails and the loop ends through the ValueError branch.
                # NOTE(review): ancestors above the first node that keeps
                # kids do not get their "/Count" decremented here - confirm
                # whether that is handled elsewhere.
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2262 

2263 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Args:
        obj: dictionary to inspect (a page, form XObject or annotation);
            its "/DR", "/Resources", "/Annots" and "/AP" entries are searched
        fnt: set collecting the names of all fonts found (updated in place)
        emb: set collecting the names of embedded fonts (updated in place)

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has '/CharProcs', or when its font
    descriptor (or the descriptor of its first descendant font) has a
    'FontFilex' key (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the font's "/FontDescriptor" carries a font file key."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: Any) -> None:
        """Record one font (possibly an indirect reference) in fnt / emb."""
        f = f.get_object()  # to be sure
        if not isinstance(f, DictionaryObject):
            # Not a font dictionary (e.g. an unresolvable reference).
            return
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = "/CharProcs" in f or has_font_file(f)
        if not embedded and "/DescendantFonts" in f:
            first_descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            embedded = has_font_file(first_descendant)

        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No base font name: fall back to the font subtype.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        default_fonts = cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        )
        # Iterate the font dictionaries, not their resource names.
        for f in default_fonts.values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" is a dictionary of entries: walk each entry's target
            # rather than its key, skipping values that are not dictionaries.
            for state in normal.values():
                state_obj = state.get_object()
                if isinstance(state_obj, DictionaryObject):
                    _get_fonts_walk(state_obj, fnt, emb)
    return fnt, emb  # return the sets for each page