Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

913 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from dataclasses import dataclass 

33from decimal import Decimal 

34from io import BytesIO 

35from pathlib import Path 

36from typing import ( 

37 Any, 

38 Callable, 

39 Literal, 

40 Optional, 

41 Union, 

42 cast, 

43 overload, 

44) 

45 

46from ._cmap import ( 

47 build_char_map, 

48) 

49from ._protocols import PdfCommonDocProtocol 

50from ._text_extraction import ( 

51 _layout_mode, 

52) 

53from ._text_extraction._text_extractor import TextExtraction 

54from ._utils import ( 

55 CompressedTransformationMatrix, 

56 TransformationMatrixType, 

57 _human_readable_bytes, 

58 logger_warning, 

59 matrix_multiply, 

60) 

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

62from .constants import AnnotationDictionaryAttributes as ADA 

63from .constants import ImageAttributes as IA 

64from .constants import PageAttributes as PG 

65from .constants import Resources as RES 

66from .errors import PageSizeNotDefinedError, PdfReadError 

67from .filters import _xobj_to_image 

68from .generic import ( 

69 ArrayObject, 

70 ContentStream, 

71 DictionaryObject, 

72 EncodedStreamObject, 

73 FloatObject, 

74 IndirectObject, 

75 NameObject, 

76 NullObject, 

77 NumberObject, 

78 PdfObject, 

79 RectangleObject, 

80 StreamObject, 

81 is_null_or_none, 

82) 

83 

# Pillow is optional: remember whether the import worked so that the
# image-related code paths can raise a helpful ImportError on demand.
try:
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Placeholder so that annotations mentioning ``Image`` still resolve.
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images

# Name of a page box, lower-case and without the leading slash.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

93 

94 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Fetch the box stored under ``name`` as a ``RectangleObject``.

    Args:
        self: Mapping-like object holding the boxes. Its ``pdf`` attribute is
            used to resolve an indirect reference.
        name: Key of the requested box.
        defaults: Keys tried in order when ``name`` is missing or null; the
            first one whose value is not ``None`` is used.

    Returns:
        The rectangle. Unless the stored value already was a
        ``RectangleObject``, the converted rectangle is written back under
        ``name`` before being returned.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Already in the final form: nothing to convert or store.
        return box
    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

109 

110 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under ``name`` wrapped in a ``NameObject``."""
    key = NameObject(name)
    self[key] = value

113 

114 

def _delete_rectangle(self: Any, name: str) -> None:
    """Remove the entry stored under ``name`` from ``self``."""
    del self[name]

117 

118 

119def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

120 return property( 

121 lambda self: _get_rectangle(self, name, fallback), 

122 lambda self, value: _set_rectangle(self, name, value), 

123 lambda self: _delete_rectangle(self, name), 

124 ) 

125 

126 

class Transformation:
    """
    A 2D transformation stored as the six numbers of a PDF matrix.

    The full 3-by-3 matrix has the form::

        a b 0
        c d 0
        e f 1

    Only six entries can vary, so the matrix is kept in the compressed form
    ``(a, b, c, d, e, f)`` in the ``ctm`` attribute.

    A point is transformed by a row-vector multiplication::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    Instances are not modified by the builder methods; each of them returns a
    new ``Transformation``.

    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The transformation expanded to a tuple of three rows:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of rows.

        Returns:
            The tuple (a, b, c, d, e, f), i.e. the first two columns of the
            three rows.

        """
        first_row, second_row, third_row = matrix[0], matrix[1], matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Text of the ``cm`` operation: six numbers with 4 decimals each.
        numbers = " ".join(f"{self.ctm[i]:.4f}" for i in range(6))
        return numbers + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        The result is the matrix product ``self.matrix × m.matrix``.

        Args:
            m: The Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the transformation by the given offsets.

        Args:
            tx: Offset added to the ``e`` entry (x-axis).
            ty: Offset added to the ``f`` entry (y-axis).

        Returns:
            A new ``Transformation`` instance

        """
        ctm = self.ctm
        shifted_e = ctm[4] + tx
        shifted_f = ctm[5] + ty
        return Transformation(ctm=(ctm[0], ctm[1], ctm[2], ctm[3], shifted_e, shifted_f))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale relative to the origin of the coordinate system.

        When only one factor is given, it is used for both axes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, scaling))
        )

    def rotate(self, rotation: float) -> "Transformation":
        """
        Append a rotation to the transformation.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return Transformation(
            Transformation.compress(matrix_multiply(self.matrix, turning))
        )

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point.

        Args:
            pt: The point as a tuple or list in the form (x, y).
            as_object: If True, the coordinates are returned as FloatObject,
                otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` is a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        ctm = self.ctm
        x = float(pt[0])
        y = float(pt[1])
        new_x = convert(x * ctm[0] + y * ctm[2] + ctm[4])
        new_y = convert(x * ctm[1] + y * ctm[3] + ctm[5])
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

321 

322 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    @staticmethod
    def _rename_with_extension(name: str, extension: str) -> str:
        """
        Return ``name`` with everything after its last dot replaced.

        Args:
            name: Current file name, with or without an extension.
            extension: New extension, including its leading dot.

        Returns:
            The part of ``name`` before the last dot followed by
            ``extension``. A name without any dot is kept in full; slicing
            with ``rfind`` would drop its last character in that case.

        """
        stem, dot, _ = name.rpartition(".")
        return (stem if dot else name) + extension

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a PDF in memory, the image object of that
        PDF replaces the one referenced by ``indirect_reference`` in the
        owning document, and ``name``, ``data`` and ``image`` of this
        instance are refreshed from the new object.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`,
                such as quality.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline (no indirect reference).
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # Only an object exposing ``_id_translated`` is accepted as owner.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        assert reader.pages[0].images[0].indirect_reference is not None
        # Swap the stored object in place: object numbers start at 1, the
        # ``_objects`` list at 0.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            reader.pages[0].images[0].indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        self.name = self._rename_with_extension(self.name, extension)
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same text as ``__str__`` with the hash of the data added before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

413 

414 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The sequence is lazy: identifiers come from ``ids_function`` and an image
    is only built, through ``get_function``, when it is requested.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        """
        Args:
            ids_function: Called without argument, returns the list of image
                identifiers.
            get_function: Called with one identifier, returns the matching
                ``ImageFile``.

        """
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return (identifier, image) pairs for all images."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image, or a sub-sequence for a slice.

        Args:
            index: A position (negative values count from the end), a slice,
                or an identifier (string, list or tuple) which is handed to
                ``get_function`` unchanged.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer position is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Slice the identifiers fetched above rather than asking
            # ``ids_function`` a second time for the length.
            selected = lst[index]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once: going through ``len(self)`` and
        # ``self[i]`` would call ``ids_function`` again for every image.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

476 

477 

478class PageObject(DictionaryObject): 

479 """ 

480 PageObject represents a single page within a PDF file. 

481 

482 Typically these objects will be created by accessing the 

483 :attr:`pages<pypdf.PdfReader.pages>` property of the 

484 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

485 also possible to create an empty page with the 

486 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

487 

488 Args: 

489 pdf: PDF file the page belongs to. 

490 indirect_reference: Stores the original indirect reference to 

491 this object in its source PDF 

492 

493 """ 

494 

495 original_page: "PageObject" # very local use in writer when appending 

496 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, optionally filled from an existing object.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page in its source PDF. When it
            is neither null nor ``None``, the entries of the referenced
            object are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Stays None until the inline images have been looked up.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if not is_null_or_none(indirect_reference):
        assert indirect_reference is not None, "mypy"
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

510 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as type marker, not
    with the page class.

    Returns:
        Hash of the type marker and of the (key, ``hash_bin()`` of the
        value) pairs of all entries.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

525 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the encoded ``id()`` of this page."""
    identity_suffix = str(id(self)).encode()
    return super().hash_value_data() + identity_suffix

530 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is 1/72 inch: a value of 1 means one user space unit is
    1/72 inch, a value of 3 means it is 3/72 inch. When the page has no
    such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

541 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the media
    box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

587 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are inspected; the page itself
            when ``None``.
        ancest: Names of the form XObjects traversed to reach ``obj``;
            empty (or ``None``) for the outermost call.
        call_stack: Indirect references already visited, used to stop on
            cyclic references between XObjects.

    Returns:
        For each image XObject its name (outermost level) or the list
        ``[*ancest, name]`` (inside forms), followed — for the outermost
        call only — by the keys of the inline images of the page.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # The inline images are those of the page, so they are reported by the
    # outermost call only. Adding them in every recursive call as well
    # listed them once more per form XObject traversed.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest and self.inline_images is not None:
        inline_ids = list(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

625 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: Name of an image XObject, an inline image key of the form
            ``~…~``, or a list/tuple of names whose last item is the image
            and whose preceding items are the enclosing XObjects.
        obj: Object whose ``/Resources`` are searched; the page itself
            when ``None``.

    Returns:
        The requested image.

    Raises:
        KeyError: If the XObject resources are missing and ``id`` does not
            look like an inline image key.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    # A path with a single item is the same as the bare name.
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Without XObject resources only an inline image can be served.
        if id[0] != "~" or id[-1] != "~":
            raise

    if not isinstance(id, str):
        # in a subobject
        ids = id[1:]
        return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    extension, byte_stream, pil_image = _xobj_to_image(cast(DictionaryObject, xobjs[id]))[:3]
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=pil_image,
        indirect_reference=xobjs[id].indirect_reference,
    )

663 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of the images of the page.

    An image can be selected with:
        - A string (for the top object)
        - A tuple (for images within XObject forms)
        - An integer

    Examples:
        * `reader.pages[0].images[0]`        # return first image
        * `reader.pages[0].images['/I0']`    # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    Each item is an ImageFile with the following properties:

        * `.name` : name of the object
        * `.data` : bytes of the object
        * `.image` : PIL Image Object
        * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

703 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The ``NameObject`` of the mapped name when ``v`` is a key of
        ``_INLINE_IMAGE_VALUE_MAPPING``; otherwise, for a ``NameObject``,
        the entry of that name in the ``/ColorSpace`` resources of the
        page; any other value unchanged.

    Raises:
        PdfReadError: If a custom name cannot be found in the resources.

    """
    try:
        translated: PdfObject = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources lookup and the name lookup
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
    return translated

718 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed ``~0~``, ``~1~``, ... in the order of their
        ``INLINE IMAGE`` operations; empty when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect (settings, raw data) of every inline image.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object, then an image.
    images = {}
    for number, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for key, value in settings.items():
            if key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(key, x) for x in value]
                )
            else:
                value = self._translate_value_inline_image(key, value)
            mapped_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[key])
            # The first value stored under a key wins.
            if mapped_key not in init:
                init[mapped_key] = value
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        images[f"~{number}~"] = ImageFile(
            name=f"~{number}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images

763 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Reading returns 0 when the page has no rotation entry. Writing rounds
    the integer part of the value to a multiple of 90 and reduces it
    modulo 360.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    # Anything else is resolved through ``get_object()`` first.
    return rotate_obj.get_object()


@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

778 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The rotation of the page is reset to 0; the content and every box
    present on the page are transformed instead.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    mb = RectangleObject(self.mediabox)

    # Rotate around the center of the media box ...
    center_x = float(mb.left + mb.width / 2)
    center_y = float(mb.bottom + mb.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(angle)

    # ... then shift so that the smallest coordinates of the two
    # transformed corners are at 0.
    corner_a = trsf.apply_on(mb.lower_left)
    corner_b = trsf.apply_on(mb.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        # The corners may have swapped roles: reorder the coordinates.
        self[NameObject(box_name)] = RectangleObject(
            (
                min(corner_a[0], corner_b[0]),
                min(corner_a[1], corner_b[1]),
                max(corner_a[0], corner_b[0]),
                max(corner_a[1], corner_b[1]),
            )
        )

813 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject (the page itself)

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

829 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of two resource dictionaries.

    Args:
        res1: Resources merged into.
        res2: Resources whose entries are added.
        resource: Key of the entry to merge in both dictionaries.
        new_res1: If True, the merge is done in a new dictionary
            initialised from ``res1[resource]``; otherwise
            ``res1[resource]`` itself is modified.

    Returns:
        A tuple (merged entries sorted by key, renaming map). The map gives,
        for each key of ``res2`` that could not keep its name, the new name.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        # No usable indirect reference: entries are copied without cloning.
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        # NOTE: page2res and new_res are bound further down in the enclosing
        # function, before this helper is called for the first time.
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                # Store the reference of the clone when it has one.
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
    # Rebuild the dictionary so that its entries are ordered by key.
    lst = sorted(new_res.items())
    new_res.clear()
    for el in lst:
        new_res[el[0]] = el[1]
    return new_res, rename_res

910 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Rename the names used as operands in a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from old to new name.
        pdf: Passed to ``ContentStream`` when the copy is created.

    Returns:
        ``stream`` itself when ``rename`` is empty; otherwise a new
        ``ContentStream`` built from it in which every ``NameObject``
        operand found in ``rename`` is replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Values are replaced in place; no entry is added or removed.
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return stream

932 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Put a ``cm`` operation at the start of a content stream.

    Args:
        contents: Content wrapped in a new ``ContentStream``.
        pdf: Passed to ``ContentStream``.
        ctm: The numbers of the matrix; each becomes a ``FloatObject``
            operand of the operation.

    Returns:
        The new ``ContentStream`` with the operation inserted first.

    """
    contents = ContentStream(contents, pdf)
    operations = contents.operations
    cm_operation = [[FloatObject(x) for x in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return contents

949 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The data of the ``/Contents`` object — for a list, the
        concatenated data of its items — or ``None`` if the page has no
        ``/Contents`` entry.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if not isinstance(obj, list):
        return cast(EncodedStreamObject, obj).get_data()
    return b"".join(x.get_object().get_data() for x in obj)

964 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the resolved ``/Contents`` object, or
        ``None`` if the key is absent or its value is null/``None``.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None

    # A page without a usable indirect reference has no owning document.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None

    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

985 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.
            An ``ArrayObject`` has each of its elements registered through
            the owning document's ``_add_object`` before being stored.

    Note:
        ``inline_images`` is reset to ``None`` at the end so it gets
        recalculated; the early-return paths (unattached page, or nothing to
        delete) leave it untouched.
    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): this indexes ``self._objects`` whereas every other
                # write in this method goes through
                # ``self.indirect_reference.pdf._objects``. If the page itself has
                # no ``_objects`` attribute, the AttributeError below swallows the
                # access and nothing is nullified - confirm which is intended.
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # Register every element with the owning document and keep what
        # ``_add_object`` returns for each of them.
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion request: nothing to do when there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Overwrite the slot of the old contents object, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # The current /Contents value (``None`` when the key is missing) has
        # no ``indirect_reference`` attribute: add the content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # Reuse the reference of the existing /Contents object so the new
        # content takes over its slot in the document's object list.
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1044 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) from both pages are kept. The
    mediabox, cropbox, etc. of this page are left untouched unless
    ``expand`` is set. By default the content stream of ``page2`` is
    appended after this page's content stream, so it is drawn after,
    i.e. "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation callback and no matrix: a plain merge.
    self._merge_page(
        page2,
        page2transformation=None,
        ctm=None,
        over=over,
        expand=expand,
    )

1066 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    Pages whose owning document exposes ``_add_object`` are handed over to
    :meth:`_merge_page_writer`. Otherwise the resources, annotations and
    content streams of both pages are combined here and written back to
    this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix; only used to compute the
            expanded mediabox when ``expand`` is True.
        over: Put the content of ``page2`` after (True) or before (False)
            this page's content.
        expand: Grow this page's mediabox so that it also covers ``page2``.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
        # NOTE(review): as reconstructed, a page attached to a document that
        # has no ``_add_object`` returns here without merging anything -
        # confirm the nesting of this return against the real file.
        return None
    except (AssertionError, AttributeError):
        pass

    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Annotation arrays of both pages are concatenated as they are.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re W n").
        # The operands are materialised as a list: a ``map`` iterator can be
        # consumed only once and is not a list, unlike every other operand
        # container handled in this file.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1180 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page when the page is attached to a document.

    Resources of ``page2`` are merged into this page's resource dictionary
    in place, its annotations are cloned into the owning document with
    their geometry transformed by ``ctm``, and the content streams of both
    pages are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional transformation matrix applied to the annotation
            geometry and used for the mediabox expansion.
        over: Put the content of ``page2`` after (True) or before (False)
            this page's content.
        expand: Grow this page's mediabox so that it also covers ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners of /Rect and re-normalise them
            # so the result is again (min x, min y, max x, max y).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # /QuadPoints holds 8 numbers per quadrilateral and may
                # describe several of them: transform every (x, y) pair
                # instead of only the first four.
                q = cast(ArrayObject, a["/QuadPoints"])
                quad_points = ArrayObject()
                for i in range(0, len(q) - 1, 2):
                    quad_points.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = quad_points
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the box of page2 ("re W n").
        # The operands are materialised as a list: a ``map`` iterator can be
        # consumed only once and is not a list, unlike every other operand
        # container handled in this file.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1316 

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so it also covers the mediabox of ``page2``.

    Args:
        page2: Page whose mediabox has to fit into this page.
        ctm: Optional matrix applied to the four corners of the mediabox of
            ``page2`` before the bounding box is computed.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    left = other_box.left.as_numeric()
    bottom = other_box.bottom.as_numeric()
    right = other_box.right.as_numeric()
    top = other_box.top.as_numeric()
    # Corners of page2's mediabox, walked around the rectangle.
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    if ctm is None:
        xs = tuple(x for x, _ in corners)
        ys = tuple(y for _, y in corners)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners)
        ys = tuple(matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners)

    # Union of this page's box and the bounding box of the corners.
    lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

    self.mediabox.lower_left = lower_left
    self.mediabox.upper_right = upper_right

1359 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, applying
    a transformation matrix to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` is reduced to its
            ``ctm`` attribute.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content):
        # Wrap the content of page2 with the ``cm`` operation for ``matrix``.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1391 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, scaling
    the merged stream through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1409 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`, rotating
    the merged stream through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1431 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` like :meth:`~pypdf._page.PageObject.merge_page`,
    translating the merged stream through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1455 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: When True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    xs = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners]
    ys = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1505 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    The page boundaries (bleedbox, trimbox, artbox, cropbox, mediabox), the
    ``/Rect`` of array-stored annotations and the ``/BBox`` of the viewport
    (the first one when ``/VP`` is an array) are scaled as well.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # Same order as before: bleedbox, trimbox, artbox, cropbox, mediabox.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # x coordinates use sx, y coordinates use sy.
    factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                for index, factor in enumerate(factors):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        # Only the first entry is handled when /VP is an array.
        holder = viewport[0] if isinstance(viewport, ArrayObject) else viewport
        bbox = holder["/BBox"]
        scaled_bbox = RectangleObject(
            tuple(float(bbox[index]) * factor for index, factor in enumerate(factors))
        )
        holder[NameObject("/BBox")] = scaled_bbox  # type: ignore

1559 

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1570 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested size divided by the current mediabox
    width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1584 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value forwarded to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can be stored neither through
            the content's own reference nor through a document exposing
            ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return

    compressed = content.flate_encode(level)
    try:
        # Overwrite the slot of the existing content object, if it has one.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1607 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page is not attached to a PDF or is not found among those pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1624 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page's operations and fonts for debugging.

    Returns:
        One line per content-stream operation, a separator, then for every
        font its name, its repr and, when available, its ``/Encoding`` repr
        and decoded ``/ToUnicode`` data. ``"No Font\\n"`` is appended when a
        ``KeyError`` occurs while walking the fonts.

    """
    chunks = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        # Only TJ operations contribute their string operands.
        strings = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        chunks.append(op.decode("utf-8") + " " + "".join(strings) + repr(ope) + "\n")
    chunks.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            chunks.append(fo + "\n")
            chunks.append(repr(self[PG.RESOURCES]["/Font"][fo]) + "\n")  # type:ignore
            try:
                encoding = self[PG.RESOURCES]["/Font"][fo]["/Encoding"]  # type:ignore
                chunks.append(repr(encoding) + "\n")
            except Exception:
                pass
            try:
                to_unicode = self[PG.RESOURCES]["/Font"][fo]["/ToUnicode"]  # type:ignore
                chunks.append(to_unicode.get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        chunks.append("No Font\n")
    return "".join(chunks)

1662 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content-stream operations.

    See extract_text for most arguments.

    Args:
        obj: Object whose content is read; its ``/Resources`` are looked up
            on the object itself or on its ``/Parent`` chain.
        pdf: Passed to ``ContentStream`` when the content has to be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The accumulated output of the ``TextExtraction`` helper, or ``""``
        when no resources or no content can be obtained.

    """
    extractor = TextExtraction()
    # Filled with the result of ``build_char_map`` for every font name.
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # Fonts for which the char map cannot be built are skipped.
                pass

    try:
        # With a string key the content is read from ``obj``; otherwise
        # ``obj`` itself is used as the content.
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is replayed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is replayed as Tw, Tc, T* and Tj on its successive operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold adds
                    # a space, unless the text is empty or already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is replayed as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before dealing with the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                # Empty output: there is no last character to inspect.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Everything that is not an image is handed to
                # ``extract_xform_text``.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Whatever happened, restart from an empty text buffer with
                # the memo matrices copied from the current ones.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output

1820 

def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed by "layout" mode text extraction.

    The ``/Resources`` of this page and of every object on its ``/Parent``
    chain are inspected; fonts are only collected when ``self.pdf`` is set.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def resolve(value: Any) -> Any:
        # Indirect objects are resolved, arrays are resolved element-wise,
        # anything else is kept as is.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [item.get_object() for item in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                font_dict = {
                    key: resolve(value) for key, value in font_dict_obj.items()
                }
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return fonts

1856 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_dump = json.dumps(
            fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    content_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(content_stream.operations), fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1917 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by locating all text drawing commands, in
    the order they appear in the content stream.

    The quality of the result depends on the generator of the PDF file: it
    works well for some files and poorly for others. Do not rely on the
    order of the returned text, as it may change when this function is
    refined.

    Arabic and Hebrew are extracted in the correct order. If required, a
    custom RTL range of characters can be defined; see function
    set_custom_rtl.

    Visitor functions can be supplied to get informed on all operations and
    all text objects, which can e.g. help to parse tables in some PDF files.

    Args:
        args: legacy positional form. Either ``(orientations[, space_width])``
            or, when the first value is a string, the first two values are
            skipped and the third and fourth are read as orientations and
            space_width.
        orientations: orientations to look for; a single int or a tuple.
            default = (0, 90, 180, 270)
            note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: extraction_mode is neither "plain" nor "layout".
        TypeError: a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are a plain-mode feature; tell the caller they are dropped.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor_name, visitor_function in supplied_visitors:
            if visitor_function:
                logger_warning(
                    f"Argument {visitor_name} is ignored in layout mode",
                    __name__,
                )
        layout_options = {
            "space_vertically": kwargs.get("layout_mode_space_vertically", True),
            "scale_weight": kwargs.get("layout_mode_scale_weight", 1.25),
            "strip_rotated": kwargs.get("layout_mode_strip_rotated", True),
            "debug_path": kwargs.get("layout_mode_debug_path"),
            "font_height_weight": kwargs.get("layout_mode_font_height_weight", 1),
        }
        return self._layout_mode_text(**layout_options)

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # Two leading positional values are skipped in this form.
            offset = 2
        elif isinstance(leading, (tuple, int)):
            offset = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > offset:
            candidate = args[offset]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > offset + 1:
            candidate = args[offset + 1]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2053 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text``.

    Args:
        xform: the XObject to extract the text from.
        orientations: orientations to look for.
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: forwarded visitor, see extract_text.
        visitor_operand_after: forwarded visitor, see extract_text.
        visitor_text: forwarded visitor, see extract_text.

    Returns:
        The extracted text

    """
    # NOTE(review): the default orientations differ from extract_text's
    # (0, 90, 180, 270) and contain 360, which extract_text does not list as a
    # supported orientation - confirm whether 180 was intended here.
    # NOTE(review): extract_text passes PG.CONTENTS in the fifth position; None
    # is passed here, presumably because the XObject itself holds the content -
    # confirm against _extract_text.
    forwarded = (
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
    return self._extract_text(*forwarded)

2088 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    The page dictionary is handed to ``_get_fonts_walk`` with two empty sets;
    every font that was seen but not reported as embedded counts as
    unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2104 

# Page boundary boxes. Each accessor is built by _create_rectangle_accessor from
# the box's own key and a tuple of other box keys.
# NOTE(review): the tuple presumably lists the keys to fall back to, in order,
# when the box itself is absent (the cropbox docstring states that it defaults
# to mediabox) - confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2136 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array ("/Annots") of the page.

    Returns:
        The array stored under "/Annots", or None if the page has no such
        entry.

    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array, or None to remove the "/Annots"
            entry. Passing None for a page without annotations is a no-op.

    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        # Only delete an entry that exists: clearing the annotations of a page
        # that has none must not raise KeyError.
        del self[NameObject("/Annots")]

2156 

2157 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages that is backed by two callables instead of a stored list.

    The length and the individual pages are requested from the supplied
    functions every time they are needed.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: returns the current number of pages.
            get_function: returns the page for a non-negative index.

        """
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        """Return the number of pages reported by ``length_function``."""
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is outside of the sequence.

        """
        if isinstance(index, slice):
            # The slice is itself virtual: it maps positions back onto self.
            selected = range(*index.indices(len(self)))
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # support negative indexes
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page (or a slice of pages) from the page tree.

        The page reference is removed from the "/Kids" of its parent, whose
        "/Count" is decremented when present; a parent left without kids is
        in turn removed from its own parent.

        Raises:
            TypeError: index is neither an int nor a slice.
            IndexError: index is outside of the sequence.
            PdfReadError: the page is not listed in the "/Kids" of its parent.

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        page_dict = cast(DictionaryObject, ind.get_object())
        parent: Optional[PdfObject] = page_dict.get("/Parent", None)
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                kids = cast(ArrayObject, parent["/Kids"])
                del kids[kids.index(ind)]
                first = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    remaining = cast(int, parent["/Count"]) - 1
                    parent[NameObject("/Count")] = NumberObject(remaining)
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        """Yield the pages in index order."""
        yield from (self[position] for position in range(len(self)))

    def __str__(self) -> str:
        """Return a list-like summary with one ``PageObject(i)`` per page."""
        labels = ", ".join(
            f"PageObject({position})" for position in range(self.length_function())
        )
        return f"[{labels}]"

2250 

2251 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk inspects the fonts of ``obj["/DR"]`` and ``obj["/Resources"]``
    and recurses into the XObjects of the resources, into the annotations
    listed in "/Annots" and into the normal appearance ("/AP" -> "/N").

    Args:
        obj: dictionary to inspect (a page, an annotation or an XObject)
        fnt: set collecting the names of all fonts in use; updated in place
        emb: set collecting the names of embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has a '/CharProcs' entry, or when its font
    descriptor (or the font descriptor of its first descendant font) contains a
    key called 'FontFilex' (where x is null, 2, or 3). A font without 'BaseFont'
    is recorded as its '/Subtype' in parentheses.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        # True when the font descriptor of *font* references a font program.
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast("DictionaryObject", font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        f = cast("DictionaryObject", f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = "/CharProcs" in f or has_font_file(f)
        if not embedded and "/DescendantFonts" in f:
            descendant = cast(
                "DictionaryObject",
                cast("ArrayObject", f["/DescendantFonts"])[0].get_object(),
            )
            embedded = has_font_file(descendant)

        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast("DictionaryObject", obj["/DR"])
        if "/Font" in default_resources:
            # Iterate the font dictionaries, not their resource names:
            # iterating the mapping itself yields only the keys.
            for f in cast("DictionaryObject", default_resources["/Font"]).values():
                process_font(f)
    if "/Resources" in obj:
        resources = cast("DictionaryObject", obj["/Resources"])
        if "/Font" in resources:
            for f in cast("DictionaryObject", resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast("DictionaryObject", resources["/XObject"]).values():
                _get_fonts_walk(cast("DictionaryObject", x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast("ArrayObject", obj["/Annots"]):
            _get_fonts_walk(cast("DictionaryObject", a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(
            "DictionaryObject", cast("DictionaryObject", obj["/AP"])["/N"]
        )
        # "/Type" is optional for form XObjects, so "/Subtype" is checked too.
        if normal.get("/Type") == "/XObject" or normal.get("/Subtype") == "/Form":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" maps appearance state names to appearance streams: walk the
            # streams (the values), not the state names.
            for state in normal.values():
                appearance = state.get_object()
                # skip entries that are not dictionaries and so hold no resources
                if isinstance(appearance, dict):
                    _get_fonts_walk(cast("DictionaryObject", appearance), fnt, emb)
    return fnt, emb  # return the sets for each page