Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

914 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from dataclasses import dataclass 

33from decimal import Decimal 

34from io import BytesIO 

35from pathlib import Path 

36from typing import ( 

37 Any, 

38 Callable, 

39 Literal, 

40 Optional, 

41 Union, 

42 cast, 

43 overload, 

44) 

45 

46from ._cmap import ( 

47 build_char_map, 

48) 

49from ._protocols import PdfCommonDocProtocol 

50from ._text_extraction import ( 

51 _layout_mode, 

52) 

53from ._text_extraction._text_extractor import TextExtraction 

54from ._utils import ( 

55 CompressedTransformationMatrix, 

56 TransformationMatrixType, 

57 _human_readable_bytes, 

58 logger_warning, 

59 matrix_multiply, 

60) 

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

62from .constants import AnnotationDictionaryAttributes as ADA 

63from .constants import ImageAttributes as IA 

64from .constants import PageAttributes as PG 

65from .constants import Resources as RES 

66from .errors import PageSizeNotDefinedError, PdfReadError 

67from .filters import _xobj_to_image 

68from .generic import ( 

69 ArrayObject, 

70 ContentStream, 

71 DictionaryObject, 

72 EncodedStreamObject, 

73 FloatObject, 

74 IndirectObject, 

75 NameObject, 

76 NullObject, 

77 NumberObject, 

78 PdfObject, 

79 RectangleObject, 

80 StreamObject, 

81 is_null_or_none, 

82) 

83 

# Pillow is optional: remember whether it could be imported so that the
# image-related code can raise a helpful error only when it is actually used.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

93 

94 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to ``defaults``.

    If the entry is already a ``RectangleObject`` it is returned as is.
    If it is missing or null, the keys in ``defaults`` are tried in order
    and the first one yielding a value other than ``None`` is used.
    The result is converted to a ``RectangleObject`` and stored back under
    ``name`` before being returned.
    """
    found: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(found, RectangleObject):
        return found
    if is_null_or_none(found):
        for fallback_name in defaults:
            found = self.get(fallback_name)
            if found is not None:
                break
    if isinstance(found, IndirectObject):
        found = self.pdf.get_object(found)
    rectangle = RectangleObject(found)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

109 

110 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

113 

114 

115def _delete_rectangle(self: Any, name: str) -> None: 

116 del self[name] 

117 

118 

119def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

120 return property( 

121 lambda self: _get_rectangle(self, name, fallback), 

122 lambda self, value: _set_rectangle(self, name, value), 

123 lambda self: _delete_rectangle(self, name), 

124 ) 

125 

126 

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                     a b 0
        [ x′ y′ 1 ] = [ x y 1 ]  ×   c d 0
                                     e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # ctm is the compressed form (a, b, c, d, e, f); the default is the identity.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        coeffs = self.ctm
        first_row = (coeffs[0], coeffs[1], 0)
        second_row = (coeffs[2], coeffs[3], 0)
        third_row = (coeffs[4], coeffs[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        # Only the first two columns carry information; the third is dropped.
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Returns the cm operation string for the given transformation matrix
        formatted = " ".join(f"{self.ctm[i]:.4f}" for i in range(6))
        return formatted + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the translation components (e, f) are affected.
        return Transformation(
            ctm=(
                current[0],
                current[1],
                current[2],
                current[3],
                current[4] + tx,
                current[5] + ty,
            )
        )

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            # A single factor applies to both axes.
            sx = sy
        elif sy is None:
            sy = sx
        assert sx is not None
        assert sy is not None
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)
        turning: TransformationMatrixType = (
            (cos_a, sin_a, 0),
            (-sin_a, cos_a, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list representing the transformed point in the form (x', y')

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        new_x = convert(x * self.ctm[0] + y * self.ctm[2] + self.ctm[4])
        new_y = convert(x * self.ctm[1] + y * self.ctm[3] + self.ctm[5])
        # The container type of the result mirrors the one of the input.
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

321 

322 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        # local imports to prevent circular import
        from ._reader import PdfReader  # noqa: PLC0415
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let pillow write a one-page PDF in memory and take the image
        # object out of it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        # Overwrite the object slot of the current image with the new object.
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension; a name without any "." is kept in full
        # (slicing with rfind() == -1 would drop its last character).
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as __str__ with the hash of the data inserted before the closing ")".
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

415 

416 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        # ids_function lists the available image ids,
        # get_function resolves one id to its ImageFile.
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the ids of the images as provided by ``ids_function``."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(id, ImageFile)`` pairs for all images."""
        # Ids are strings or lists, which __getitem__ would hand straight to
        # get_function anyway; calling it directly avoids recomputing the
        # ids for every element.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) designated by ``index``.

        Args:
            index: A position, a slice, or an image id (string, list or tuple).

        Returns:
            An ``ImageFile``, or a new virtual list for a slice.

        Raises:
            TypeError: If ``index`` is of an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Reuse the ids already computed instead of calling len(self).
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Compute the ids once; going through len(self) and self[i] would
        # call ids_function again for every single element.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

478 

479 

480class PageObject(DictionaryObject): 

481 """ 

482 PageObject represents a single page within a PDF file. 

483 

484 Typically these objects will be created by accessing the 

485 :attr:`pages<pypdf.PdfReader.pages>` property of the 

486 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

487 also possible to create an empty page with the 

488 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

489 

490 Args: 

491 pdf: PDF file the page belongs to. 

492 indirect_reference: Stores the original indirect reference to 

493 this object in its source PDF 

494 

495 """ 

496 

497 original_page: "PageObject" # very local use in writer when appending 

498 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Initialize the page, copying the entries of the referenced object if any.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object; when it is not
            null, the entries of the referenced dictionary are copied into
            this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    has_source = not is_null_or_none(indirect_reference)
    if has_source:
        assert indirect_reference is not None, "mypy"
        source_dict = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source_dict)
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}

512 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: this function is overloaded to return the same results
    as a DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    # DictionaryObject (not the actual class) is part of the hashed value.
    entries = tuple((k, v.hash_bin()) for k, v in self.items())
    return hash((DictionaryObject, entries))

527 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data with the ``id()`` of this page appended."""
    identity_suffix = f"{id(self)}".encode()
    return super().hash_value_data() + identity_suffix

532 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    It is in multiples of 1/72 inch. Hence a value of 1 means a user
    space unit is 1/72 inch, and a value of 3 means that a user
    space unit is 3/72 inch.
    """
    # 1 is used when the page does not define the entry.
    unit = self.get(PG.USER_UNIT, 1)
    return unit

543 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, try to get the page size
    from the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
            no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        # Both dimensions are taken from the last page, even if only one is missing.
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

589 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the ids of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are inspected; the page itself
            when ``None``.
        ancest: Names of the form XObjects leading to ``obj``.
        call_stack: Indirect references already visited, used to stop on
            circular references.

    Returns:
        The ids of the image XObjects (a name for those found directly,
        a list ``[*ancest, name]`` for those nested within form XObjects).
        The ids of the inline images of the page are appended by the
        outermost call only, so that they are not repeated once per
        nested form XObject.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # already visited: circular reference
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []
    # Inline images come from the content of the page itself; recursive
    # calls (always made with a non-empty ``ancest``) must not add them again.
    inline_ids: list[Union[str, list[str]]] = (
        list(self.inline_images.keys()) if len(ancest) == 0 else []
    )
    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

627 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image designated by ``id``.

    Args:
        id: Name of the image, a list/tuple of names leading to it through
            nested XObjects, or an inline image id of the form ``~N~``.
        obj: Object whose resources are searched; the page when ``None``.

    Returns:
        The matching ``ImageFile``.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, resources[RES.XOBJECT])
    except KeyError:
        # Missing XObject resources are only acceptable for inline image ids.
        if not (id[0] == "~" and id[-1] == "~"):
            raise

    if not isinstance(id, str):
        # in a subobject
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
    extension = imgd[0]
    byte_stream = imgd[1]
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=imgd[2],
        indirect_reference=xobjs[id].indirect_reference,
    )

665 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        * `reader.pages[0].images[0]`  # return first image
        * `reader.pages[0].images['/I0']`  # return image '/I0'
        * `reader.pages[0].images['/TP1','/Image1']`  # return image '/Image1' within '/TP1' XObject form
        * `for img in reader.pages[0].images:`  # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage (the image has to belong to a PdfWriter):

        writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

705 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    Args:
        k: Key the value belongs to (only used in the error message).
        v: Value to translate.

    Returns:
        The entry of ``_INLINE_IMAGE_VALUE_MAPPING`` for ``v`` as a name,
        the ``/ColorSpace`` resource named ``v`` for other names,
        otherwise ``v`` unchanged.

    Raises:
        PdfReadError: If ``v`` is a name which is found neither in the
            mapping nor in the ``/ColorSpace`` resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources lookup and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

720 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A dictionary mapping ``~N~`` (N being the position of the inline
        image among those of the content) to the matching ``ImageFile``;
        empty if the page has no content.

    Raises:
        PdfReadError: If a ``BI``, ``ID`` or ``EI`` operator is met in the
            operations of the content.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect settings and data of all inline images.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append((param["settings"], param["data"]))
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: build a stream object for each image and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for raw_key, raw_value in settings.items():
            if raw_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(raw_key, x) for x in raw_value]
                )
            else:
                value = self._translate_value_inline_image(raw_key, raw_value)
            key = NameObject(_INLINE_IMAGE_KEY_MAPPING[raw_key])
            # the first value met for a key wins
            init.setdefault(key, value)
        stream_object = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream_object)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

765 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest multiple of 90 and normalize into [0, 360).
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

780 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    It is recommended to apply this function before page merging.
    """
    r = -self.rotation  # rotation to apply is in the other way
    self.rotation = 0
    mb = RectangleObject(self.mediabox)

    # Rotate around the center of the media box ...
    center_x = float(mb.left + mb.width / 2)
    center_y = float(mb.bottom + mb.height / 2)
    trsf = Transformation().translate(-center_x, -center_y).rotate(r)

    # ... then shift so that the transformed media box starts at (0, 0).
    pt1 = trsf.apply_on(mb.lower_left)
    pt2 = trsf.apply_on(mb.upper_right)
    trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))
    self.add_transformation(trsf, False)

    for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if b not in self:
            continue
        rr = RectangleObject(self[b])  # type: ignore
        pt1 = trsf.apply_on(rr.lower_left)
        pt2 = trsf.apply_on(rr.upper_right)
        # Corners may have swapped: rebuild lower-left / upper-right.
        lower_left_x = min(pt1[0], pt2[0])
        lower_left_y = min(pt1[1], pt2[1])
        upper_right_x = max(pt1[0], pt2[0])
        upper_right_y = max(pt1[1], pt2[1])
        self[NameObject(b)] = RectangleObject(
            (lower_left_x, lower_left_y, upper_right_x, upper_right_y)
        )

815 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

831 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the entries of ``res2[resource]`` into those of ``res1[resource]``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, the merge is done in a new dictionary initialized
            from ``res1[resource]``; otherwise ``res1[resource]`` itself is
            updated.

    Returns:
        A tuple (merged resources with keys sorted, renaming table mapping
        the keys of ``res2`` which had to be renamed to their new name).

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        # expect isinstance(pdf, PdfWriter)
        is_pdf_writer = hasattr(pdf, "_add_object")
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            # store the reference instead of the object when there is one
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Re-insert the entries in sorted order.
    ordered = sorted(new_res.items())
    new_res.clear()
    for name, value in ordered:
        new_res[name] = value
    return new_res, rename_res

912 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping from the current names to the new ones.
        pdf: Document passed on to the new ``ContentStream``.

    Returns:
        ``stream`` itself if ``rename`` is empty, otherwise a new
        ``ContentStream`` with the name operands replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor
            a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return stream

934 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """Add transformation matrix at the beginning of the given contents stream."""
    contents = ContentStream(contents, pdf)
    operations = contents.operations
    cm_operation = [[FloatObject(x) for x in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return contents

951 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if not isinstance(obj, list):
        return cast(EncodedStreamObject, obj).get_data()
    # An array of streams: the data of all the parts are concatenated.
    return b"".join(x.get_object().get_data() for x in obj)

966 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the ``/Contents`` object, or ``None``
        if the entry is missing or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None

    # A page without an indirect reference has no owning document to pass on.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None

    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

987 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: New content; if None, delete the content field.

    """
    if getattr(self, "indirect_reference", None) is None:
        # The page is not attached: the content is stored directly.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for old_part in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            # NOTE(review): no ``_objects`` attribute is defined on the page in
            # the code visible here; if there is none, this raises
            # AttributeError every time and the loop has no effect - confirm.
            try:
                self._objects[old_part.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # Register each element with the owning document.
        content = ArrayObject(
            self.indirect_reference.pdf._add_object(part) for part in content
        )

    if is_null_or_none(content):
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Blank out the old contents object, then drop the page entry.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # Applies at least for a page not in a writer. As a backup
            # solution, the content is stored as a direct object although
            # this is not in accordance with the PDF reference; this will be
            # fixed with the _add_object.
            self[NameObject(PG.CONTENTS)] = content
    else:
        assert content is not None, "mypy"
        # Reuse the reference of the existing contents object.
        # TODO: in the future may require generation management
        content.indirect_reference = self[PG.CONTENTS].indirect_reference
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # Applies at least for a page not in a writer. As a backup
            # solution, the content is stored as a direct object although
            # this is not in accordance with the PDF reference; this will be
            # fixed with the _add_object.
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1046 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered.
    By default the parameter page's content stream is added to the end of
    this page's content stream, meaning that it will be drawn after, or
    "on top" of this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation is involved: delegate with only the layering options.
    self._merge_page(page2, expand=expand, over=over)

1068 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When this page's document exposes ``_add_object`` the work is delegated
    to ``_merge_page_writer``; otherwise new resource, annotation and content
    objects are built and stored on this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Matrix handed to ``_expand_mediabox`` when ``expand`` is set.
        over: Put the page2 content after (True) or before (False) this
            page's content.
        expand: Whether to grow the mediabox to include ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(self.indirect_reference.pdf, "_add_object"):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # Fall through to the generic merge below.
        pass

    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()

    # Annotations of both pages are collected, this page's first.
    new_annots = ArrayObject()
    for page in (self, page2):
        if PG.ANNOTS not in page:
            continue
        annots = page[PG.ANNOTS]
        if isinstance(annots, ArrayObject):
            new_annots.extend(annots)

    new_resources = DictionaryObject()
    rename = {}
    mergeable_categories = (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    )
    for res in mergeable_categories:
        merged, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if merged:
            new_resources[NameObject(res)] = merged
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    own_proc_set = set(
        original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
    )
    other_proc_set = set(
        page2resources.get(RES.PROC_SET, ArrayObject()).get_object()
    )
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(own_proc_set.union(other_proc_set))
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Start the merged content with a clip to that box: "re", "W", "n".
        clip_prelude = (
            (
                map(
                    FloatObject,
                    [rect.left, rect.bottom, rect.width, rect.height],
                ),
                b"re",
            ),
            ([], b"W"),
            ([], b"n"),
        )
        for position, operation in enumerate(clip_prelude):
            page2content.operations.insert(position, operation)
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots
    return None

1182 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is merged.
        ctm: Matrix applied to the copied annotations' ``/Rect`` and
            ``/QuadPoints`` and handed to ``_expand_mediabox``.
        over: Put the page2 content after (True) or before (False) this
            page's content.
        expand: Whether to grow the mediabox to include ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES in page2:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    else:
        page2resources = DictionaryObject()

    rename = {}
    mergeable_categories = (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    )
    for res in mergeable_categories:
        if res not in page2resources:
            continue
        if res not in original_resources:
            original_resources[NameObject(res)] = DictionaryObject()
        _, newrename = self._merge_resources(
            original_resources, page2resources, res, False
        )
        rename.update(newrename)

    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        proc_set = cast(ArrayObject, original_resources[RES.PROC_SET])
        for entry in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if entry not in proc_set:
                proc_set.append(entry)
        proc_set.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        trsf = Transformation() if ctm is None else Transformation(ctm)
        for source_ref in cast(ArrayObject, page2[PG.ANNOTS]):
            source = source_ref.get_object()
            # Page and parent links are left out of the copy.
            copied = source.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            rect = cast(ArrayObject, source["/Rect"])
            pt1 = trsf.apply_on((rect[0], rect[1]), True)
            pt2 = trsf.apply_on((rect[2], rect[3]), True)
            # Re-order the two transformed points as (x_min, y_min, x_max, y_max).
            copied[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in source:
                quad = cast(ArrayObject, source["/QuadPoints"])
                # Only the first four points (eight values) are transformed.
                moved = [
                    trsf.apply_on((quad[i], quad[i + 1]), True)
                    for i in range(0, 8, 2)
                ]
                copied[NameObject("/QuadPoints")] = ArrayObject(
                    moved[0] + moved[1] + moved[2] + moved[3]
                )
            try:
                copied["/Popup"][NameObject("/Parent")] = copied.indirect_reference
            except KeyError:
                # No /Popup entry on this annotation.
                pass
            try:
                copied[NameObject("/P")] = self.indirect_reference
                annots.append(copied.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        clip_box = getattr(page2, MERGE_CROP_BOX)
        # Start the merged content with a clip to that box: "re", "W", "n".
        clip_prelude = (
            (
                map(
                    FloatObject,
                    [clip_box.left, clip_box.bottom, clip_box.width, clip_box.height],
                ),
                b"re",
            ),
            ([], b"W"),
            ([], b"n"),
        )
        for position, operation in enumerate(clip_prelude):
            page2content.operations.insert(position, operation)
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1318 

1319 def _expand_mediabox( 

1320 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] 

1321 ) -> None: 

1322 corners1 = ( 

1323 self.mediabox.left.as_numeric(), 

1324 self.mediabox.bottom.as_numeric(), 

1325 self.mediabox.right.as_numeric(), 

1326 self.mediabox.top.as_numeric(), 

1327 ) 

1328 corners2 = ( 

1329 page2.mediabox.left.as_numeric(), 

1330 page2.mediabox.bottom.as_numeric(), 

1331 page2.mediabox.left.as_numeric(), 

1332 page2.mediabox.top.as_numeric(), 

1333 page2.mediabox.right.as_numeric(), 

1334 page2.mediabox.top.as_numeric(), 

1335 page2.mediabox.right.as_numeric(), 

1336 page2.mediabox.bottom.as_numeric(), 

1337 ) 

1338 if ctm is not None: 

1339 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] 

1340 new_x = tuple( 

1341 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] 

1342 for i in range(0, 8, 2) 

1343 ) 

1344 new_y = tuple( 

1345 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] 

1346 for i in range(0, 8, 2) 

1347 ) 

1348 else: 

1349 new_x = corners2[0:8:2] 

1350 new_y = corners2[1:8:2] 

1351 lowerleft = (min(new_x), min(new_y)) 

1352 upperright = (max(new_x), max(new_y)) 

1353 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) 

1354 upperright = ( 

1355 max(corners1[2], upperright[0]), 

1356 max(corners1[3], upperright[1]), 

1357 ) 

1358 

1359 self.mediabox.lower_left = lowerleft 

1360 self.mediabox.upper_right = upperright 

1361 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content: Any) -> ContentStream:
        # Called by _merge_page with the content stream of page2.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1393 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)

1411 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    rotating = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, rotating, over, expand)

1433 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shifting = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shifting, over, expand)

1457 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    if isinstance(ctm, Transformation):
        ctm = ctm.ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    matrix = tuple(float(value) for value in ctm)
    new_x = [matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corners]
    new_y = [matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corners]

    self.mediabox.lower_left = (min(new_x), min(new_y))
    self.mediabox.upper_right = (max(new_x), max(new_y))

1507 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (bleedbox, trimbox, etc.),
    the contents of the page, the ``/Rect`` of its annotations and the
    ``/BBox`` of its viewport entry.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Factor for each position of an (x, y, x, y) rectangle.
    axis_factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if isinstance(rectangle, ArrayObject):
                    for index, factor in enumerate(axis_factors):
                        rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        # For an array, only its first element is updated.
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if viewport_is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1561 

def scale_by(self, factor: float) -> None:
    """
    Scale a page by the given factor by applying a transformation matrix to
    its content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1572 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are computed against the current mediabox size.

    Args:
        width: The new width.
        height: The new height.

    """
    current_width = float(self.mediabox.width)
    horizontal_factor = width / current_width
    current_height = float(self.mediabox.height)
    vertical_factor = height / current_height
    self.scale(horizontal_factor, vertical_factor)

1586 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed content can neither be stored under
            the content's own reference nor through ``replace_contents``.

    """
    content = self.get_contents()
    if content is None:
        return

    content_obj = content.flate_encode(level)
    try:
        # Overwrite the existing object in its document's object list.
        content.indirect_reference.pdf._objects[  # type: ignore
            content.indirect_reference.idnum - 1  # type: ignore
        ] = content_obj
    except AttributeError:
        # The content has no usable reference: fall back to the page's own
        # document, which must offer ``_add_object``.
        if self.indirect_reference is None or not hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(content_obj)

1609 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page has no indirect reference or is not found in ``pages``.

    """
    own_reference = self.indirect_reference
    if own_reference is None:
        return None
    try:
        return own_reference.pdf.pages.index(self)
    except ValueError:
        return None

1626 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a debugging dump of the page's operations and fonts.

    Returns:
        One line per content-stream operation, a separator line, then for
        each font its name and representation and, when they can be read,
        its ``/Encoding`` representation and decoded ``/ToUnicode`` data.
        ``"No Font\\n"`` is appended when a font lookup raises ``KeyError``.

    """
    pieces: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        # For TJ, also show the str entries of the first operand.
        if operator == b"TJ":
            shown = [item for item in operands[0] if isinstance(item, str)]
        else:
            shown = []
        pieces.append(
            operator.decode("utf-8") + " " + "".join(shown) + repr(operands) + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        fonts = self[PG.RESOURCES]["/Font"]  # type:ignore
        for font_name in fonts:
            font = fonts[font_name]
            pieces.append(font_name + "\n")
            pieces.append(repr(font) + "\n")
            # Both lookups below are optional: failures are skipped.
            try:
                pieces.append(repr(font["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                pieces.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1664 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    See extract_text for most arguments.

    Args:
        obj: The object whose content is read and whose resources are
            looked up (walking ``/Parent`` when ``/Resources`` is missing).
        pdf: Document passed to the ``ContentStream`` that is built.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources or no
        content can be found.

    """
    extractor = TextExtraction()
    cmaps: dict[
        str,
        tuple[
            str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject
        ],
    ] = {}

    def report_text(fragment: str) -> None:
        # Forward a text fragment with the extractor's current memo state.
        if visitor_text is not None:
            visitor_text(
                fragment,
                extractor.memo_cm,
                extractor.memo_tm,
                extractor.cmap[3],
                extractor.font_size,
            )

    try:
        resource_holder = obj
        while NameObject(PG.RESOURCES) not in resource_holder:
            # /Resources can be inherited so we look to parents
            resource_holder = resource_holder["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, resource_holder[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font := resources_dict["/Font"])
    ):
        for font_name in cast(DictionaryObject, font):
            try:
                cmaps[font_name] = build_char_map(font_name, space_width, obj)
            except TypeError:
                pass

    try:
        if isinstance(content_key, str):
            content = obj[content_key].get_object()
        else:
            content = obj
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for item in operands[0]:
                    if isinstance(item, (str, bytes)):
                        extractor.process_operation(b"Tj", [item])
                    if isinstance(item, (int, float, NumberObject, FloatObject)) and (
                        abs(float(item)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            report_text(extractor.text)
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    report_text("\n")
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    report_text(text)
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "":
        report_text(extractor.text)
    return extractor.output

1822 

def _layout_mode_fonts(self) -> dict[str, _layout_mode.Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The ``/Resources`` of this page and of each object reached through
    ``/Parent`` are inspected in turn.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources_dict: Any = node[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            for font_name in resources_dict["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                # Resolve indirect values, including those nested in arrays.
                font_dict = {}
                for key, value in font_dict_obj.items():
                    if isinstance(value, IndirectObject):
                        font_dict[key] = value.get_object()
                    elif isinstance(value, ArrayObject):
                        font_dict[key] = [item.get_object() for item in value]
                    else:
                        font_dict[key] = value
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return fonts

1858 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        def to_jsonable(value: Any) -> Any:
            # Use the object's own to_dict when it has one, else str().
            return getattr(value, "to_dict", str)(value)

        fonts_dump = json.dumps(fonts, indent=2, default=to_jsonable)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    page_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    ops = iter(page_stream.operations)
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1919 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text drawn by the page's content stream.

    Text drawing commands are located in the order in which they appear in
    the content stream. This works well for some PDF files, but poorly for
    others, depending on the generator used. This will be refined in the
    future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Positional arguments are accepted for backward compatibility only: a
    leading string is skipped together with the argument following it, after
    which the next positional values are taken as ``orientations`` and
    ``space_width``. Without a leading string, the first two positional
    values are ``orientations`` and ``space_width``.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            A single int is wrapped into a one-element tuple.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a legacy positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Layout mode has no visitor support: warn about each one supplied.
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor_name, visitor_callback in supplied_visitors.items():
            if visitor_callback:
                logger_warning(
                    f"Argument {visitor_name} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # A leading string and the argument after it are skipped.
            offset = 2
        elif isinstance(leading, (tuple, int)):
            offset = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > offset:
            positional_orientations = args[offset]
            if not isinstance(positional_orientations, (tuple, int)):
                raise TypeError(
                    f"Invalid positional parameter {positional_orientations}"
                )
            orientations = positional_orientations

        if len(args) > offset + 1:
            positional_space_width = args[offset + 1]
            if not isinstance(positional_space_width, (float, int)):
                raise TypeError(
                    f"Invalid positional parameter {positional_space_width}"
                )
            space_width = positional_space_width

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2055 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The call is delegated to ``self._extract_text`` with ``xform`` as the
    object to read and ``None`` in the position where ``extract_text``
    passes the page's contents key.

    Args:
        xform: the XObject stream to extract text from.
        orientations: orientations forwarded to ``_extract_text``.
            NOTE(review): the default lists 360 and omits 180, whereas
            ``extract_text`` defaults to (0, 90, 180, 270) - confirm that
            this difference is intended.
        space_width: force default space width (if not extracted from font)
            (default 200)
        visitor_operand_before: forwarded unchanged to ``_extract_text``.
        visitor_operand_after: forwarded unchanged to ``_extract_text``.
        visitor_text: forwarded unchanged to ``_extract_text``.

    Returns:
        The extracted text

    """
    forwarded = (
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no contents key: the XObject itself is passed as the object
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
    return self._extract_text(*forwarded)

2090 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    The page dictionary is handed to ``_get_fonts_walk`` together with two
    empty sets; fonts that were seen but not reported as embedded are
    considered unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    seen: set[str] = set()
    embedded_names: set[str] = set()
    seen, embedded_names = _get_fonts_walk(page_dict, seen, embedded_names)
    return embedded_names, seen - embedded_names

2106 

# Page boundary boxes. Each attribute is produced by
# _create_rectangle_accessor from the box's own dictionary key plus a tuple of
# other box keys. NOTE(review): the tuple presumably lists the fallback keys
# consulted, in order, when the box itself is missing - confirm against
# _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2138 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    Return the annotations array of the page.

    Returns:
        The value stored under "/Annots", or None when the page has no
        such entry.
    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])


@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array, or None to remove the "/Annots"
            entry. Passing None when the page has no "/Annots" entry is a
            no-op rather than a KeyError.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        # Only delete what exists, so clearing an already-empty page is safe.
        del self[NameObject("/Annots")]

2158 

2159 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of stored items.

    ``length_function`` reports the current number of pages and
    ``get_function`` returns the page at a non-negative position; both are
    invoked on every access.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """Return one page for an int, or a new virtual list for a slice."""
        if isinstance(index, slice):
            selected = range(*index.indices(len(self)))
            # The slice is itself virtual: it maps its positions back onto self.
            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """Remove the page(s) at ``index`` from the page tree."""
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")
        node_ref = self[index].indirect_reference
        assert node_ref is not None
        parent: Optional[PdfObject] = cast(
            DictionaryObject, node_ref.get_object()
        ).get("/Parent", None)
        nothing_removed = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                kids = cast(ArrayObject, parent["/Kids"])
                del kids[kids.index(node_ref)]
                nothing_removed = False
                try:
                    assert node_ref is not None
                    del node_ref.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if not kids:
                    # No more objects in this part of this subtree:
                    # the emptied node is what gets removed one level up.
                    node_ref = parent.indirect_reference
                parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if nothing_removed:
                    raise PdfReadError(f"Page not found in page tree: {node_ref}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        yield from map(self.__getitem__, range(len(self)))

    def __str__(self) -> str:
        labels = (f"PageObject({i})" for i in range(self.length_function()))
        return f"[{', '.join(labels)}]"

2252 

2253 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk covers the fonts listed under "/DR" and "/Resources", then
    recurses into the XObjects of "/Resources", the entries of "/Annots" and
    the normal appearance ("/AP" -> "/N") of ``obj``.

    Args:
        obj: Dictionary to inspect; called with a page dictionary and,
            recursively, with XObject, annotation and appearance dictionaries.
        fnt: set receiving the names of all fonts found (mutated in place)
        emb: set receiving the names of embedded fonts (mutated in place)

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font is considered embedded when it has '/CharProcs', or when its
    '/FontDescriptor' (or that of its first descendant font) contains a
    'FontFilex' key (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(descriptor: DictionaryObject) -> bool:
        """Tell whether the font descriptor references an embedded font program."""
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or (
            "/FontDescriptor" in f
            and has_font_file(cast(DictionaryObject, f["/FontDescriptor"]))
        )
        if not is_embedded and "/DescendantFonts" in f:
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = "/FontDescriptor" in descendant and has_font_file(
                cast(DictionaryObject, descendant["/FontDescriptor"])
            )

        if is_embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # No base font name: fall back to the subtype in parentheses.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast(DictionaryObject, obj["/DR"])
        if "/Font" in default_resources:
            # Iterate the font dictionaries, not their resource names;
            # iterating the mapping itself would only yield the keys.
            for font in cast(DictionaryObject, default_resources["/Font"]).values():
                process_font(font)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for font in cast(DictionaryObject, resources["/Font"]).values():
                process_font(font)
        if "/XObject" in resources:
            for xobject in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, xobject.get_object()), fnt, emb)
    if "/Annots" in obj:
        for annotation in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, annotation.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # "/N" maps appearance state names to appearance dictionaries:
            # walk the dictionaries (the values), skipping anything that is
            # not a dictionary so non-dictionary entries cannot break the walk.
            for state in normal.values():
                state = state.get_object()
                if isinstance(state, DictionaryObject):
                    _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page