Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

910 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from dataclasses import dataclass 

32from decimal import Decimal 

33from io import BytesIO 

34from pathlib import Path 

35from typing import ( 

36 Any, 

37 Callable, 

38 Dict, 

39 Iterable, 

40 Iterator, 

41 List, 

42 Literal, 

43 Optional, 

44 Sequence, 

45 Set, 

46 Tuple, 

47 Union, 

48 cast, 

49 overload, 

50) 

51 

52from ._cmap import ( 

53 build_char_map, 

54) 

55from ._protocols import PdfCommonDocProtocol 

56from ._text_extraction import ( 

57 _layout_mode, 

58) 

59from ._text_extraction._text_extractor import TextExtraction 

60from ._utils import ( 

61 CompressedTransformationMatrix, 

62 TransformationMatrixType, 

63 _human_readable_bytes, 

64 logger_warning, 

65 matrix_multiply, 

66) 

67from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

68from .constants import AnnotationDictionaryAttributes as ADA 

69from .constants import ImageAttributes as IA 

70from .constants import PageAttributes as PG 

71from .constants import Resources as RES 

72from .errors import PageSizeNotDefinedError, PdfReadError 

73from .filters import _xobj_to_image 

74from .generic import ( 

75 ArrayObject, 

76 ContentStream, 

77 DictionaryObject, 

78 EncodedStreamObject, 

79 FloatObject, 

80 IndirectObject, 

81 NameObject, 

82 NullObject, 

83 NumberObject, 

84 PdfObject, 

85 RectangleObject, 

86 StreamObject, 

87 is_null_or_none, 

88) 

89 

# Pillow is optional. Remember whether it could be imported so that image
# helpers can raise a helpful error only when they are actually used.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'

99 

100 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored in *self* under *name*, with fallbacks.

    Args:
        self: The dictionary-like object holding the box entries.
        name: Key of the requested box.
        defaults: Keys tried in order when *name* is missing or null.

    Returns:
        The stored ``RectangleObject``; otherwise a new ``RectangleObject``
        built from the first usable value, which is also written back to
        *self* under *name*.

    """
    retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # Use the same "missing" test as for the primary key so that a
            # null-valued fallback is skipped instead of being handed to
            # RectangleObject below.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    retval = RectangleObject(retval)  # type: ignore
    # Cache the converted rectangle under the requested name.
    _set_rectangle(self, name, retval)
    return retval

115 

116 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store *value* in *self* under *name*, using a ``NameObject`` as the key."""
    self[NameObject(name)] = value

120 

121 

122def _delete_rectangle(self: Any, name: str) -> None: 

123 del self[name] 

124 

125 

126def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

127 return property( 

128 lambda self: _get_rectangle(self, name, fallback), 

129 lambda self, value: _set_rectangle(self, name, value), 

130 lambda self: _delete_rectangle(self, name), 

131 ) 

132 

133 

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import Transformation
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The six changeable matrix elements, in the order (a, b, c, d, e, f).
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)

        """
        # Only the first two columns of each row carry information.
        return (
            matrix[0][0],
            matrix[0][1],
            matrix[1][0],
            matrix[1][1],
            matrix[2][0],
            matrix[2][1],
        )

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import Transformation
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        ctm = self.ctm
        # Only the e and f components are affected by a translation.
        shifted = (ctm[0], ctm[1], ctm[2], ctm[3], ctm[4] + tx, ctm[5] + ty)
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis.
            sy: The scale factor along the y-axis.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        # A missing factor takes the value of the other one.
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)
        op: TransformationMatrixType = (
            (cos_a, sin_a, 0),
            (-sin_a, cos_a, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
        ...

    @overload
    def apply_on(
        self, pt: Tuple[float, float], as_object: bool = False
    ) -> Tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[Tuple[float, float], List[float]],
        as_object: bool = False,
    ) -> Union[Tuple[float, float], List[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list representing the transformed point in the form (x', y')

        """
        convert = FloatObject if as_object else float
        ctm = self.ctm
        x = float(pt[0])
        y = float(pt[1])
        moved = (
            convert(x * ctm[0] + y * ctm[2] + ctm[4]),
            convert(x * ctm[1] + y * ctm[3] + ctm[5]),
        )
        # Hand back the same container kind the caller passed in.
        if isinstance(pt, list):
            return list(moved)
        return moved

318 

319 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Save the new image as a one-page PDF in memory and read it back to
        # get the image object that replaces the current one.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        assert reader.pages[0].images[0].indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            reader.pages[0].images[0].indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        # Swap the old extension for the new one. A name without any "." just
        # gets the extension appended: rfind() would return -1 there and the
        # slice would silently drop the last character of the name.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as __str__, with the hash of the data inserted before the
        # closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

410 

411 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.
    """

    def __init__(
        self,
        ids_function: Callable[[], List[Union[str, List[str]]]],
        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
    ) -> None:
        # ids_function lists the image identifiers; get_function turns one
        # identifier into an ImageFile.
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> List[Union[str, List[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
        """Return ``(identifier, ImageFile)`` pairs for all images."""
        # Identifiers are strings or lists, so they can go straight to
        # get_function; going through self[...] would call ids_function()
        # again for every single image.
        return [(key, self.get_function(key)) for key in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, List[str], Tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image, or a sub-sequence for a slice.

        Args:
            index: An integer position (negative values count from the end),
                a slice, or an image identifier (string, list or tuple).

        Raises:
            TypeError: If *index* has an unsupported type.
            IndexError: If an integer *index* is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Identifier lookup does not need the list of ids at all.
            return self.get_function(index)
        ids = self.ids_function()
        if isinstance(index, slice):
            selected = [ids[pos] for pos in range(*index.indices(len(ids)))]
            return type(self)((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once instead of once per element.
        for key in self.ids_function():
            yield self.get_function(key)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

473 

474 

475class PageObject(DictionaryObject): 

476 """ 

477 PageObject represents a single page within a PDF file. 

478 

479 Typically these objects will be created by accessing the 

480 :attr:`pages<pypdf.PdfReader.pages>` property of the 

481 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

482 also possible to create an empty page with the 

483 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

484 

485 Args: 

486 pdf: PDF file the page belongs to. 

487 indirect_reference: Stores the original indirect reference to 

488 this object in its source PDF 

489 

490 """ 

491 

492 original_page: "PageObject" # very local use in writer when appending 

493 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, optionally filled from an existing indirect object.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Original indirect reference of the page; when it
            is neither ``None`` nor null, the entries of the referenced
            object are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.indirect_reference = indirect_reference
    # Filled on first use by the image helpers.
    self.inline_images: Optional[Dict[str, ImageFile]] = None
    self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source)

507 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: this function is overloaded to return the same results
    as a DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    # DictionaryObject (not the page class) is part of the hashed tuple.
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

522 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by ``id(self)`` in decimal."""
    return super().hash_value_data() + b"%d" % id(self)

527 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    It is in multiples of 1/72 inch. Hence a value of 1 means a user
    space unit is 1/72 inch, and a value of 3 means that a user
    space unit is 3/72 inch.
    """
    # Falls back to 1 when the page has no user-unit entry.
    unit = self.get(PG.USER_UNIT, 1)
    return unit

538 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, try to get the page size
    from the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
            no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        # Both dimensions are taken from the last page as soon as one of
        # them is missing.
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

584 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[List[str]] = None,
    call_stack: Optional[List[Any]] = None,
) -> List[Union[str, List[str]]]:
    """
    List the identifiers of the images reachable from *obj*.

    Args:
        obj: Object whose resources are scanned; the page itself if ``None``.
        ancest: Names of the enclosing XObjects, prepended to nested ids.
        call_stack: Indirect references already visited, used to stop
            on reference cycles.

    Returns:
        Image names (plain strings at the top level, lists of names for
        nested images) followed by the keys of the inline images.

    """
    if call_stack is None:
        call_stack = []
    reference = getattr(obj, "indirect_reference", None)
    if reference in call_stack:
        # Already visited: do not walk this object again.
        return []
    call_stack.append(reference)

    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        # No XObject resources: only inline images can be reported.
        return [] if self.inline_images is None else list(self.inline_images.keys())

    found: List[Union[str, List[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        entry = x_object[o]
        if not isinstance(entry, StreamObject):
            continue
        if entry[IA.SUBTYPE] == "/Image":
            found.append([*ancest, o] if ancest else o)
        else:  # is a form with possible images inside
            found.extend(self._get_ids_image(entry, [*ancest, o], call_stack))
    assert self.inline_images is not None
    found.extend(list(self.inline_images.keys()))
    return found

622 

def _get_image(
    self,
    id: Union[str, List[str], Tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image identified by *id*.

    Args:
        id: An image name, or a list/tuple of names leading through nested
            XObjects to the image. Names starting and ending with ``~``
            are looked up in the inline images.
        obj: Object whose resources are searched; the page if ``None``.

    Returns:
        The matching ``ImageFile``.

    Raises:
        KeyError: If the resources hold no XObject entry and *id* is not
            an inline-image id.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, List) and len(id) == 1:
        # A one-element path is the same as the bare name.
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
        )
    except KeyError:
        # Missing XObject resources are only acceptable for inline ids.
        if id[0] != "~" or id[-1] != "~":
            raise

    if not isinstance(id, str):
        # in a subobject
        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    xobj = xobjs[id]
    extension, byte_stream, img = _xobj_to_image(cast(DictionaryObject, xobj))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=img,
        indirect_reference=xobj.indirect_reference,
    )

660 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
    * `reader.pages[0].images[0]`        # return first image
    * `reader.pages[0].images['/I0']`    # return image '/I0'
    * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
    * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    # The list is virtual: ids and images are computed on access.
    return VirtualListImages(
        ids_function=self._get_ids_image,
        get_function=self._get_image,
    )

700 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    Args:
        k: Key the value belongs to (only used in the error message).
        v: Value to translate.

    Returns:
        The mapped name when *v* is in the inline-image value mapping, the
        page's ``/ColorSpace`` resource entry when *v* is another name,
        otherwise *v* unchanged.

    Raises:
        PdfReadError: If *v* is a name found neither in the mapping nor in
            the ``/ColorSpace`` resources.

    """
    try:
        v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if isinstance(v, NameObject):
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                resources = cast(DictionaryObject, self["/Resources"])
                color_spaces = cast(DictionaryObject, resources["/ColorSpace"])
                v = color_spaces[v]
            except KeyError:  # for res and v
                raise PdfReadError(f"Cannot find resource entry {v} for {k}")
    return v

715 

def _get_inline_images(self) -> Dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        A mapping from ``~<index>~`` to the extracted ``ImageFile``; empty
        when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is found.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect (settings, data) of every inline image.
    collected = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            collected.append((param["settings"], param["data"]))
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in settings.items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                v = self._translate_value_inline_image(k, v)
            k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            # The first value seen for a key wins.
            if k not in init:
                init[k] = v
        stream_obj = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream_obj)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

760 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    # Anything else is resolved through get_object().
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest multiple of 90 and wrap into [0, 360).
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

775 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    It is recommended to apply this function before page merging.
    """
    r = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    mb = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box ...
    centre_x = float(mb.left + mb.width / 2)
    centre_y = float(mb.bottom + mb.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(r)

    # ... then shift so that the rotated box starts at the origin again.
    pt1 = trsf.apply_on(mb.lower_left)
    pt2 = trsf.apply_on(mb.upper_right)
    trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1]))
    self.add_transformation(trsf, False)

    for b in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if b not in self:
            continue
        rr = RectangleObject(self[b])  # type: ignore
        pt1 = trsf.apply_on(rr.lower_left)
        pt2 = trsf.apply_on(rr.upper_right)
        # Re-normalise the corners after the transformation.
        x_values = (pt1[0], pt2[0])
        y_values = (pt1[1], pt2[1])
        self[NameObject(b)] = RectangleObject(
            (min(x_values), min(y_values), max(x_values), max(y_values))
        )

810 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If *angle* is not a multiple of 90.

    """
    if angle % 90 != 0:
        raise ValueError("Rotation angle must be a multiple of 90")
    # The new angle is added to the current rotation.
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

826 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Merge the *resource* entries of *res2* into those of *res1*.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the resource category to merge.
        new_res1: If True, merge into a fresh dictionary initialised from
            ``res1[resource]``; otherwise merge into ``res1[resource]``
            itself.

    Returns:
        A tuple ``(merged, renames)``: the merged dictionary with its
        entries in sorted key order, and a mapping from the original
        *res2* keys to the new names for entries that had to be renamed.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    def compute_unique_key(base_key: str) -> Tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == value:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            # An identical entry already exists under that name.
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        try:
            # Store the reference of the clone when it has one.
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Re-insert the entries in sorted key order.
    ordered = sorted(new_res.items())
    new_res.clear()
    for entry_key, entry_value in ordered:
        new_res[entry_key] = entry_value
    return new_res, rename_res

907 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: Dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace the names used as operands in *stream* according to *rename*.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names.
        pdf: Passed to the ``ContentStream`` constructor.

    Returns:
        *stream* itself when *rename* is empty, otherwise a new
        ``ContentStream`` with the renamed operands.

    Raises:
        KeyError: If an operation's operands are neither a list nor a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Only existing slots are overwritten, so the container does not
        # change size while it is being walked.
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return stream

929 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """Add transformation matrix at the beginning of the given contents stream."""
    contents = ContentStream(contents, pdf)
    # A "cm" operation whose operands are the elements of ctm.
    cm_operation = [[FloatObject(x) for x in ctm], b"cm"]
    contents.operations.insert(0, cm_operation)
    return contents

946 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if isinstance(obj, list):
        # Several streams: concatenate their data in order.
        parts = (x.get_object().get_data() for x in obj)
        return b"".join(parts)
    return cast(EncodedStreamObject, obj).get_data()

961 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        The ``/Contents`` object, or ``None`` if it does not exist.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # No usable indirect reference on this page.
        pdf = None
    obj = self[PG.CONTENTS]
    if is_null_or_none(obj):
        return None
    return ContentStream(obj.get_object(), pdf)

982 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # When the current /Contents is an array, try to overwrite each member's
    # slot with a NullObject.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                # NOTE(review): this reads ``self._objects`` while the other
                # branches below use ``self.indirect_reference.pdf._objects``;
                # if the page has no ``_objects`` attribute the AttributeError
                # is swallowed and nothing is nullified - confirm intent.
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    # Each member of a new array is added to the document through
    # ``_add_object`` and the array is rebuilt from the returned values.
    if isinstance(content, ArrayObject):
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nothing to do when there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Null out the slot of the old contents object, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # The current /Contents (or its absence) exposes no
        # ``indirect_reference`` attribute: register the new content.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the indirect reference of the existing /Contents and store
        # the new content in the slot that reference designates.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1041 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Combine the content stream of another page with this page's stream.

    Resource references (e.g. fonts) from both pages are kept.
    The mediabox, cropbox, etc of this page are not altered.
    By default the content stream of ``page2`` is added after this
    page's content stream, so it is drawn after, or "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    # No transformation callback and no matrix: plain merge.
    self._merge_page(page2, expand=expand, over=over)

1063 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge the resources, annotations and content of ``page2`` into this page.

    Args:
        page2: The page whose content is merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is renamed and merged.
        ctm: Optional matrix, forwarded to ``_expand_mediabox`` (and to
            ``_merge_page_writer`` when that path is taken).
        over: When True the content of ``page2`` is appended after this
            page's content, otherwise it is inserted before it.
        expand: When True, ``_expand_mediabox`` is called for ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        # If this page has an IndirectObject reference whose document exposes
        # ``_add_object``, the whole merge is delegated to _merge_page_writer.
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # Otherwise fall through to the merge below.
        pass

    new_resources = DictionaryObject()
    rename = {}
    # A page without /Resources contributes an empty dictionary.
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Collect the annotation arrays of both pages, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    # Merge each resource category and accumulate the renames reported by
    # _merge_resources.
    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Prepend "re", "W", "n" operations built from the box named by
        # MERGE_CROP_BOX on page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        # NOTE(review): the operands are a ``map`` object, which can only be
        # iterated once - confirm downstream code never walks them twice.
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        # Apply the renames collected during the resource merge.
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots

1175 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's resources in place.

    Called by ``_merge_page`` when the document behind this page's indirect
    reference exposes ``_add_object``.

    Args:
        page2: The page whose content is merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before it is renamed and merged.
        ctm: Optional matrix; used to transform the ``/Rect`` and
            ``/QuadPoints`` of copied annotations and forwarded to
            ``_expand_mediabox``.
        over: When True the content of ``page2`` is appended after this
            page's content, otherwise it is inserted before it.
        expand: When True, ``_expand_mediabox`` is called for ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    # Make sure this page has a /Resources dictionary to merge into.
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    # Only categories present on page2 are merged; a missing category on
    # this page is created empty first.
    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            # Clone the annotation into this page's document, leaving out
            # the fields listed in ``ignore_fields``.
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners of /Rect, then rebuild the
            # rectangle from the min/max of the transformed coordinates.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # Only the first eight values (four points) are transformed.
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            try:
                # Point the cloned popup, when there is one, at the clone.
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Prepend "re", "W", "n" operations built from the box named by
        # MERGE_CROP_BOX on page2.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        # Apply the renames collected during the resource merge.
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1311 

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: Page whose mediabox must fit inside the new mediabox.
        ctm: Optional six-value matrix applied to the corners of the
            mediabox of ``page2`` before the comparison.

    """
    own = self.mediabox
    own_left = own.left.as_numeric()
    own_bottom = own.bottom.as_numeric()
    own_right = own.right.as_numeric()
    own_top = own.top.as_numeric()

    other = page2.mediabox
    left = other.left.as_numeric()
    bottom = other.bottom.as_numeric()
    right = other.right.as_numeric()
    top = other.top.as_numeric()
    # The four corners of page2's mediabox, split by axis.
    xs = (left, left, right, right)
    ys = (bottom, top, top, bottom)

    if ctm is not None:
        m = tuple(float(value) for value in ctm)
        moved_xs = tuple(m[0] * x + m[2] * y + m[4] for x, y in zip(xs, ys))
        moved_ys = tuple(m[1] * x + m[3] * y + m[5] for x, y in zip(xs, ys))
        xs, ys = moved_xs, moved_ys

    # Union of this page's box with the bounding box of the corners.
    self.mediabox.lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    self.mediabox.upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

1354 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that a
    transformation matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def prepend_matrix(page2_content):
        # Runs on page2's content stream during the merge.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, prepend_matrix, matrix, over, expand)

1386 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is scaled through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)

1404 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    turn = Transformation().rotate(rotation)
    self.merge_transformed_page(page2, turn, over, expand)

1426 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Like :meth:`~pypdf._page.PageObject.merge_page`, except that the merged
    stream is translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    shift = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shift, over, expand)

1450 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: When True, the mediabox is replaced by the bounding box
            of its four corners after transformation.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, matrix)
        content.isolate_graphics_state()
        self.replace_contents(content)

    # if expanding the page to fit a new page, calculate the new media box size
    if not expand:
        return

    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corners = ((left, bottom), (left, top), (right, top), (right, bottom))

    m = tuple(float(value) for value in matrix)
    new_x = [m[0] * x + m[2] * y + m[4] for x, y in corners]
    new_y = [m[1] * x + m[3] * y + m[5] for x, y in corners]

    self.mediabox.lower_left = (min(new_x), min(new_y))
    self.mediabox.upper_right = (max(new_x), max(new_y))

1500 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors: a transformation matrix is applied
    to its content and the page size is updated.

    This updates the various page boundaries (mediabox, cropbox, etc.),
    the ``/Rect`` of array-held annotations, the ``/BBox`` of the viewport
    entry (the first one when it is an array) and the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    for box_name in ("mediabox", "cropbox", "bleedbox", "trimbox", "artbox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Factor applied to each of the four values of a rectangle.
    axis_factors = (sx, sy, sx, sy)

    annotations = self[PG.ANNOTS] if PG.ANNOTS in self else None
    if isinstance(annotations, ArrayObject):
        for annotation in annotations:
            annotation_obj = annotation.get_object()
            if ADA.Rect not in annotation_obj:
                continue
            rectangle = annotation_obj[ADA.Rect]
            if not isinstance(rectangle, ArrayObject):
                continue
            for index, factor in enumerate(axis_factors):
                rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP not in self:
        return
    viewport = self[PG.VP]
    viewport_is_array = isinstance(viewport, ArrayObject)
    if viewport_is_array:
        bbox = viewport[0]["/BBox"]
    else:
        bbox = viewport["/BBox"]  # type: ignore
    scaled_bbox = RectangleObject(
        tuple(float(bbox[index]) * factor for index, factor in enumerate(axis_factors))
    )
    if viewport_is_array:
        self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
            NameObject("/BBox")
        ] = scaled_bbox
    else:
        self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1554 

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly; delegates to ``scale`` with the same factor
    for both axes.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1565 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page so that its mediabox reaches the requested size;
    delegates to ``scale`` with the computed per-axis factors.

    Args:
        width: The new width.
        height: The new height.

    """
    current = self.mediabox
    horizontal_factor = width / float(current.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1579 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` on the page content.

    Raises:
        ValueError: If the encoded content cannot be stored through the
            content's own indirect reference and the page is not attached
            to a document exposing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    encoded = content.flate_encode(level)
    try:
        # Overwrite the slot designated by the content's own reference.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = encoded  # type: ignore
    except AttributeError:
        attached_to_writer = self.indirect_reference is not None and hasattr(
            self.indirect_reference.pdf, "_add_object"
        )
        if not attached_to_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(encoded)

1602 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property giving the index of this page within its document.

    Returns:
        Page number; None if the page is not attached to a PDF or is not
        found among the document's pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # index() did not find this page.
        return None

1619 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a text dump of the page's operations and fonts for debugging.

    Returns:
        One line per content-stream operation, a separator, then for each
        font its name, its repr and, when they can be read, its
        ``/Encoding`` repr and decoded ``/ToUnicode`` data. ``"No Font\\n"``
        is appended when a ``KeyError`` occurs while listing fonts.

    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        if operator == b"TJ":
            strings = [item for item in operands[0] if isinstance(item, str)]
        else:
            strings = []
        pieces.append(
            operator.decode("utf-8") + " " + "".join(strings) + repr(operands) + "\n"
        )
    pieces.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            pieces.append(font_name + "\n")
            font = self[PG.RESOURCES]["/Font"][font_name]  # type:ignore
            pieces.append(repr(font) + "\n")
            # /Encoding and /ToUnicode are optional extras: any failure
            # while reading them is ignored.
            try:
                pieces.append(repr(font["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                pieces.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1657 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Walk the operations of a content stream and return the extracted text.

    See extract_text for most arguments.

    Args:
        obj: Object whose resources and content are read.
        pdf: Document passed to the ``ContentStream`` constructor.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The accumulated text, or an empty string when no resources can be
        found or the content cannot be read.

    """
    extractor = TextExtraction()
    # Character maps keyed by font name, as returned by build_char_map.
    cmaps: Dict[
        str,
        Tuple[
            str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # A font whose character map cannot be built is skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as Tw, Tc, T*, then Tj with the remaining operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    # String items are shown through Tj.
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric item at least as large as the threshold adds
                    # a single space, unless the text is empty or already
                    # ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                # Make sure the output ends with a newline; an empty output
                # raises IndexError and is left as is.
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = resources_dict["/XObject"]
                # Anything that is not an image is passed to
                # extract_xform_text and its text appended.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                # Best effort: a failing XObject is logged, not raised.
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Reset the pending text and remember the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output

1815 

def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed by "layout" mode text extraction.

    The page and then each ``/Parent`` in turn are inspected for a
    ``/Font`` entry in their resources.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def resolved(value: Any) -> Any:
        # Indirect references are resolved; arrays are resolved item by
        # item; anything else is kept as is.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [item.get_object() for item in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: Dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources_dict: Any = node[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            for font_name in resources_dict["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                font_dict = {
                    key: resolved(value) for key, value in font_dict_obj.items()
                }
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # No parent left: stop walking.
            node = None

    return fonts

1851 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Extract the page text while staying close to the source PDF layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        fonts_file = debug_path.joinpath("fonts.json")
        fonts_dump = json.dumps(
            fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        fonts_file.write_text(fonts_dump, "utf-8")

    content_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(content_stream.operations), fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        # Nothing was returned for this page's operations.
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1912 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Positional arguments are accepted in two layouts: either
    ``(orientations, space_width)``, or, when the first positional argument
    is a string, the first two positionals are skipped and
    ``orientations`` / ``space_width`` are read from the third and fourth.

    Args:
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            A single int is wrapped into a one-element tuple.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: warn for each one given.
        visitor_arguments = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor, callback in visitor_arguments:
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # String first: orientations / space_width sit in slots 2 and 3.
            orientation_slot, width_slot = 2, 3
        elif isinstance(leading, (tuple, int)):
            orientation_slot, width_slot = 0, 1
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

        if len(args) > orientation_slot:
            candidate = args[orientation_slot]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > width_slot:
            candidate = args[width_slot]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2048 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text``, with
    ``xform`` as the object to read and no content key.

    Args:
        xform: the XObject stream to extract text from.
        orientations: orientations to look for. Defaults to
            (0, 90, 180, 270), the same default as :meth:`extract_text`
            (the previous default listed 360 and left out 180).
        space_width: force default space width (if not extracted from font)
            (default: 200)
        visitor_operand_before: function to be called before processing an
            operation; see :meth:`extract_text`.
        visitor_operand_after: function to be called after processing an
            operation; see :meth:`extract_text`.
        visitor_text: function to be called when extracting some text at
            some position; see :meth:`extract_text`.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no content key: the stream itself is passed as the object
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2083 

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    The work is delegated to ``_get_fonts_walk``, which fills one set with
    every font name it meets and another with the embedded ones.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    # Whatever was seen but never flagged as embedded is unembedded.
    return embedded_fonts, all_fonts - embedded_fonts

2099 

# Page boundary boxes. Each attribute is built by _create_rectangle_accessor
# from the box's own key plus a tuple of other box keys.
# NOTE(review): the tuple presumably lists fallback keys consulted, in order,
# when the box itself is absent -- confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2131 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The page's "/Annots" array.

    Returns:
        The value stored under "/Annots", or None when the page has no such
        entry.

    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

2137 

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new "/Annots" array, or None to remove the entry.
            Passing None when the page has no "/Annots" entry is a no-op.

    """
    if value is None:
        # Clearing annotations on a page that has none must not raise
        # KeyError, so only delete the entry when it is present.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value

2151 

2152 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of stored items.

    ``length_function`` supplies the current length and ``get_function``
    returns the page at a (non-negative, in-range) index; both are called on
    every access, so the view always reflects the underlying document.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or for a slice a new virtual list over that range.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an int index is out of range.

        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # The sliced view maps its own positions back onto this list.
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page (or a slice of pages) from the page tree.

        The page is removed from its parent's "/Kids", the parent's "/Count"
        is decremented when present, and page-tree nodes left without kids
        are removed from their own parent in turn.

        Raises:
            TypeError: if ``index`` is neither an int nor a slice.
            IndexError: if an int index is out of range.
            PdfReadError: if the page is not listed in its parent's "/Kids".

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            kids = cast(ArrayObject, parent["/Kids"])
            try:
                i = kids.index(ind)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break
            del kids[i]
            if first:
                first = False
                # Only the page itself has an entry in the flattened list.
                # Repeating this while pruning emptied ancestor nodes would
                # drop an unrelated page at the same position.
                try:
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    # Best effort: the owner may not keep a flattened list.
                    pass
            if "/Count" in parent:
                parent[NameObject("/Count")] = NumberObject(
                    cast(int, parent["/Count"]) - 1
                )
            if len(kids) != 0:
                # The node still has other kids: nothing more to prune.
                break
            # No more objects in this part of this subtree: remove the
            # emptied node from its own parent on the next iteration.
            ind = parent.indirect_reference
            parent = parent.get("/Parent", None)

    def __iter__(self) -> Iterator[PageObject]:
        # The length is read once, when iteration starts.
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2245 

2246 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: Set[str],
    emb: Set[str],
) -> Tuple[Set[str], Set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Args:
        obj: Page resources dictionary
        fnt: font
        emb: embedded fonts

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.
    Both sets are updated in place and also returned.

    The walk covers "/DR" and "/Resources" fonts, and recurses into
    "/Resources" XObjects, "/Annots" entries and the "/AP" normal appearance
    ("/N"), which is either walked directly or, when it is not typed as an
    XObject, treated as a dictionary whose values are walked.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(owner: DictionaryObject) -> bool:
        # True when ``owner`` has a /FontDescriptor holding any /FontFile* key.
        if "/FontDescriptor" not in owner:
            return False
        descriptor = cast(DictionaryObject, owner["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        # Record one font dictionary in ``fnt`` and, if embedded, in ``emb``.
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        is_embedded = "/CharProcs" in f or has_font_file(f)
        if not is_embedded and "/DescendantFonts" in f:
            # Only the first descendant font is inspected.
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
            )
            is_embedded = has_font_file(descendant)

        if is_embedded:
            # Fonts without a /BaseFont are recorded by their subtype.
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj:
        default_resources = cast(DictionaryObject, obj["/DR"])
        if "/Font" in default_resources:
            # Iterate the font dictionaries (values), not the resource names.
            for f in cast(DictionaryObject, default_resources["/Font"]).values():
                process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        appearance = cast(DictionaryObject, obj["/AP"])
        if "/N" in appearance:
            normal = cast(DictionaryObject, appearance["/N"])
            if normal.get("/Type") == "/XObject":
                _get_fonts_walk(normal, fnt, emb)
            else:
                # Walk the entries themselves; iterating the dictionary would
                # yield key names, which are strings and not walkable.
                for state in normal.values():
                    state = state.get_object()
                    if isinstance(state, DictionaryObject):
                        _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page