Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

922 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from copy import deepcopy 

33from dataclasses import asdict, dataclass 

34from decimal import Decimal 

35from io import BytesIO 

36from pathlib import Path 

37from typing import ( 

38 Any, 

39 Callable, 

40 Literal, 

41 Optional, 

42 Union, 

43 cast, 

44 overload, 

45) 

46 

47from ._font import Font 

48from ._protocols import PdfCommonDocProtocol 

49from ._text_extraction import ( 

50 _layout_mode, 

51) 

52from ._text_extraction._text_extractor import TextExtraction 

53from ._utils import ( 

54 CompressedTransformationMatrix, 

55 TransformationMatrixType, 

56 _human_readable_bytes, 

57 deprecate, 

58 logger_warning, 

59 matrix_multiply, 

60) 

61from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

62from .constants import AnnotationDictionaryAttributes as ADA 

63from .constants import ImageAttributes as IA 

64from .constants import PageAttributes as PG 

65from .constants import Resources as RES 

66from .errors import PageSizeNotDefinedError, PdfReadError 

67from .generic import ( 

68 ArrayObject, 

69 ContentStream, 

70 DictionaryObject, 

71 EncodedStreamObject, 

72 FloatObject, 

73 IndirectObject, 

74 NameObject, 

75 NullObject, 

76 NumberObject, 

77 PdfObject, 

78 RectangleObject, 

79 StreamObject, 

80 is_null_or_none, 

81) 

82 

83try: 

84 from PIL.Image import Image 

85 

86 pil_not_imported = False 

87except ImportError: 

88 Image = object # type: ignore[assignment,misc,unused-ignore] # TODO: Remove unused-ignore on Python 3.10 

89 pil_not_imported = True # error will be raised only when using images 

90 

# Lower-case name of a page box.
# NOTE(review): presumably selects the box used as reference when pages are
# merged — the code using it is not visible here, confirm before relying on it.
MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

92 

93 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Look up the box ``name`` on ``self`` and return it as a rectangle.

    When the entry is missing or null, the keys in ``defaults`` are tried in
    order and the first one yielding a non-``None`` value is used. The result
    is stored back under ``name`` as a ``RectangleObject``, so the next lookup
    returns immediately.

    Args:
        self: Dictionary-like owner of the boxes. It must offer ``get`` and
            item assignment and, for indirect values, a ``pdf`` attribute.
        name: Key of the requested box.
        defaults: Keys to fall back to, in order of preference.

    Returns:
        The box as a ``RectangleObject``.

    """
    box: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box

    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break

    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)

    if isinstance(box, ArrayObject) and (count := len(box)) > 4:
        # Too many entries: warn and keep only the first four.
        logger_warning(f"Expected four values, got {count}: {box}", __name__)
        rectangle = RectangleObject(tuple(box[:4]))
    else:
        rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

112 

113 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """
    Store ``value`` on ``self`` under the PDF name ``name``.

    Args:
        self: Object supporting item assignment.
        name: Key of the box; it is wrapped in a ``NameObject``.
        value: The value to store.

    """
    key = NameObject(name)
    self[key] = value

116 

117 

118def _delete_rectangle(self: Any, name: str) -> None: 

119 del self[name] 

120 

121 

122def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

123 return property( 

124 lambda self: _get_rectangle(self, name, fallback), 

125 lambda self, value: _set_rectangle(self, name, value), 

126 lambda self: _delete_rectangle(self, name), 

127 ) 

128 

129 

class Transformation:
    """
    A 2D affine transformation.

    A change of coordinate system is described by a 3-by-3 matrix of the
    following form::

        a b 0
        c d 0
        e f 1

    Only six of the nine entries can vary, which is why PDF usually writes
    the matrix as the six-element array [ a b c d e f ]. The ``ctm``
    attribute holds that compressed form.

    A point is transformed by multiplying its row vector with the matrix::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity matrix in compressed form.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        The full 3-by-3 matrix, expanded from the compressed form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 matrix to its six variable entries.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The first two entries of each of the three rows, i.e.
            (a, b, c, d, e, f).

        """
        first_row, second_row, third_row = matrix[0], matrix[1], matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _to_cm(self) -> str:
        # Six coefficients with four decimals each, followed by the ``cm`` operator.
        coefficients = [f"{self.ctm[index]:.4f}" for index in range(6)]
        return " ".join(coefficients) + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The Transformation whose matrix is multiplied onto this one
                (``self.matrix × m.matrix``).

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: Offset added to the ``e`` entry (x-axis).
            ty: Offset added to the ``f`` entry (y-axis).

        Returns:
            A new ``Transformation`` instance

        """
        a, b, c, d, e, f = (self.ctm[index] for index in range(6))
        return Transformation(ctm=(a, b, c, d, e + tx, f + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis; defaults to ``sy``.
            sy: The scale factor along the y-axis; defaults to ``sx``.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        # A single given factor applies to both axes.
        factor_x = sy if sx is None else sx
        factor_y = sx if sy is None else sy
        assert factor_x is not None
        assert factor_y is not None
        scaling: TransformationMatrixType = (
            (factor_x, 0, 0),
            (0, factor_y, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: The point as (x, y), given as a tuple or a list.
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list when ``pt`` is a list,
            a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        x, y = float(pt[0]), float(pt[1])
        a, b, c, d, e, f = (self.ctm[index] for index in range(6))
        new_x = convert(x * a + y * c + e)
        new_y = convert(x * b + y * d + f)
        if isinstance(pt, list):
            return [new_x, new_y]
        return (new_x, new_y)

324 

325 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image object
        of that page replaces the stored object, and ``name``, ``data`` and
        ``image`` of this instance are refreshed from it.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline (it has no indirect reference).
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # Only objects offering ``_id_translated`` are accepted as owner.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        # Let pillow write a one-page PDF and take over the image object of that page.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension. A name without any "." is kept whole: slicing at
        # ``rfind(".")`` would be slicing at -1 and drop its last character.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as ``__str__`` with a hash of the data inserted before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

416 

417 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    The list is virtual: identifiers come from ``ids_function`` and every
    image is built on demand by ``get_function``.
    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        """
        Args:
            ids_function: Returns the identifiers of all available images.
            get_function: Builds the ``ImageFile`` for one identifier.

        """
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Fetch one image, or a sub-list of images.

        Args:
            index: A position (negative values count from the end), a slice,
                or an image identifier (string, list or tuple) that is handed
                to ``get_function`` unchanged.

        Returns:
            The image, or for a slice a new virtual list over the selected
            identifiers.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        lst = self.ids_function()
        if isinstance(index, slice):
            # Reuse the list fetched above rather than asking ids_function again.
            indices = range(*index.indices(len(lst)))
            lst = [lst[x] for x in indices]
            cls = type(self)
            return cls((lambda: lst), self.get_function)
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once. Going through ``self[i]`` would call
        # ids_function again for every single element.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

479 

480 

481class PageObject(DictionaryObject): 

482 """ 

483 PageObject represents a single page within a PDF file. 

484 

485 Typically these objects will be created by accessing the 

486 :attr:`pages<pypdf.PdfReader.pages>` property of the 

487 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

488 also possible to create an empty page with the 

489 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

490 

491 Args: 

492 pdf: PDF file the page belongs to. 

493 indirect_reference: Stores the original indirect reference to 

494 this object in its source PDF 

495 

496 """ 

497 

498 original_page: "PageObject" # very local use in writer when appending 

499 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page, optionally filled from an existing object.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object in its source PDF.
            Unless it is null or ``None``, the entries of the referenced
            dictionary are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    self._font_width_maps: dict[str, tuple[dict[str, float], str, float]] = {}
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source_page = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source_page)

513 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as type marker, not
    with the page class, so it matches what a DictionaryObject with the
    same entries returns.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))

528 

def hash_value_data(self) -> bytes:
    """
    Return the parent's hash data with this object's ``id()`` appended.

    Returns:
        The bytes from ``super().hash_value_data()`` followed by the decimal
        ``id(self)``, which makes the data specific to this instance.

    """
    instance_marker = f"{id(self)}".encode()
    return super().hash_value_data() + instance_marker

533 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is 1/72 inch: a value of 1 means one user space unit is
    1/72 inch, a value of 3 means it is 3/72 inch. When the page has no
    such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

544 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the media
    box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        page_count = 0 if pdf is None else len(pdf.pages)
        if pdf is None or page_count == 0:
            raise PageSizeNotDefinedError
        last_box = pdf.pages[page_count - 1].mediabox
        width = last_box.width
        height = last_box.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

590 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of all images reachable from ``obj``.

    Image XObjects found directly in ``obj`` are identified by their name;
    images inside form XObjects are identified by the list of names leading
    to them. The identifiers of the page's inline images are appended once,
    at the end.

    Args:
        obj: Dictionary whose ``/Resources`` are scanned; the page itself
            when ``None``.
        ancest: Names of the form XObjects already descended into.
        call_stack: Indirect references already visited during this walk;
            an object met a second time yields nothing.

    Returns:
        The identifiers, XObject images first, inline images last.

    """
    if call_stack is None:
        call_stack = []
    reference = getattr(obj, "indirect_reference", None)
    if reference in call_stack:
        # Already visited; stops form XObjects that reference each other in a loop.
        return []
    call_stack.append(reference)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # Inline images are loaded from the page's content (_get_inline_images),
    # not from the form being scanned. Only the outermost call (no ancestors)
    # reports them; otherwise every nested form would list them again.
    inline_ids: list[Union[str, list[str]]] = []
    if not ancest:
        inline_ids.extend(self.inline_images.keys())

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][IA.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

628 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` for one image identifier.

    Args:
        id: Identifier of the image: the name of an XObject, an inline image
            key (a string starting and ending with ``~``), or a list/tuple of
            names where all but the last one are XObjects to descend into.
        obj: Dictionary whose ``/Resources`` → XObject entry is searched;
            the page itself when ``None``.

    Returns:
        The image. For an XObject its name is ``id`` without the first
        character, followed by the extension reported by ``_xobj_to_image``.

    Raises:
        KeyError: If ``obj`` has no XObject resources and ``id`` is not an
            inline image key, or if the requested entry does not exist.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    # Normalize: a tuple becomes a list, a one-element list becomes its item.
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
        )
    except KeyError:
        # Missing resources are tolerated for inline image keys only; in that
        # case ``xobjs`` stays unbound and is not used by the inline branch below.
        if not (id[0] == "~" and id[-1] == "~"):
            raise
    if isinstance(id, str):
        if id[0] == "~" and id[-1] == "~":
            # Inline image: loaded on first use and then served from the cache.
            if self.inline_images is None:
                self.inline_images = self._get_inline_images()
            if self.inline_images is None:  # pragma: no cover
                raise KeyError("No inline image can be found")
            return self.inline_images[id]

        # Imported here, i.e. only once an XObject image is actually requested.
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
        extension, byte_stream = imgd[:2]
        return ImageFile(
            name=f"{id[1:]}{extension}",
            data=byte_stream,
            image=imgd[2],
            indirect_reference=xobjs[id].indirect_reference,
        )
    # in a subobject
    # The first name selects the XObject to descend into, the rest is resolved inside it.
    ids = id[1:]
    return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

667 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    An image can be addressed by:
      - a string (image at the top level of the page),
      - a list or tuple of strings (image within XObject forms),
      - an integer position.

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    Every item is an ImageFile with the following properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    load_image = self._get_image
    return VirtualListImages(list_ids, load_image)

707 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image dictionary.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: The value as found in the inline image.

    Returns:
        The mapped name if ``v`` is in ``_INLINE_IMAGE_VALUE_MAPPING``;
        otherwise, for a name, the matching ``/ColorSpace`` resource of the
        page; otherwise ``v`` unchanged.

    Raises:
        PdfReadError: If ``v`` is a name found neither in the mapping nor in
            the page's ``/ColorSpace`` resources.

    """
    try:
        translated: PdfObject = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            page_resources = cast(DictionaryObject, self["/Resources"])
            color_spaces = cast(DictionaryObject, page_resources["/ColorSpace"])
            translated = color_spaces[v]
        except KeyError:  # missing resources, color spaces or entry
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
    return translated

722 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images of the page content.

    Returns:
        The images keyed ``~0~``, ``~1~``, ... in order of appearance; an
        empty dictionary when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect the raw inline images from the operations.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append(
                {"settings": operands["settings"], "__streamdata__": operands["data"]}
            )
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each of them into a stream object, then into an image.
    images = {}
    for index, entry in enumerate(collected):
        stream_data = entry["__streamdata__"]
        attributes = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for short_key, raw_value in entry["settings"].items():
            if short_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(short_key, item) for item in raw_value]
                )
            else:
                value = self._translate_value_inline_image(short_key, raw_value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[short_key])
            # The first value stored for a key wins.
            attributes.setdefault(full_key, value)
        stream_object = EncodedStreamObject.initialize_from_dictionary(attributes)
        entry["object"] = stream_object
        # Imported here so that it only happens once an inline image is present.
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
        extension, byte_stream, img = _xobj_to_image(stream_object)
        images[f"~{index}~"] = ImageFile(
            name=f"~{index}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return images

768 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Reading returns 0 when the page has no rotation entry. Assigning rounds
    the value to the nearest multiple of 90 and reduces it modulo 360.
    """
    stored = self.get(PG.ROTATE, 0)
    if isinstance(stored, int):
        return stored
    return stored.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

783 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0; content and boxes are rotated around
    the center of the media box and shifted so that the rotated media box
    starts at the origin.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the other way
    self.rotation = 0
    media = RectangleObject(self.mediabox)
    center_x = float(media.left + media.width / 2)
    center_y = float(media.bottom + media.height / 2)
    transformation = Transformation().translate(-center_x, -center_y).rotate(angle)
    corner_a = transformation.apply_on(media.lower_left)
    corner_b = transformation.apply_on(media.upper_right)
    transformation = transformation.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(transformation, False)
    for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = transformation.apply_on(box.lower_left)
        corner_b = transformation.apply_on(box.upper_right)
        xs = (corner_a[0], corner_b[0])
        ys = (corner_a[1], corner_b[1])
        # The rotation may swap the corners, so re-order them.
        self[NameObject(box_name)] = RectangleObject(
            (min(xs), min(ys), max(xs), max(ys))
        )

818 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject (the page itself)

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

834 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge one resource category of ``res2`` into the one of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the category to merge inside both dictionaries.
        new_res1: If True, work on a fresh copy of ``res1[resource]`` (empty
            when missing); if False, modify ``res1[resource]`` in place.

    Returns:
        A tuple ``(merged, renames)``: the merged dictionary with its entries
        re-inserted in sorted key order, and a mapping from the keys of
        ``res2`` that had to be renamed to their new names.

    """
    # Find out whether this page is owned by a writer: values are cloned into
    # it in that case. Any failure here simply means "no writer".
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        is_pdf_writer = hasattr(
            pdf, "_add_object"
        )  # expect isinstance(pdf, PdfWriter)
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Reads ``page2res`` and ``new_res``, which are bound further down,
        before the first call.

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )
    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if not same_value:
            if is_pdf_writer:
                new_res[newname] = page2res.raw_get(key).clone(pdf)
                # Store the reference instead of the clone when the clone has
                # one; objects without that attribute are stored directly.
                try:
                    new_res[newname] = new_res[newname].indirect_reference
                except AttributeError:
                    pass
            else:
                new_res[newname] = page2res.raw_get(key)
    # Re-insert all entries in sorted key order.
    lst = sorted(new_res.items())
    new_res.clear()
    for el in lst:
        new_res[el[0]] = el[1]
    return new_res, rename_res

915 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Rename the name operands of a content stream.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names.
        pdf: Passed on to the ``ContentStream`` that is built.

    Returns:
        ``stream`` itself when ``rename`` is empty; otherwise a new
        ``ContentStream`` in which every ``NameObject`` operand found in
        ``rename`` is replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a
            dictionary.

    """
    if not rename:
        return stream
    renamed_stream = ContentStream(stream, pdf)
    for operands, _operator in renamed_stream.operations:
        if isinstance(operands, list):
            entries = enumerate(operands)
        elif isinstance(operands, dict):
            entries = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Values are swapped in place; positions/keys stay untouched.
        for position, operand in entries:
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return renamed_stream

937 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: The contents to wrap in a ``ContentStream``.
        pdf: Passed on to the ``ContentStream`` that is built.
        ctm: The values of the matrix, written as operands of ``cm``.

    Returns:
        The new content stream, starting with the ``cm`` operation.

    """
    stream = ContentStream(contents, pdf)
    operations = stream.operations
    cm_operation = [[FloatObject(value) for value in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return stream

954 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it does not exist
        or is a null object. When ``/Contents`` is an array, the data of
        its non-null entries is concatenated in order.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS]
    # Same convention as get_contents(): a null /Contents means "no contents"
    # rather than an AttributeError on the missing get_data().
    if is_null_or_none(obj):
        return None
    obj = obj.get_object()
    if isinstance(obj, list):
        # Null entries carry no data; skip them instead of failing on get_data().
        return b"".join(
            x.get_object().get_data() for x in obj if not is_null_or_none(x)
        )
    return cast(EncodedStreamObject, obj).get_data()

969 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the ``/Contents`` entry, or ``None``
        when the entry is absent or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # A page without a usable indirect reference has no owning document.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

990 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    For a page without an indirect reference, ``content`` is stored directly
    under ``/Contents``. Otherwise the document behind the page's indirect
    reference is used to register or replace the content objects.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Imported locally (see the noqa marker); only needed for the type check below.
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    writer = self.indirect_reference.pdf
    # An existing contents array: replace every referenced entry with a null object.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # A new contents array: register each entry and keep what _add_object returns.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deleting: nothing to do when there is no /Contents entry at all.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing contents object to reuse: register the new content.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Existing contents object: the new content takes over its indirect reference.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1053 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) from both pages are kept. The boxes
    of this page (mediabox, cropbox, ...) stay untouched unless ``expand``
    is set. By default, the content of ``page2`` is appended after the
    content of this page and is therefore drawn "on top" of it.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: Draw the ``page2`` content over this page if True (default),
            under it otherwise.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    self._merge_page(page2, expand=expand, over=over)

1075 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When the document behind this page's indirect reference exposes
    ``_add_object``, the work is delegated to ``_merge_page_writer``.
    Otherwise new resource, annotation and content objects are built and
    assigned to this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before renaming.
        ctm: Matrix forwarded to ``_expand_mediabox`` when ``expand`` is set.
        over: Append the ``page2`` content if True, prepend it otherwise.
        expand: Whether to grow the mediabox to include ``page2``.

    """
    # The delegation deliberately stays inside the ``try``: an AssertionError
    # or AttributeError raised there falls through to the generic merge below.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        owner = self.indirect_reference.pdf
        if hasattr(owner, "_add_object"):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    # Merge the resource dictionaries first: this tells us which symbols of
    # the page2 content stream have to be renamed.
    merged_resources = DictionaryObject()
    renamed_keys = {}
    own_resources = cast(
        DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object()
    )
    other_resources = cast(
        DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object()
    )

    merged_annots = ArrayObject()
    for source_page in (self, page2):
        if PG.ANNOTS not in source_page:
            continue
        page_annots = source_page[PG.ANNOTS]
        if isinstance(page_annots, ArrayObject):
            merged_annots.extend(page_annots)

    resource_kinds = (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    )
    for kind in resource_kinds:
        merged, kind_renames = self._merge_resources(
            own_resources, other_resources, kind
        )
        if merged:
            merged_resources[NameObject(kind)] = merged
            renamed_keys.update(kind_renames)

    # Combine /ProcSet sets, making sure there's a consistent order
    own_proc_set = set(own_resources.get(RES.PROC_SET, ArrayObject()).get_object())
    other_proc_set = set(other_resources.get(RES.PROC_SET, ArrayObject()).get_object())
    merged_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(own_proc_set | other_proc_set)
    )

    merged_content = ArrayObject()
    own_content = self.get_contents()
    if own_content is not None:
        own_content.isolate_graphics_state()
        merged_content.append(own_content)

    other_content = page2.get_contents()
    if other_content is not None:
        # Prefix the page2 stream with "re W n" built from its merge crop box.
        clip = getattr(page2, MERGE_CROP_BOX)
        clip_operands = map(
            FloatObject, [clip.left, clip.bottom, clip.width, clip.height]
        )
        other_content.operations.insert(0, (clip_operands, b"re"))
        other_content.operations.insert(1, ([], b"W"))
        other_content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            other_content = page2transformation(other_content)
        other_content = PageObject._content_stream_rename(
            other_content, renamed_keys, self.pdf
        )
        other_content.isolate_graphics_state()
        if over:
            merged_content.append(other_content)
        else:
            merged_content.insert(0, other_content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(merged_content, self.pdf))
    self[NameObject(PG.RESOURCES)] = merged_resources
    self[NameObject(PG.ANNOTS)] = merged_annots
    return None

1182 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources of ``page2`` are merged into this page's ``/Resources``,
    annotations of ``page2`` are cloned into the document behind this
    page's indirect reference (with ``ctm`` applied to ``/Rect`` and
    ``/QuadPoints``), and the content streams are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before renaming.
        ctm: Matrix applied to the annotation coordinates and forwarded to
            ``_expand_mediabox``; ``None`` means a default ``Transformation()``.
        over: Append the ``page2`` content if True, prepend it otherwise.
        expand: Whether to grow the mediabox to include ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and re-normalise the rectangle.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # /QuadPoints holds 8 x n numbers (one quadrilateral per 8
                # values); transform every (x, y) pair so that no
                # quadrilateral beyond the first one is dropped.
                q = cast(ArrayObject, a["/QuadPoints"])
                transformed_points = []
                for i in range(0, len(q) - 1, 2):
                    transformed_points.extend(
                        trsf.apply_on((q[i], q[i + 1]), True)
                    )
                aa[NameObject("/QuadPoints")] = ArrayObject(transformed_points)
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Prefix the page2 stream with "re W n" built from its merge crop box.
        rect = getattr(page2, MERGE_CROP_BOX)
        page2content.operations.insert(
            0,
            (
                map(
                    FloatObject,
                    [
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    ],
                ),
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1320 

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: The page whose mediabox corners have to fit.
        ctm: Optional matrix applied to the four corners of the ``page2``
            mediabox before the bounds are computed.

    """
    own_box = self.mediabox
    own_left = own_box.left.as_numeric()
    own_bottom = own_box.bottom.as_numeric()
    own_right = own_box.right.as_numeric()
    own_top = own_box.top.as_numeric()

    other_box = page2.mediabox
    left = other_box.left.as_numeric()
    bottom = other_box.bottom.as_numeric()
    right = other_box.right.as_numeric()
    top = other_box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    if ctm is None:
        xs = tuple(x for x, _ in corner_points)
        ys = tuple(y for _, y in corner_points)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in corner_points
        )
        ys = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in corner_points
        )

    # Union of this page's box and the bounding box of the (transformed) corners.
    lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

    self.mediabox.lower_left = lower_left
    self.mediabox.upper_right = upper_right

1363 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix; a ``Transformation`` object is accepted
            as well, in which case its ``ctm`` attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def transform_page2(page2_content: Any) -> ContentStream:
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, transform_page2, matrix, over, expand)

1395 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the merged
    stream is scaled through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1413 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the merged
    stream is rotated through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1435 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the merged
    stream is translated through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1459 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    page_content = self.get_contents()
    if page_content is not None:
        page_content = PageObject._add_transformation_matrix(
            page_content, self.pdf, matrix
        )
        page_content.isolate_graphics_state()
        self.replace_contents(page_content)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    values = tuple(float(value) for value in matrix)
    xs = [values[0] * x + values[2] * y + values[4] for x, y in corner_points]
    ys = [values[1] * x + values[3] * y + values[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1509 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    Besides the content, this rescales the page boundaries (bleedbox,
    trimbox, artbox, cropbox, mediabox), the ``/Rect`` arrays of the
    annotations and the ``/BBox`` of the viewport entry (the first one
    when the entry is an array).

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))

    # The order matters: keep the same sequence of box updates.
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Factor to apply to each of the four rectangle coordinates.
    axis_factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[ADA.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                for index, factor in enumerate(axis_factors):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        is_array = isinstance(viewport, ArrayObject)
        if is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            tuple(
                float(bbox[index]) * factor
                for index, factor in enumerate(axis_factors)
            )
        )
        holder = self[NameObject(PG.VP)]
        if is_array:
            holder = holder[NumberObject(0)]  # type: ignore
        holder[NameObject("/BBox")] = scaled_bbox  # type: ignore

1563 

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    self.scale(sx=factor, sy=factor)

1574 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested dimensions divided by the current
    mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    current_box = self.mediabox
    horizontal_factor = width / float(current_box.width)
    vertical_factor = height / float(current_box.height)
    self.scale(horizontal_factor, vertical_factor)

1588 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value forwarded to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own indirect reference nor through a document
            exposing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    compressed = content.flate_encode(level)
    try:
        # Overwrite the object slot the content's indirect reference points to.
        content.indirect_reference.pdf._objects[  # type: ignore
            content.indirect_reference.idnum - 1  # type: ignore
        ] = compressed
    except AttributeError:
        page_reference = self.indirect_reference
        attached_to_writer = page_reference is not None and hasattr(
            page_reference.pdf, "_add_object"
        )
        if not attached_to_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1611 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        The index of this page in the ``pages`` of the document behind its
        indirect reference; None if the page has no indirect reference or
        is not found in that sequence.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1628 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page content operations and fonts.

    Returns:
        One line per content stream operation (operator, the ``str`` items
        of a ``TJ`` operand, and the operands' repr), a separator line, and
        for every ``/Font`` resource its name, its repr and, when
        available, its ``/Encoding`` repr and decoded ``/ToUnicode`` data.
        ``"No Font\n"`` is appended when a ``KeyError`` occurs while
        reading the fonts.

    """
    chunks = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        if operator == b"TJ":
            strings = [item for item in operands[0] if isinstance(item, str)]
        else:
            strings = []
        chunks.append(
            operator.decode("utf-8") + " " + "".join(strings) + repr(operands) + "\n"
        )
    chunks.append("\n=============================\n")
    try:
        font_dict = self[PG.RESOURCES]["/Font"]  # type:ignore
        for font_name in font_dict:
            chunks.append(font_name + "\n")
            chunks.append(repr(font_dict[font_name]) + "\n")
            # Both extra entries are optional: any failure just skips them.
            try:
                chunks.append(repr(font_dict[font_name]["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                chunks.append(
                    font_dict[font_name]["/ToUnicode"].get_data().decode() + "\n"
                )
            except Exception:
                pass
    except KeyError:
        chunks.append("No Font\n")
    return "".join(chunks)

1666 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by replaying its content stream operations.

    See extract_text for most arguments.

    Args:
        obj: The object whose resources and content are read.
        pdf: Passed to ``ContentStream`` when the content has to be wrapped.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The accumulated extractor output; an empty string when no
        ``/Resources`` can be found or no content can be read.

    """
    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        not is_null_or_none(resources_dict)
        and "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(" ", 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                # Font entries that cannot be processed are skipped.
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Replayed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Replayed as Tw, Tc, T* and Tj using the three operands in order.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A large enough numeric adjustment is rendered as a single
                    # space, unless the current text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Replayed as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty: there is nothing to terminate.
                pass
            try:
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Always reset the pending text and memorised matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1829 

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The ``/Font`` resources of the page and of each of its ``/Parent``
    nodes are collected. When the same font name appears at several
    levels, the definition closest to the page is kept.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    objr: Any = self
    fonts: dict[str, Font] = {}
    while objr is not None:
        try:
            resources_dict: Any = objr[PG.RESOURCES]
        except KeyError:
            resources_dict = {}
        if "/Font" in resources_dict and self.pdf is not None:
            font_dict = resources_dict["/Font"]
            for font_name in font_dict:
                # The walk goes from the page up to its ancestors, so an
                # already registered name comes from a nearer node and must
                # not be overwritten by an ancestor's entry.
                if font_name not in fonts:
                    fonts[font_name] = Font.from_font_resource(font_dict[font_name])
        try:
            objr = objr["/Parent"].get_object()
        except KeyError:
            objr = None

    return fonts

1855 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned when the page has no ``/Contents`` entry or when
        no text show operations are found.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("fonts.json").write_text(
            json.dumps(fonts, indent=2, default=asdict),
            "utf-8"
        )

    # /Contents is optional in a page dictionary; a page without it is an
    # empty page, so there is nothing to extract (and nothing to index into).
    if "/Contents" not in self:
        return ""

    ops = iter(
        ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
    )
    bt_groups = _layout_mode.text_show_operations(
        ops, fonts, strip_rotated, debug_path
    )

    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

    return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

1914 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by walking its text drawing commands in
    content stream order.

    Results are good for some PDF files and poor for others, depending on
    the generator that produced them. This will be refined in the future,
    so do not rely on the order in which text comes out of this function.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Visitor callbacks can be supplied to be informed about every operation
    and every text object; in some PDF files this is useful to parse tables.

    Positional arguments are accepted in two shapes: ``(orientations,
    space_width)``, or a leading string followed by one more value that is
    not inspected, then ``orientations`` and ``space_width``.

    Args:
        orientations: orientation, or tuple of orientations, extract_text
            will look for. Default = (0, 90, 180, 270).
            Note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right).
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode; tell the caller about
        # each one that was supplied instead of dropping it silently.
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for label, callback in supplied_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {label} is ignored in layout mode",
                    __name__,
                )
        layout_options = {
            "space_vertically": kwargs.get("layout_mode_space_vertically", True),
            "scale_weight": kwargs.get("layout_mode_scale_weight", 1.25),
            "strip_rotated": kwargs.get("layout_mode_strip_rotated", True),
            "debug_path": kwargs.get("layout_mode_debug_path"),
            "font_height_weight": kwargs.get("layout_mode_font_height_weight", 1),
        }
        return self._layout_mode_text(**layout_options)

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # String-led form: positions 0 and 1 are skipped, position 2
            # holds the orientations and position 3 the space width.
            if len(args) >= 3:
                if not isinstance(args[2], (tuple, int)):
                    raise TypeError(f"Invalid positional parameter {args[2]}")
                orientations = args[2]
            if len(args) >= 4:
                if not isinstance(args[3], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[3]}")
                space_width = args[3]
        elif isinstance(leading, (tuple, int)):
            # Short form: orientations first, optionally the space width.
            orientations = leading
            if len(args) >= 2:
                if not isinstance(args[1], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[1]}")
                space_width = args[1]
        else:
            raise TypeError(f"Invalid positional parameter {args[0]}")

    # A single orientation is accepted as a bare int; normalize to a tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2050 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The XObject itself is handed to the text extractor as the content
    source (no content key is looked up on it), together with the PDF this
    page belongs to.

    Args:
        xform: the XObject whose content is to be processed.
        orientations: orientations of text to keep. The default matches
            extract_text: (0, 90, 180, 270). The previous default
            (0, 90, 270, 360) left out 180 (upside down) and listed 360,
            which is not among the orientations extract_text documents.
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: function to be called before processing an
            operation (operator, operand-arguments, current transformation
            matrix, text matrix).
        visitor_operand_after: function to be called after processing an
            operation; same four arguments as visitor_operand_before.
        visitor_text: function to be called when some text is extracted
            (text, current transformation matrix, text matrix,
            font-dictionary, font-size).

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no content key: xform is read directly
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2085 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Report which fonts reachable from this page are embedded and which are not.

    The page dictionary is walked with ``_get_fonts_walk``; every font that
    was seen but not recognised as embedded is reported as unembedded.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded = _get_fonts_walk(page_dict, set(), set())
    return embedded, all_fonts - embedded

2101 

# Page boundary boxes. Each accessor is built by _create_rectangle_accessor
# from the box's dictionary key and a tuple of other keys.
# NOTE(review): the tuple looks like the fallback chain consulted, in order,
# when the box itself is absent (cropbox documents mediabox as its default)
# - confirm against _create_rectangle_accessor.

# No alternative keys are given for the media box.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# The remaining three boxes share the same alternatives: crop box, then media box.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2133 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array stored under ``/Annots``, or None when the page
    has no such entry.
    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Replace or remove the annotations array of the page.

    Assigning None removes the ``/Annots`` entry (doing nothing if there is
    none); any other value is stored under ``/Annots``.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
        return
    if "/Annots" in self:
        del self[NameObject("/Annots")]

2155 

2156 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages that holds no pages itself.

    The length and the individual items are obtained on demand from the two
    callables given at construction. Deleting an item edits the page tree
    that the page's indirect reference belongs to.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Args:
            length_function: called without arguments to obtain the current length.
            get_function: called with a non-negative, in-range index to obtain a page.

        """
        self.length_function = length_function
        self.get_function = get_function
        # not read anywhere within this class
        self.current = -1

    def __len__(self) -> int:
        """Return the length reported by ``length_function``."""
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if the (normalized) index is out of range.

        """
        if isinstance(index, slice):
            # The slice is not materialized: the new list maps its own
            # positions onto positions of this list through ``indices``.
            indices = range(*index.indices(len(self)))
            cls = type(self)
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page tree.

        Raises:
            TypeError: if index is neither an int nor a slice.
            IndexError: if the (normalized) index is out of range.
            PdfReadError: if the page is not listed in its parent's ``/Kids``.

        """
        if isinstance(index, slice):
            r = list(range(*index.indices(len(self))))
            # pages have to be deleted from last to first
            r.sort()
            r.reverse()
            for p in r:
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        # ``first`` stays True until one removal from a /Kids array succeeded;
        # it tells "page never found" apart from "nothing left to remove".
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                first = False
                try:
                    assert ind is not None
                    # NOTE(review): this runs on every pass in which a kid was
                    # removed, including passes where ``ind`` has become an
                    # emptied parent node - it looks like a second entry could
                    # be dropped from flattened_pages in that case; confirm.
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    # -> continue one level up and remove the emptied node itself
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                # a removal already happened and the same node was visited
                # again: the work is done
                break

    def __iter__(self) -> Iterator[PageObject]:
        """Yield the pages in index order, fetching each one on demand."""
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        """Return a placeholder listing such as ``[PageObject(0), PageObject(1)]``."""
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2249 

2250 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The walk covers the fonts listed under ``/DR`` and ``/Resources`` of
    ``obj`` and then recurses into its XObjects, its annotations and the
    normal appearance (``/AP`` ``/N``) of an annotation.

    Args:
        obj: dictionary to inspect (page, XObject, annotation or appearance stream)
        fnt: set collecting the names of all fonts used; updated in place
        emb: set collecting the names of embedded fonts; updated in place

    Returns:
        A tuple (fnt, emb) - the very sets that were passed in

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded if it has '/CharProcs', or if its font
    descriptor (or the font descriptor of its first descendant font) has a key
    called 'FontFilex' (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(descriptor: Any) -> bool:
        """Tell whether a font descriptor carries one of the font file keys."""
        return any(key in cast(DictionaryObject, descriptor) for key in fontkeys)

    def is_embedded(font: DictionaryObject) -> bool:
        """Tell whether the font's glyph data is stored inside the PDF."""
        if "/CharProcs" in font:
            return True
        if "/FontDescriptor" in font and has_font_file(font["/FontDescriptor"]):
            return True
        if "/DescendantFonts" in font:
            # only the first descendant font is examined
            descendant = cast(
                DictionaryObject,
                cast(ArrayObject, font["/DescendantFonts"])[0].get_object(),
            )
            if "/FontDescriptor" in descendant:
                return has_font_file(descendant["/FontDescriptor"])
        return False

    def process_font(f: DictionaryObject) -> None:
        """Record one font in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        if is_embedded(f):
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # no base font name: fall back to the subtype in parentheses
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate the font dictionaries (values); iterating the mapping
        # itself would yield only the resource names.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        if normal.get("/Type") == "/XObject":
            # /N is the appearance stream itself
            _get_fonts_walk(normal, fnt, emb)
        else:
            # /N maps appearance state names to appearance streams: walk the
            # streams (values), resolving references, and skip anything that
            # is not a dictionary.
            for a in normal.values():
                state = a.get_object()
                if isinstance(state, DictionaryObject):
                    _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page