Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

910 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from dataclasses import dataclass 

32from decimal import Decimal 

33from io import BytesIO 

34from pathlib import Path 

35from typing import ( 

36 Any, 

37 Callable, 

38 Dict, 

39 Iterable, 

40 Iterator, 

41 List, 

42 Literal, 

43 Optional, 

44 Sequence, 

45 Set, 

46 Tuple, 

47 Union, 

48 cast, 

49 overload, 

50) 

51 

52from ._cmap import ( 

53 build_char_map, 

54) 

55from ._protocols import PdfCommonDocProtocol 

56from ._text_extraction import ( 

57 _layout_mode, 

58) 

59from ._text_extraction._text_extractor import TextExtraction 

60from ._utils import ( 

61 CompressedTransformationMatrix, 

62 TransformationMatrixType, 

63 _human_readable_bytes, 

64 logger_warning, 

65 matrix_multiply, 

66) 

67from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

68from .constants import AnnotationDictionaryAttributes as ADA 

69from .constants import ImageAttributes as IA 

70from .constants import PageAttributes as PG 

71from .constants import Resources as RES 

72from .errors import PageSizeNotDefinedError, PdfReadError 

73from .filters import _xobj_to_image 

74from .generic import ( 

75 ArrayObject, 

76 ContentStream, 

77 DictionaryObject, 

78 EncodedStreamObject, 

79 FloatObject, 

80 IndirectObject, 

81 NameObject, 

82 NullObject, 

83 NumberObject, 

84 PdfObject, 

85 RectangleObject, 

86 StreamObject, 

87 is_null_or_none, 

88) 

89 

# Pillow is an optional dependency: try to import its ``Image`` class and
# remember whether that worked so image-related code can fail lazily.
try:
    from PIL.Image import Image

    pil_not_imported = False
except ImportError:
    # Fall back to ``object`` so that annotations and ``isinstance`` checks
    # referring to ``Image`` still resolve when Pillow is missing.
    Image = object  # type: ignore
    pil_not_imported = True  # error will be raised only when using images

# Name of the page box used when merging pages.
MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'

99 

100 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Read the rectangle stored under ``name``, falling back to other entries.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the requested rectangle.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The rectangle. Unless the stored value already was a
        ``RectangleObject``, the resolved value is converted to one and
        written back under ``name``.

    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        # Already in its final form: nothing to resolve or to cache.
        return box
    if is_null_or_none(box):
        # Walk the fallbacks and stop at the first entry that is present.
        for fallback_name in defaults:
            box = self.get(fallback_name)
            if box is not None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle

115 

116 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """
    Store ``value`` in ``self`` under ``name``.

    Args:
        self: The dictionary-like object to update.
        name: Key of the entry; it is wrapped in a ``NameObject`` first.
        value: The value to store.

    """
    self[NameObject(name)] = value

120 

121 

122def _delete_rectangle(self: Any, name: str) -> None: 

123 del self[name] 

124 

125 

126def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

127 return property( 

128 lambda self: _get_rectangle(self, name, fallback), 

129 lambda self, value: _set_rectangle(self, name, value), 

130 lambda self: _delete_rectangle(self, name), 

131 ) 

132 

133 

class Transformation:
    """
    Represent a 2D transformation.

    A transformation between two coordinate systems is described by a
    3-by-3 matrix of the following form::

        a b 0
        c d 0
        e f 1

    Only six of the nine elements can vary, so PDF usually writes such a
    matrix as the six-element array [ a b c d e f ].

    Points are transformed by matrix multiplication::

                                 a b 0
     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
                                 e f 1

    Example:
        >>> from pypdf import Transformation
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity matrix in compressed (a, b, c, d, e, f) form.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand the compressed matrix to its full 3-by-3 form.

        Returns:
            A tuple of rows: ((a, b, 0), (c, d, 0), (e, f, 1)).

        """
        ctm = self.ctm
        first_row = (ctm[0], ctm[1], 0)
        second_row = (ctm[2], ctm[3], 0)
        third_row = (ctm[4], ctm[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of rows.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _post_multiply(self, op: TransformationMatrixType) -> "Transformation":
        """Return a new ``Transformation`` for this matrix multiplied by ``op``."""
        product = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(product))

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: The ``Transformation`` to apply on top of this one.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import Transformation
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        return self._post_multiply(m.matrix)

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Shift the contents of a page.

        Args:
            tx: Offset added along the x-axis.
            ty: Offset added along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the translation elements (e, f) change.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        The origin is typically the lower-left corner of the page; translate
        the contents or the page boxes to scale around another point.

        Args:
            sx: The scale factor along the x-axis; defaults to ``sy``.
            sy: The scale factor along the y-axis; defaults to ``sx``.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return self._post_multiply(scaling)

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return self._post_multiply(turning)

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
        ...

    @overload
    def apply_on(
        self, pt: Tuple[float, float], as_object: bool = False
    ) -> Tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[Tuple[float, float], List[float]],
        as_object: bool = False,
    ) -> Union[Tuple[float, float], List[float]]:
        """
        Transform a single point with this matrix.

        Args:
            pt: The point as a tuple or list in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list if ``pt`` was a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        x, y = float(pt[0]), float(pt[1])
        ctm = self.ctm
        moved = (
            convert(x * ctm[0] + y * ctm[2] + ctm[4]),
            convert(x * ctm[1] + y * ctm[3] + ctm[5]),
        )
        if isinstance(pt, list):
            return list(moved)
        return moved

318 

319 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        The new image is saved as a one-page PDF in memory, the image stream
        of that PDF takes the place of the current object in the owning
        writer, and ``name``, ``data`` and ``image`` are refreshed from it.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415

        # to prevent circular import
        from .filters import _xobj_to_image  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        # Resolve the replacement image once instead of re-reading the page's
        # image list for every access.
        new_reference = reader.pages[0].images[0].indirect_reference
        assert new_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            new_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object())
        )
        assert extension is not None
        # Swap the file extension. ``rfind`` returns -1 when the name has no
        # dot; slicing with -1 would drop the last character of the name, so
        # keep the whole name in that case.
        dot_position = self.name.rfind(".")
        stem = self.name if dot_position == -1 else self.name[:dot_position]
        self.name = stem + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Reuse ``__str__`` without its closing parenthesis and append the hash.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

410 

411 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    Args:
        ids_function: Callable returning the identifiers of all images.
        get_function: Callable returning the ``ImageFile`` for one identifier.

    """

    def __init__(
        self,
        ids_function: Callable[[], List[Union[str, List[str]]]],
        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> List[Union[str, List[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        return [(x, self[x]) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, List[str], Tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return one image (by position or identifier) or a sub-sequence.

        Args:
            index: An integer position (negative values count from the end),
                a slice, or an image identifier (string, list or tuple).

        Returns:
            The matching ``ImageFile``, or for a slice a new instance of the
            same class restricted to the selected identifiers.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        # Identifier lookups do not need the list of identifiers, so handle
        # them before paying for ``ids_function()``.
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        lst = self.ids_function()
        if isinstance(index, slice):
            selected = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Enumerate the identifiers once; going through ``self[i]`` would
        # call ``ids_function()`` again for every single element.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

473 

474 

475class PageObject(DictionaryObject): 

476 """ 

477 PageObject represents a single page within a PDF file. 

478 

479 Typically these objects will be created by accessing the 

480 :attr:`pages<pypdf.PdfReader.pages>` property of the 

481 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

482 also possible to create an empty page with the 

483 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

484 

485 Args: 

486 pdf: PDF file the page belongs to. 

487 indirect_reference: Stores the original indirect reference to 

488 this object in its source PDF 

489 

490 """ 

491 

492 original_page: "PageObject" # very local use in writer when appending 

493 

494 def __init__( 

495 self, 

496 pdf: Optional[PdfCommonDocProtocol] = None, 

497 indirect_reference: Optional[IndirectObject] = None, 

498 ) -> None: 

499 DictionaryObject.__init__(self) 

500 self.pdf = pdf 

501 self.inline_images: Optional[Dict[str, ImageFile]] = None 

502 self.indirect_reference = indirect_reference 

503 if not is_null_or_none(indirect_reference): 

504 assert indirect_reference is not None, "mypy" 

505 self.update(cast(DictionaryObject, indirect_reference.get_object())) 

506 self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {} 

507 

508 def hash_bin(self) -> int: 

509 """ 

510 Used to detect modified object. 

511 

512 Note: this function is overloaded to return the same results 

513 as a DictionaryObject. 

514 

515 Returns: 

516 Hash considering type and value. 

517 

518 """ 

519 return hash( 

520 (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items()))) 

521 ) 

522 

523 def hash_value_data(self) -> bytes: 

524 data = super().hash_value_data() 

525 data += b"%d" % id(self) 

526 return data 

527 

528 @property 

529 def user_unit(self) -> float: 

530 """ 

531 A read-only positive number giving the size of user space units. 

532 

533 It is in multiples of 1/72 inch. Hence a value of 1 means a user 

534 space unit is 1/72 inch, and a value of 3 means that a user 

535 space unit is 3/72 inch. 

536 """ 

537 return self.get(PG.USER_UNIT, 1) 

538 

539 @staticmethod 

540 def create_blank_page( 

541 pdf: Optional[PdfCommonDocProtocol] = None, 

542 width: Union[float, Decimal, None] = None, 

543 height: Union[float, Decimal, None] = None, 

544 ) -> "PageObject": 

545 """ 

546 Return a new blank page. 

547 

548 If ``width`` or ``height`` is ``None``, try to get the page size 

549 from the last page of *pdf*. 

550 

551 Args: 

552 pdf: PDF file the page is within. 

553 width: The width of the new page expressed in default user 

554 space units. 

555 height: The height of the new page expressed in default user 

556 space units. 

557 

558 Returns: 

559 The new blank page 

560 

561 Raises: 

562 PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains 

563 no page 

564 

565 """ 

566 page = PageObject(pdf) 

567 

568 # Creates a new page (cf PDF Reference §7.7.3.3) 

569 page.__setitem__(NameObject(PG.TYPE), NameObject("/Page")) 

570 page.__setitem__(NameObject(PG.PARENT), NullObject()) 

571 page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) 

572 if width is None or height is None: 

573 if pdf is not None and len(pdf.pages) > 0: 

574 lastpage = pdf.pages[len(pdf.pages) - 1] 

575 width = lastpage.mediabox.width 

576 height = lastpage.mediabox.height 

577 else: 

578 raise PageSizeNotDefinedError 

579 page.__setitem__( 

580 NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore 

581 ) 

582 

583 return page 

584 

585 def _get_ids_image( 

586 self, 

587 obj: Optional[DictionaryObject] = None, 

588 ancest: Optional[List[str]] = None, 

589 call_stack: Optional[List[Any]] = None, 

590 ) -> List[Union[str, List[str]]]: 

591 if call_stack is None: 

592 call_stack = [] 

593 _i = getattr(obj, "indirect_reference", None) 

594 if _i in call_stack: 

595 return [] 

596 call_stack.append(_i) 

597 if self.inline_images is None: 

598 self.inline_images = self._get_inline_images() 

599 if obj is None: 

600 obj = self 

601 if ancest is None: 

602 ancest = [] 

603 lst: List[Union[str, List[str]]] = [] 

604 if PG.RESOURCES not in obj or RES.XOBJECT not in cast( 

605 DictionaryObject, obj[PG.RESOURCES] 

606 ): 

607 return [] if self.inline_images is None else list(self.inline_images.keys()) 

608 

609 x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore 

610 for o in x_object: 

611 if not isinstance(x_object[o], StreamObject): 

612 continue 

613 if x_object[o][IA.SUBTYPE] == "/Image": 

614 lst.append(o if len(ancest) == 0 else [*ancest, o]) 

615 else: # is a form with possible images inside 

616 lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack)) 

617 assert self.inline_images is not None 

618 lst.extend(list(self.inline_images.keys())) 

619 return lst 

620 

621 def _get_image( 

622 self, 

623 id: Union[str, List[str], Tuple[str]], 

624 obj: Optional[DictionaryObject] = None, 

625 ) -> ImageFile: 

626 if obj is None: 

627 obj = cast(DictionaryObject, self) 

628 if isinstance(id, tuple): 

629 id = list(id) 

630 if isinstance(id, List) and len(id) == 1: 

631 id = id[0] 

632 try: 

633 xobjs = cast( 

634 DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] 

635 ) 

636 except KeyError: 

637 if not (id[0] == "~" and id[-1] == "~"): 

638 raise 

639 if isinstance(id, str): 

640 if id[0] == "~" and id[-1] == "~": 

641 if self.inline_images is None: 

642 self.inline_images = self._get_inline_images() 

643 if self.inline_images is None: # pragma: no cover 

644 raise KeyError("No inline image can be found") 

645 return self.inline_images[id] 

646 

647 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) 

648 extension, byte_stream = imgd[:2] 

649 return ImageFile( 

650 name=f"{id[1:]}{extension}", 

651 data=byte_stream, 

652 image=imgd[2], 

653 indirect_reference=xobjs[id].indirect_reference, 

654 ) 

655 # in a subobject 

656 ids = id[1:] 

657 return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) 

658 

659 @property 

660 def images(self) -> VirtualListImages: 

661 """ 

662 Read-only property emulating a list of images on a page. 

663 

664 Get a list of all images on the page. The key can be: 

665 - A string (for the top object) 

666 - A tuple (for images within XObject forms) 

667 - An integer 

668 

669 Examples: 

670 * `reader.pages[0].images[0]` # return first image 

671 * `reader.pages[0].images['/I0']` # return image '/I0' 

672 * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form 

673 * `for img in reader.pages[0].images:` # loops through all objects 

674 

675 images.keys() and images.items() can be used. 

676 

677 The ImageFile has the following properties: 

678 

679 * `.name` : name of the object 

680 * `.data` : bytes of the object 

681 * `.image` : PIL Image Object 

682 * `.indirect_reference` : object reference 

683 

684 and the following methods: 

685 `.replace(new_image: PIL.Image.Image, **kwargs)` : 

686 replace the image in the pdf with the new image 

687 applying the saving parameters indicated (such as quality) 

688 

689 Example usage: 

690 

691 reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20) 

692 

693 Inline images are extracted and named ~0~, ~1~, ..., with the 

694 indirect_reference set to None. 

695 

696 """ 

697 return VirtualListImages(self._get_ids_image, self._get_image) 

698 

699 def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: 

700 """Translate values used in inline image""" 

701 try: 

702 v = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)]) 

703 except (TypeError, KeyError): 

704 if isinstance(v, NameObject): 

705 # It is a custom name, thus we have to look in resources. 

706 # The only applicable case is for ColorSpace. 

707 try: 

708 res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] 

709 v = cast(DictionaryObject, res)[v] 

710 except KeyError: # for res and v 

711 raise PdfReadError(f"Cannot find resource entry {v} for {k}") 

712 return v 

713 

714 def _get_inline_images(self) -> Dict[str, ImageFile]: 

715 """Load inline images. Entries will be identified as `~1~`.""" 

716 content = self.get_contents() 

717 if is_null_or_none(content): 

718 return {} 

719 imgs_data = [] 

720 assert content is not None, "mypy" 

721 for param, ope in content.operations: 

722 if ope == b"INLINE IMAGE": 

723 imgs_data.append( 

724 {"settings": param["settings"], "__streamdata__": param["data"]} 

725 ) 

726 elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover 

727 raise PdfReadError( 

728 f"{ope!r} operator met whereas not expected, " 

729 "please share use case with pypdf dev team" 

730 ) 

731 files = {} 

732 for num, ii in enumerate(imgs_data): 

733 init = { 

734 "__streamdata__": ii["__streamdata__"], 

735 "/Length": len(ii["__streamdata__"]), 

736 } 

737 for k, v in ii["settings"].items(): 

738 if k in {"/Length", "/L"}: # no length is expected 

739 continue 

740 if isinstance(v, list): 

741 v = ArrayObject( 

742 [self._translate_value_inline_image(k, x) for x in v] 

743 ) 

744 else: 

745 v = self._translate_value_inline_image(k, v) 

746 k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k]) 

747 if k not in init: 

748 init[k] = v 

749 ii["object"] = EncodedStreamObject.initialize_from_dictionary(init) 

750 extension, byte_stream, img = _xobj_to_image(ii["object"]) 

751 files[f"~{num}~"] = ImageFile( 

752 name=f"~{num}~{extension}", 

753 data=byte_stream, 

754 image=img, 

755 indirect_reference=None, 

756 ) 

757 return files 

758 

759 @property 

760 def rotation(self) -> int: 

761 """ 

762 The visual rotation of the page. 

763 

764 This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are 

765 valid values. This property does not affect ``/Contents``. 

766 """ 

767 rotate_obj = self.get(PG.ROTATE, 0) 

768 return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() 

769 

770 @rotation.setter 

771 def rotation(self, r: float) -> None: 

772 self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360) 

773 

774 def transfer_rotation_to_content(self) -> None: 

775 """ 

776 Apply the rotation of the page to the content and the media/crop/... 

777 boxes. 

778 

779 It is recommended to apply this function before page merging. 

780 """ 

781 r = -self.rotation # rotation to apply is in the otherway 

782 self.rotation = 0 

783 mb = RectangleObject(self.mediabox) 

784 trsf = ( 

785 Transformation() 

786 .translate( 

787 -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2) 

788 ) 

789 .rotate(r) 

790 ) 

791 pt1 = trsf.apply_on(mb.lower_left) 

792 pt2 = trsf.apply_on(mb.upper_right) 

793 trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1])) 

794 self.add_transformation(trsf, False) 

795 for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]: 

796 if b in self: 

797 rr = RectangleObject(self[b]) # type: ignore 

798 pt1 = trsf.apply_on(rr.lower_left) 

799 pt2 = trsf.apply_on(rr.upper_right) 

800 self[NameObject(b)] = RectangleObject( 

801 ( 

802 min(pt1[0], pt2[0]), 

803 min(pt1[1], pt2[1]), 

804 max(pt1[0], pt2[0]), 

805 max(pt1[1], pt2[1]), 

806 ) 

807 ) 

808 

809 def rotate(self, angle: int) -> "PageObject": 

810 """ 

811 Rotate a page clockwise by increments of 90 degrees. 

812 

813 Args: 

814 angle: Angle to rotate the page. Must be an increment of 90 deg. 

815 

816 Returns: 

817 The rotated PageObject 

818 

819 """ 

820 if angle % 90 != 0: 

821 raise ValueError("Rotation angle must be a multiple of 90") 

822 self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle) 

823 return self 

824 

    def _merge_resources(
        self,
        res1: DictionaryObject,
        res2: DictionaryObject,
        resource: Any,
        new_res1: bool = True,
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Merge the entries of one resource category from ``res2`` into ``res1``.

        Args:
            res1: Resources receiving the entries.
            res2: Resources providing the entries.
            resource: Key of the resource category to merge.
            new_res1: If True, merge into a new dictionary initialised from
                ``res1[resource]`` (empty if missing); otherwise
                ``res1[resource]`` itself is modified and must exist.

        Returns:
            A tuple ``(merged, renames)``: the merged dictionary with its
            entries in sorted order, and a mapping from each ``res2`` key
            that had to be renamed to the name now used for it.

        """
        # Find out whether the page belongs to an object offering
        # ``_add_object``; only then are merged values cloned into it.
        try:
            assert isinstance(self.indirect_reference, IndirectObject)
            pdf = self.indirect_reference.pdf
            is_pdf_writer = hasattr(
                pdf, "_add_object"
            )  # expect isinstance(pdf, PdfWriter)
        except (AssertionError, AttributeError):
            pdf = None
            is_pdf_writer = False

        # NOTE: this closure reads ``page2res`` and ``new_res``, which are
        # only assigned further down; it must not be called before that.
        def compute_unique_key(base_key: str) -> Tuple[str, bool]:
            """
            Find a key that either doesn't already exist or has the same value
            (indicated by the bool)

            Args:
                base_key: An index is added to this to get the computed key

            Returns:
                A tuple (computed key, bool) where the boolean indicates
                if there is a resource of the given computed_key with the same
                value.

            """
            value = page2res.raw_get(base_key)
            # TODO: a possible improvement for writer, the indirect_reference
            # cannot be found because translated

            # try the current key first (e.g. "foo"), but otherwise iterate
            # through "foo-0", "foo-1", etc. new_res can contain only finitely
            # many keys, thus this'll eventually end, even if it's been crafted
            # to be maximally annoying.
            computed_key = base_key
            idx = 0
            while computed_key in new_res:
                if new_res.raw_get(computed_key) == value:
                    # there's already a resource of this name, with the exact
                    # same value
                    return computed_key, True
                computed_key = f"{base_key}-{idx}"
                idx += 1
            return computed_key, False

        if new_res1:
            # Work on a copy so that ``res1`` is left untouched.
            new_res = DictionaryObject()
            new_res.update(res1.get(resource, DictionaryObject()).get_object())
        else:
            new_res = cast(DictionaryObject, res1[resource])
        page2res = cast(
            DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
        )
        rename_res = {}
        for key in page2res:
            unique_key, same_value = compute_unique_key(key)
            newname = NameObject(unique_key)
            if key != unique_key:
                # we have to use a different name for this
                rename_res[key] = newname

            if not same_value:
                if is_pdf_writer:
                    new_res[newname] = page2res.raw_get(key).clone(pdf)
                    # Store the indirect reference of the clone when it has one.
                    try:
                        new_res[newname] = new_res[newname].indirect_reference
                    except AttributeError:
                        pass
                else:
                    new_res[newname] = page2res.raw_get(key)
        # Rebuild the dictionary so that its entries end up in sorted order.
        lst = sorted(new_res.items())
        new_res.clear()
        for el in lst:
            new_res[el[0]] = el[1]
        return new_res, rename_res

905 

906 @staticmethod 

907 def _content_stream_rename( 

908 stream: ContentStream, 

909 rename: Dict[Any, Any], 

910 pdf: Optional[PdfCommonDocProtocol], 

911 ) -> ContentStream: 

912 if not rename: 

913 return stream 

914 stream = ContentStream(stream, pdf) 

915 for operands, _operator in stream.operations: 

916 if isinstance(operands, list): 

917 for i, op in enumerate(operands): 

918 if isinstance(op, NameObject): 

919 operands[i] = rename.get(op, op) 

920 elif isinstance(operands, dict): 

921 for i, op in operands.items(): 

922 if isinstance(op, NameObject): 

923 operands[i] = rename.get(op, op) 

924 else: 

925 raise KeyError(f"Type of operands is {type(operands)}") 

926 return stream 

927 

928 @staticmethod 

929 def _add_transformation_matrix( 

930 contents: Any, 

931 pdf: Optional[PdfCommonDocProtocol], 

932 ctm: CompressedTransformationMatrix, 

933 ) -> ContentStream: 

934 """Add transformation matrix at the beginning of the given contents stream.""" 

935 contents = ContentStream(contents, pdf) 

936 contents.operations.insert( 

937 0, 

938 [ 

939 [FloatObject(x) for x in ctm], 

940 b"cm", 

941 ], 

942 ) 

943 return contents 

944 

945 def _get_contents_as_bytes(self) -> Optional[bytes]: 

946 """ 

947 Return the page contents as bytes. 

948 

949 Returns: 

950 The ``/Contents`` object as bytes, or ``None`` if it doesn't exist. 

951 

952 """ 

953 if PG.CONTENTS in self: 

954 obj = self[PG.CONTENTS].get_object() 

955 if isinstance(obj, list): 

956 return b"".join(x.get_object().get_data() for x in obj) 

957 return cast(EncodedStreamObject, obj).get_data() 

958 return None 

959 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the ``/Contents`` entry, or ``None``
        when the entry is absent or null. ``/Contents`` is optional, as
        described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # The owning document is optional: fall back to None when the page has
    # no usable indirect reference.
    try:
        pdf = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        pdf = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), pdf)

980 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    When the page has no indirect reference the content is stored directly
    under ``/Contents``. Otherwise the content is registered with, or written
    over an existing slot of, the document reached through
    ``self.indirect_reference.pdf``.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        # Note: this early return skips the inline_images reset at the end.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Try to replace each stream of the previous /Contents array by a null object.
        # NOTE(review): this reads ``self._objects`` whereas the rest of the method
        # uses ``self.indirect_reference.pdf._objects``; if the page has no
        # ``_objects`` attribute the AttributeError handler turns this loop into
        # a no-op - confirm which behavior is intended.
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    if isinstance(content, ArrayObject):
        # Register every entry of the new array with the document and keep
        # what _add_object returns for each of them.
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion requested: nothing to do if there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Overwrite the document slot of the old content, then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing /Contents object exposing an indirect reference:
        # add the content to the document as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the slot of the existing /Contents object for the new content.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1039 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered.
    With the default ``over=True`` the content stream of ``page2`` is added
    to the end of this page's content stream, meaning that it is drawn
    after, or "on top" of, this page; with ``over=False`` it is placed
    before this page's content instead.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.
        over: set the page2 content over page1 if True (default) else under

    """
    merge_options = {"over": over, "expand": expand}
    self._merge_page(page2, **merge_options)

1061 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page (resources, annotations and contents).

    When this page's document exposes ``_add_object`` the work is delegated
    to ``_merge_page_writer``; otherwise new resources, annotations and
    contents are built here and stored on this page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is renamed and merged.
        ctm: Transformation matrix forwarded to ``_expand_mediabox`` (and to
            ``_merge_page_writer``).
        over: If True the content of ``page2`` is appended after this page's
            content, else it is inserted before it.
        expand: If True, ``_expand_mediabox`` is called for ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Annotations of both pages are collected, this page's first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the selected box of page2 ("re W n").
        # The operands are materialised as a list: a lazy ``map`` iterator can
        # be traversed only once and would be empty on any later traversal.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots

1173 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources of ``page2`` are merged into this page's ``/Resources``,
    annotations of ``page2`` are cloned into this page's document with their
    ``/Rect`` and ``/QuadPoints`` transformed, and the contents of both
    pages are combined and handed to ``replace_contents``.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is renamed and merged.
        ctm: Transformation matrix applied to the annotation coordinates and
            forwarded to ``_expand_mediabox``; ``None`` uses a default
            ``Transformation()``.
        over: If True the content of ``page2`` is appended after this page's
            content, else it is inserted before it.
        expand: If True, ``_expand_mediabox`` is called for ``page2``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and re-normalise them so that the
            # rectangle stays (lower-left, upper-right).
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                # /QuadPoints holds 8 x n numbers (n quadrilaterals): transform
                # every (x, y) pair instead of only the first quadrilateral.
                quad_points = ArrayObject()
                for i in range(0, len(q) - 1, 2):
                    quad_points.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = quad_points
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the selected box of page2 ("re W n").
        # The operands are materialised as a list: a lazy ``map`` iterator can
        # be traversed only once and would be empty on any later traversal.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1309 

def _expand_mediabox(
    self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
) -> None:
    """
    Grow this page's mediabox so that it also covers the mediabox of ``page2``.

    Args:
        page2: Page whose mediabox must fit into this page's mediabox.
        ctm: Optional transformation matrix applied to the four corners of
            the mediabox of ``page2`` before the comparison.

    """
    own_left = self.mediabox.left.as_numeric()
    own_bottom = self.mediabox.bottom.as_numeric()
    own_right = self.mediabox.right.as_numeric()
    own_top = self.mediabox.top.as_numeric()

    other_left = page2.mediabox.left.as_numeric()
    other_bottom = page2.mediabox.bottom.as_numeric()
    other_right = page2.mediabox.right.as_numeric()
    other_top = page2.mediabox.top.as_numeric()
    # Corners of page2 in the order: lower-left, upper-left, upper-right, lower-right.
    other_corners = (
        (other_left, other_bottom),
        (other_left, other_top),
        (other_right, other_top),
        (other_right, other_bottom),
    )

    if ctm is None:
        xs = tuple(x for x, _ in other_corners)
        ys = tuple(y for _, y in other_corners)
    else:
        matrix = tuple(float(value) for value in ctm)
        xs = tuple(
            matrix[0] * x + matrix[2] * y + matrix[4] for x, y in other_corners
        )
        ys = tuple(
            matrix[1] * x + matrix[3] * y + matrix[5] for x, y in other_corners
        )

    new_lower_left = (min(own_left, min(xs)), min(own_bottom, min(ys)))
    new_upper_right = (max(own_right, max(xs)), max(own_top, max(ys)))

    self.mediabox.lower_left = new_lower_left
    self.mediabox.upper_right = new_upper_right

1352 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page like
    :meth:`~pypdf._page.PageObject.merge_page`, applying a transformation
    matrix to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def _with_matrix(page2_content: Any) -> ContentStream:
        # Prefix the content of page2 with the requested matrix.
        return PageObject._add_transformation_matrix(page2_content, page2.pdf, matrix)

    self._merge_page(page2, _with_matrix, matrix, over, expand)

1384 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page like
    :meth:`~pypdf._page.PageObject.merge_page`, scaling the merged stream
    through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1402 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page like
    :meth:`~pypdf._page.PageObject.merge_page`, rotating the merged stream
    through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1424 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page like
    :meth:`~pypdf._page.PageObject.merge_page`, translating the merged
    stream through a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1448 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four corners after they have been transformed.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    content = self.get_contents()
    if content is not None:
        content = PageObject._add_transformation_matrix(content, self.pdf, matrix)
        content.isolate_graphics_state()
        self.replace_contents(content)

    if not expand:
        return

    # Expanding: the new media box is the bounding box of the transformed corners.
    left = self.mediabox.left.as_numeric()
    bottom = self.mediabox.bottom.as_numeric()
    right = self.mediabox.right.as_numeric()
    top = self.mediabox.top.as_numeric()
    # Corner order: lower-left, upper-left, upper-right, lower-right.
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    coefficients = tuple(float(value) for value in matrix)
    new_x = [
        coefficients[0] * x + coefficients[2] * y + coefficients[4]
        for x, y in corner_points
    ]
    new_y = [
        coefficients[1] * x + coefficients[3] * y + coefficients[5]
        for x, y in corner_points
    ]

    self.mediabox.lower_left = (min(new_x), min(new_y))
    self.mediabox.upper_right = (max(new_x), max(new_y))

1498 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (mediabox, cropbox, etc.),
    the contents of the page, the ``/Rect`` of its annotations and the
    ``/BBox`` of every viewport listed under ``/VP``.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    self.mediabox = self.mediabox.scale(sx, sy)
    self.cropbox = self.cropbox.scale(sx, sy)
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        # x coordinates scale with sx, y coordinates with sy.
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # /VP is normally an array of viewport dictionaries; a bare dictionary
        # is accepted as well. Every viewport is scaled, not only the first.
        viewport_entries = viewport if isinstance(viewport, ArrayObject) else [viewport]
        for entry in viewport_entries:
            viewport_dict = entry.get_object()
            if (
                not isinstance(viewport_dict, DictionaryObject)
                or "/BBox" not in viewport_dict
            ):
                # Nothing to scale for this entry.
                continue
            bbox = viewport_dict["/BBox"]
            viewport_dict[NameObject("/BBox")] = RectangleObject(
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )

1552 

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by applying a transformation matrix to its
    content and updating the page size.

    Args:
        factor: The scaling factor, applied to both the X and the Y axis.

    """
    self.scale(sx=factor, sy=factor)

1563 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the requested dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested dimensions divided by the current
    mediabox width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1577 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` of the page contents.

    Raises:
        ValueError: If the compressed content can neither be stored through
            the content's own indirect reference nor through a document
            exposing ``_add_object``.

    """
    content = self.get_contents()
    if content is None:
        return
    compressed = content.flate_encode(level)
    try:
        # Overwrite the document slot the content already occupies.
        reference = content.indirect_reference
        reference.pdf._objects[reference.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        attached_to_writer = self.indirect_reference is not None and hasattr(
            self.indirect_reference.pdf, "_add_object"
        )
        if not attached_to_writer:
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1600 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page has no indirect reference or is not found in ``pages``.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        # The page is not part of the document's page list.
        return None

1617 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    Returns:
        One line per content-stream operation (operator, the ``str``
        operands of ``TJ`` operations, and the repr of the operands),
        followed by a separator and, for each font of the page resources,
        its name, its repr and - when available - its ``/Encoding`` and
        ``/ToUnicode`` data. ``"No Font\\n"`` is appended when the font
        lookup raises ``KeyError``.

    """
    pieces = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for ope, op in stream.operations:
        shown = [x for x in ope[0] if isinstance(x, str)] if op == b"TJ" else []
        pieces.append(op.decode("utf-8") + " " + "".join(shown) + repr(ope) + "\n")
    pieces.append("\n=============================\n")
    try:
        for fo in self[PG.RESOURCES]["/Font"]:  # type:ignore
            font = self[PG.RESOURCES]["/Font"][fo]  # type:ignore
            pieces.append(fo + "\n")
            pieces.append(repr(font) + "\n")
            # Encoding and ToUnicode are optional: ignore any failure.
            try:
                pieces.append(repr(font["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                pieces.append(font["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        pieces.append("No Font\n")
    return "".join(pieces)

1655 

def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of ``obj`` by walking the operations of its content stream.

    See extract_text for most arguments.

    Args:
        obj: Object holding (or inheriting through ``/Parent``) the
            ``/Resources`` and, depending on ``content_key``, the content.
        pdf: Document passed to the ``ContentStream`` built from the content.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``

    Returns:
        The extracted text; an empty string when no resources can be found
        or when the content cannot be accessed.

    """
    extractor = TextExtraction()
    # Character maps, keyed by font name, as returned by build_char_map.
    cmaps: Dict[
        str,
        Tuple[
            str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
        ],
    ] = {}

    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited so we look to parents
            objr = objr["/Parent"].get_object()
            # If no parents then no /Resources will be available,
            # so an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
        for f in cast(DictionaryObject, font):
            try:
                cmaps[f] = build_char_map(f, space_width, obj)
            except TypeError:
                # Fonts for which no character map can be built are skipped.
                pass

    try:
        # With a string key the content is read from obj; otherwise obj itself is the content.
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, cmaps)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # ' is processed as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # " is processed as Tw, Tc, T* and Tj, using its operands in that order.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry at least as large as the threshold is emitted
                    # as a single space, unless the text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # TD is processed as TL (with the negated second operand) followed by Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the referenced XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.cmap[3],
                    extractor.font_size,
                )
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except IndexError:
                # The output is still empty: there is no last character to test.
                pass
            try:
                xobj = resources_dict["/XObject"]
                # XObjects whose /Subtype is not /Image have their text extracted too.
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.cmap[3],
                            extractor.font_size,
                        )
            except Exception as exception:
                logger_warning(
                    f"Impossible to decode XFormObject {operands[0]}: {exception}",
                    __name__,
                )
            finally:
                # Whatever happened, restart with an empty text buffer and
                # fresh copies of the current matrices.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.cmap[3],
            extractor.font_size,
        )
    return extractor.output

1813 

def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
    """
    Collect the fonts needed for "layout" mode text extraction.

    The ``/Resources`` of this page and of each of its ``/Parent``
    ancestors are inspected in turn.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    fonts: Dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for font_name in resources["/Font"]:
                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
                # Resolve indirect objects (one level, also inside arrays).
                font_dict = {}
                for key, value in font_dict_obj.items():
                    if isinstance(value, IndirectObject):
                        font_dict[key] = value.get_object()
                    elif isinstance(value, ArrayObject):
                        font_dict[key] = [item.get_object() for item in value]
                    else:
                        font_dict[key] = value
                # mypy really sucks at unpacking
                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
        # Continue with the parent node, if there is one.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return fonts

1849 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Get text preserving fidelity to source PDF text layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:
              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        def _serialize(value: Any) -> Any:
            # Fallback serializer for objects json cannot encode natively.
            return getattr(value, "to_dict", str)(value)

        fonts_dump = json.dumps(fonts, indent=2, default=_serialize)
        debug_path.joinpath("fonts.json").write_text(fonts_dump, "utf-8")

    page_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    bt_groups = _layout_mode.text_show_operations(
        iter(page_stream.operations), fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

1910 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text drawn on this page.

    The text drawing commands are located in the order in which they appear
    in the content stream. This works well for some PDF files, but poorly
    for others, depending on the generator used. This will be refined in
    the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Visitor functions can be supplied to get informed on all operations and
    all text objects. For example in some PDF files this can be useful to
    parse tables.

    Args:
        args: legacy positional arguments. Either ``(orientations,
            space_width)``, or two leading values (the first one a string)
            followed by ``orientations`` and ``space_width``. Any other
            positional value raises ``TypeError``.
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: a positional argument has an unsupported type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: warn about each one
        # that was actually supplied, then delegate.
        supplied_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in supplied_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # Legacy form: two leading values precede orientations.
            orientations_at = 2
        elif isinstance(leading, (tuple, int)):
            orientations_at = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")
        space_width_at = orientations_at + 1

        if len(args) > orientations_at:
            candidate = args[orientations_at]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > space_width_at:
            candidate = args[space_width_at]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    # A single orientation is normalised to a one-element tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2046 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The XObject itself is handed to ``self._extract_text`` as the object to
    read, with no content key.

    Args:
        xform: the XObject stream to extract text from.
        orientations: orientations to look for. Defaults to
            (0, 90, 180, 270), the same set ``extract_text`` uses; the
            previous default listed 360 instead of 180, so upside-down text
            was not requested by default.
        space_width: force default space width (if not extracted from font
            (default 200)
        visitor_operand_before: function to be called before processing an
            operation; forwarded unchanged to ``self._extract_text``.
        visitor_operand_after: function to be called after processing an
            operation; forwarded unchanged to ``self._extract_text``.
        visitor_text: function to be called when extracting some text at
            some position; forwarded unchanged to ``self._extract_text``.

    Returns:
        The extracted text

    """
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2081 

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Collect the font names used by this page, split by embedding status.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the second
        set is every font found minus the embedded ones.

    """
    page = self.get_object()
    assert isinstance(page, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2097 

# Page boundary accessors. Each one is built by _create_rectangle_accessor
# from the page key to read plus a tuple of further keys.
# NOTE(review): presumably the extra keys are fallbacks consulted in order when
# the primary key is absent -- confirm against _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2129 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under "/Annots", or None when the page has no
        such entry.

    """
    if "/Annots" not in self:
        return None
    return cast(ArrayObject, self["/Annots"])

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array, or None to remove the "/Annots"
            entry. Passing None for a page without annotations is a no-op.

    """
    if value is None:
        # Deleting unconditionally would raise KeyError on a page that has
        # no "/Annots" entry, although the requested end state already holds.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value

2149 

2150 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages backed by two callables instead of stored items.

    The length and every item are obtained on demand from the
    ``length_function`` and ``get_function`` supplied at construction.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        """
        Store the callables that provide the length and the items.

        Args:
            length_function: called without arguments to obtain the length.
            get_function: called with an index in ``range(len(self))`` to
                obtain the page at that position.

        """
        self.length_function = length_function
        self.get_function = get_function
        # Not read by any method of this class.
        self.current = -1

    def __len__(self) -> int:
        """Return the value currently reported by ``length_function``."""
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: the (normalised) index is outside ``range(len(self))``.

        """
        if isinstance(index, slice):
            # A slice yields another instance of the same class that maps
            # its own positions onto the selected indices of this list.
            indices = range(*index.indices(len(self)))
            cls = type(self)
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or every page selected by a slice, from the page tree.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: the (normalised) index is outside ``range(len(self))``.
            PdfReadError: the page is not listed in its parent's "/Kids".

        """
        if isinstance(index, slice):
            r = list(range(*index.indices(len(self))))
            # pages have to be deleted from last to first
            r.sort()
            r.reverse()
            for p in r:
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        # ``first`` stays True until a reference has been removed from a
        # "/Kids" array; a failed lookup before that means the page is not
        # in the tree at all.
        first = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                first = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    # Best effort: any failure to update the flattened page
                    # list is ignored.
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    # so the now empty node is removed from its own parent
                    # on the next iteration.
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                # The reference was already removed on an earlier iteration:
                # nothing is left to do.
                break

    def __iter__(self) -> Iterator[PageObject]:
        """Yield the pages in index order; the length is read once up front."""
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        """Return a list-like string with one ``PageObject(i)`` entry per index."""
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"

2243 

2244 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: Set[str],
    emb: Set[str],
) -> Tuple[Set[str], Set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    Fonts are looked for in the "/DR" and "/Resources" font dictionaries of
    ``obj``; the walk then recurses into the "/XObject" resources, the
    "/Annots" entries and the normal appearance ("/AP" -> "/N") of ``obj``.

    Args:
        obj: dictionary to inspect (a page, an annotation, an XObject, ...).
        fnt: set receiving the names of all fonts found; updated in place.
        emb: set receiving the names of embedded fonts; updated in place.

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font counts as embedded when it has a '/CharProcs' key, or when its font
    descriptor (or the font descriptor of its first descendant font) has a
    'FontFilex' key (where x is null, 2, or 3). An embedded font without
    'BaseFont' is recorded as its subtype in parentheses.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        """Tell whether the font's descriptor holds one of the font file keys."""
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        """Record one font dictionary in ``fnt`` and, if embedded, in ``emb``."""
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = (
            "/CharProcs" in f
            or has_font_file(f)
            or (
                "/DescendantFonts" in f
                and has_font_file(
                    cast(
                        DictionaryObject,
                        cast(ArrayObject, f["/DescendantFonts"])[0].get_object(),
                    )
                )
            )
        )
        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        # Iterate the font dictionaries (values); iterating the dictionary
        # itself yields the resource names, which carry no font information.
        for f in cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        ).values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
        # "/Type" is optional on a form XObject, so "/Subtype" is checked too.
        if normal.get("/Type") == "/XObject" or normal.get("/Subtype") == "/Form":
            _get_fonts_walk(normal, fnt, emb)
        else:
            # Otherwise "/N" maps appearance state names to appearance
            # streams: walk the streams (values), not the state names.
            for state in normal.values():
                state = state.get_object()
                if isinstance(state, DictionaryObject):
                    _get_fonts_walk(state, fnt, emb)
    return fnt, emb  # return the sets for each page