Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1044 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from dataclasses import dataclass 

32from decimal import Decimal 

33from io import BytesIO 

34from pathlib import Path 

35from typing import ( 

36 Any, 

37 Callable, 

38 Dict, 

39 Iterable, 

40 Iterator, 

41 List, 

42 Literal, 

43 Optional, 

44 Sequence, 

45 Set, 

46 Tuple, 

47 Union, 

48 cast, 

49 overload, 

50) 

51 

52from ._cmap import ( 

53 build_char_map, 

54 build_font_width_map, 

55 compute_font_width, 

56 get_actual_str_key, 

57 unknown_char_map, 

58) 

59from ._protocols import PdfCommonDocProtocol 

60from ._text_extraction import ( 

61 OrientationNotFoundError, 

62 _layout_mode, 

63 crlf_space_check, 

64 get_display_str, 

65 get_text_operands, 

66 mult, 

67) 

68from ._utils import ( 

69 CompressedTransformationMatrix, 

70 TransformationMatrixType, 

71 _human_readable_bytes, 

72 logger_warning, 

73 matrix_multiply, 

74) 

75from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING 

76from .constants import AnnotationDictionaryAttributes as ADA 

77from .constants import ImageAttributes as IA 

78from .constants import PageAttributes as PG 

79from .constants import Resources as RES 

80from .errors import PageSizeNotDefinedError, PdfReadError 

81from .filters import _xobj_to_image 

82from .generic import ( 

83 ArrayObject, 

84 ContentStream, 

85 DictionaryObject, 

86 EncodedStreamObject, 

87 FloatObject, 

88 IndirectObject, 

89 NameObject, 

90 NullObject, 

91 NumberObject, 

92 PdfObject, 

93 RectangleObject, 

94 StreamObject, 

95 TextStringObject, 

96 is_null_or_none, 

97) 

98 

99try: 

100 from PIL.Image import Image 

101 

102 pil_not_imported = False 

103except ImportError: 

104 Image = object # type: ignore 

105 pil_not_imported = True # error will be raised only when using images 

106 

107MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox' 

108 

109 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, trying ``defaults`` as fallbacks.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the requested rectangle.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The rectangle as a ``RectangleObject``. When the stored value was not
        already a ``RectangleObject``, the converted value is written back
        under ``name``.

    """
    retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if is_null_or_none(retval):
        for d in defaults:
            retval = self.get(d)
            # A fallback that is explicitly null is as unusable as a missing
            # one, so apply the same test as for the primary entry.
            if not is_null_or_none(retval):
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.get_object(retval)
    retval = RectangleObject(retval)  # type: ignore
    _set_rectangle(self, name, retval)
    return retval

124 

125 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under ``name`` wrapped in a ``NameObject``."""
    key = NameObject(name)
    self[key] = value

129 

130 

131def _delete_rectangle(self: Any, name: str) -> None: 

132 del self[name] 

133 

134 

135def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

136 return property( 

137 lambda self: _get_rectangle(self, name, fallback), 

138 lambda self, value: _set_rectangle(self, name, value), 

139 lambda self: _delete_rectangle(self, name), 

140 ) 

141 

142 

class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import Transformation
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity matrix in compressed (a, b, c, d, e, f) form.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)

        """
        first_row, second_row, third_row = matrix[0], matrix[1], matrix[2]
        return (
            first_row[0],
            first_row[1],
            second_row[0],
            second_row[1],
            third_row[0],
            third_row[1],
        )

    def _combined_with(self, op: TransformationMatrixType) -> "Transformation":
        """Return a new instance holding ``self.matrix`` multiplied by ``op``."""
        product = matrix_multiply(self.matrix, op)
        return Transformation(Transformation.compress(product))

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import Transformation
            >>> op = Transformation((1, 0, 0, -1, 0, height)) # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0))) # horizontal mirror
            >>> page.add_transformation(op)

        """
        return self._combined_with(m.matrix)

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the e and f components change; a, b, c and d are copied as-is.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis. Defaults to ``sy`` when omitted.
            sy: The scale factor along the y-axis. Defaults to ``sx`` when omitted.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        return self._combined_with(op)

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        op: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        return self._combined_with(op)

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
        ...

    @overload
    def apply_on(
        self, pt: Tuple[float, float], as_object: bool = False
    ) -> Tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[Tuple[float, float], List[float]],
        as_object: bool = False,
    ) -> Union[Tuple[float, float], List[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            A tuple or list representing the transformed point in the form (x', y').
            A list is returned when ``pt`` is a list, a tuple otherwise.

        """
        convert = FloatObject if as_object else float
        transformed = (
            convert(float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4]),
            convert(float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5]),
        )
        if isinstance(pt, list):
            return list(transformed)
        return transformed

327 

328 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        # Imported here to prevent a circular import.
        from ._reader import PdfReader  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let PIL write a one-page PDF and take the image stream out of it.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        # Resolve the image once: each access to ``images[0]`` decodes it again.
        new_reference = reader.pages[0].images[0].indirect_reference
        assert new_reference is not None

        target = self.indirect_reference
        target.pdf._objects[target.idnum - 1] = new_reference.get_object()
        cast(PdfObject, target.get_object()).indirect_reference = target

        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, target.get_object())
        )
        assert extension is not None
        # Swap the extension; a name without any "." is kept whole instead of
        # losing its last character.
        stem, dot, _ = self.name.rpartition(".")
        if not dot:
            stem = self.name
        self.name = stem + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Reuse __str__ and insert the data hash before the closing parenthesis.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

419 

420 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    Images are looked up lazily: ``ids_function`` lists the available
    identifiers and ``get_function`` builds the ``ImageFile`` for one of them.
    """

    def __init__(
        self,
        ids_function: Callable[[], List[Union[str, List[str]]]],
        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> List[Union[str, List[str]]]:
        """Return the identifiers of all images."""
        return self.ids_function()

    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers go straight to get_function; going through __getitem__
        # would list the identifiers again for every element.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, List[str], Tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        Args:
            index: An identifier (str, list or tuple), a position, or a slice.

        Returns:
            A single ``ImageFile``, or a new lazy sequence for a slice.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        if isinstance(index, (str, list, tuple)):
            # Lookup by identifier does not need the list of identifiers.
            return self.get_function(index)
        lst = self.ids_function()
        if isinstance(index, slice):
            lst = [lst[x] for x in range(*index.indices(len(lst)))]
            cls = type(self)
            return cls((lambda: lst), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        len_self = len(lst)
        if index < 0:
            # support negative indexes
            index += len_self
        if not (0 <= index < len_self):
            raise IndexError("Sequence index out of range")
        return self.get_function(lst[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # List the identifiers once instead of once per yielded image.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

482 

483 

484class PageObject(DictionaryObject): 

485 """ 

486 PageObject represents a single page within a PDF file. 

487 

488 Typically these objects will be created by accessing the 

489 :attr:`pages<pypdf.PdfReader.pages>` property of the 

490 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

491 also possible to create an empty page with the 

492 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

493 

494 Args: 

495 pdf: PDF file the page belongs to. 

496 indirect_reference: Stores the original indirect reference to 

497 this object in its source PDF 

498 

499 """ 

500 

501 original_page: "PageObject" # very local use in writer when appending 

502 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Initialise the page, copying the referenced dictionary when one is given.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to this page in its source PDF; when it
            is neither ``None`` nor null, the entries of the referenced object
            are copied into this page.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled on first use by the image accessors.
    self.inline_images: Optional[Dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if indirect_reference is not None and not is_null_or_none(indirect_reference):
        source = cast(DictionaryObject, indirect_reference.get_object())
        self.update(source)
    self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}

516 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is tagged with ``DictionaryObject`` rather than the page
    class, so it matches what a ``DictionaryObject`` with the same entries
    returns.

    Returns:
        Hash considering type and value.

    """
    entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, entries))

531 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data with ``id(self)`` appended in decimal."""
    parent_data = super().hash_value_data()
    return parent_data + b"%d" % id(self)

536 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The value is expressed in multiples of 1/72 inch: 1 means a user space
    unit is 1/72 inch, 3 means a user space unit is 3/72 inch.
    When the page has no such entry, 1 is returned.
    """
    default_unit = 1
    return self.get(PG.USER_UNIT, default_unit)

547 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both dimensions are taken from
    the media box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        page_count = 0 if pdf is None else len(pdf.pages)
        if pdf is None or page_count == 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[page_count - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore
    page[NameObject(PG.MEDIABOX)] = media_box
    return page

593 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[List[str]] = None,
    call_stack: Optional[List[Any]] = None,
) -> List[Union[str, List[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Args:
        obj: Object whose resources are scanned; the page itself when ``None``.
        ancest: Names of the enclosing form XObjects, outermost first.
        call_stack: Indirect references already visited, used to stop on
            circular references.

    Returns:
        Image identifiers: a plain name for a top-level image, a list of
        names for an image nested in form XObjects, followed by the keys of
        the inline images.

    """
    visited = [] if call_stack is None else call_stack
    # Read the reference before ``obj`` is defaulted to the page itself.
    marker = getattr(obj, "indirect_reference", None)
    if marker in visited:
        return []
    visited.append(marker)

    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    container = self if obj is None else obj
    path = [] if ancest is None else ancest

    if PG.RESOURCES not in container or RES.XOBJECT not in cast(
        DictionaryObject, container[PG.RESOURCES]
    ):
        return [] if self.inline_images is None else list(self.inline_images.keys())

    found: List[Union[str, List[str]]] = []
    x_object = container[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
    for key in x_object:
        entry = x_object[key]
        if not isinstance(entry, StreamObject):
            continue
        if entry[IA.SUBTYPE] == "/Image":
            found.append(key if len(path) == 0 else [*path, key])
        else:  # is a form with possible images inside
            found.extend(self._get_ids_image(entry, [*path, key], visited))
    assert self.inline_images is not None
    found.extend(list(self.inline_images.keys()))
    return found

629 

def _get_image(
    self,
    id: Union[str, List[str], Tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ``ImageFile`` designated by ``id``.

    Args:
        id: A name for an image of ``obj``, a ``~n~`` key for an inline
            image, or a sequence of names leading through form XObjects.
        obj: Object whose resources are searched; the page when ``None``.

    Returns:
        The matching image.

    Raises:
        KeyError: If the resources hold no XObject entry and ``id`` is not
            an inline image key, or if the image cannot be found.

    """
    container = cast(DictionaryObject, self) if obj is None else obj
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    try:
        xobjs = cast(
            DictionaryObject,
            cast(DictionaryObject, container[PG.RESOURCES])[RES.XOBJECT],
        )
    except KeyError:
        # Inline images do not need the XObject dictionary.
        looks_inline = id[0] == "~" and id[-1] == "~"
        if not looks_inline:
            raise

    if not isinstance(id, str):
        # in a subobject
        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:  # pragma: no cover
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    xobj = xobjs[id]
    extension, byte_stream, image = _xobj_to_image(cast(DictionaryObject, xobj))
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=image,
        indirect_reference=xobj.indirect_reference,
    )

667 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

      * `.name` : name of the object
      * `.data` : bytes of the object
      * `.image` : PIL Image Object
      * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage (``replace`` raises ``TypeError`` for an image that does
    not belong to a PdfWriter, so the page must come from a writer):

        writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    load_image = self._get_image
    return VirtualListImages(list_ids, load_image)

707 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate values used in inline image.

    Args:
        k: Key the value belongs to; only used in the error message.
        v: Value to translate.

    Returns:
        The mapped name when ``v`` is a known abbreviation, the matching
        ``/ColorSpace`` resource when ``v`` is another name, else ``v`` itself.

    Raises:
        PdfReadError: If ``v`` is a name missing from the page resources.

    """
    try:
        translated = NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # for the resources lookup and for v
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")
    return translated

722 

def _get_inline_images(self) -> Dict[str, ImageFile]:
    """
    Load inline images. Entries will be identified as `~1~`.

    Returns:
        The inline images of the content stream keyed ``~0~``, ``~1~``, ...
        in order of appearance; empty when the page has no content.

    Raises:
        PdfReadError: If a bare ``BI``, ``EI`` or ``ID`` operator is found.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect every inline image before decoding any of them.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )

    # Second pass: turn each image into a stream object and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for k, v in settings.items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inline_image(k, x) for x in v]
                )
            else:
                v = self._translate_value_inline_image(k, v)
            k = NameObject(_INLINE_IMAGE_KEY_MAPPING[k])
            # The first value seen for a key wins.
            init.setdefault(k, v)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

767 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.
    A page without a rotation entry reports 0.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    # Round to the nearest multiple of 90, then wrap into [0, 360).
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

782 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0 and every box present on the page is
    recomputed from its transformed corners.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    mb = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box...
    centre_x = float(mb.left + mb.width / 2)
    centre_y = float(mb.bottom + mb.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ...then move the rotated box back so its lower-left corner is the origin.
    corner_a = trsf.apply_on(mb.lower_left)
    corner_b = trsf.apply_on(mb.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        self[NameObject(box_name)] = RectangleObject(
            (
                min(corner_a[0], corner_b[0]),
                min(corner_a[1], corner_b[1]),
                max(corner_a[0], corner_b[0]),
                max(corner_a[1], corner_b[1]),
            )
        )

817 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    The angle is added to the current rotation of the page.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

833 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the merge.
        res2: Resources whose entries are added.
        resource: Key of the resource category to merge.
        new_res1: If True, merge into a copy of ``res1[resource]``;
            otherwise ``res1[resource]`` itself is modified.

    Returns:
        A tuple ``(merged, renamed)``: the merged dictionary with its keys
        sorted, and a mapping from the names of ``res2`` that had to change
        to their new names.

    """
    # Entries are cloned only when this page is referenced from an object
    # offering ``_add_object``. An explicit isinstance test is used instead
    # of an assert so that the guard also holds when running with ``-O``.
    pdf = None
    is_pdf_writer = False
    reference = getattr(self, "indirect_reference", None)
    if isinstance(reference, IndirectObject):
        pdf = getattr(reference, "pdf", None)
        is_pdf_writer = hasattr(pdf, "_add_object")  # expect isinstance(pdf, PdfWriter)

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    def compute_unique_key(base_key: str) -> Tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        value = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        computed_key = base_key
        idx = 0
        while computed_key in new_res:
            if new_res.raw_get(computed_key) == value:
                # there's already a resource of this name, with the exact
                # same value
                return computed_key, True
            computed_key = f"{base_key}-{idx}"
            idx += 1
        return computed_key, False

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            continue
        if is_pdf_writer:
            new_res[newname] = page2res.raw_get(key).clone(pdf)
            try:
                # Store the reference of the clone when it has one.
                new_res[newname] = new_res[newname].indirect_reference
            except AttributeError:
                pass
        else:
            new_res[newname] = page2res.raw_get(key)

    # Re-insert the entries in sorted key order, keeping the same object.
    ordered = sorted(new_res.items())
    new_res.clear()
    for entry_key, entry_value in ordered:
        new_res[entry_key] = entry_value
    return new_res, rename_res

914 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: Dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Replace names used as operands in ``stream`` according to ``rename``.

    Args:
        stream: Content stream to process.
        rename: Mapping from old names to new names.
        pdf: Passed to the ``ContentStream`` built from ``stream``.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If an operation has operands that are neither a list nor
            a dict.

    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, _operator in stream.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        # Only existing positions/keys are reassigned, never added or removed.
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return stream

936 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Add transformation matrix at the beginning of the given contents stream.

    Args:
        contents: Content to wrap in a new ``ContentStream``.
        pdf: Passed to the ``ContentStream`` constructor.
        ctm: The matrix values, emitted as the operands of a ``cm`` operator.

    Returns:
        The new ``ContentStream`` starting with the ``cm`` operation.

    """
    contents = ContentStream(contents, pdf)
    operations = contents.operations
    cm_operation = [[FloatObject(x) for x in ctm], b"cm"]
    operations.insert(0, cm_operation)
    return contents

953 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it doesn't exist
        or is null. For an array of streams, their data is concatenated.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    # A null /Contents carries no data; treat it like a missing entry, as
    # get_contents() does, instead of failing on the missing get_data().
    if is_null_or_none(obj):
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

968 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the resolved ``/Contents`` entry, or
        ``None`` when the entry is absent or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    try:
        owner = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        # No usable indirect reference: build the stream without a document.
        owner = None
    raw = self[PG.CONTENTS]
    if is_null_or_none(raw):
        return None
    return ContentStream(raw.get_object(), owner)

989 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Existing /Contents is an array: try to null out each referenced object.
    # NOTE(review): this indexes ``self._objects`` (the page), whereas the
    # branches below index ``self.indirect_reference.pdf._objects``; if the
    # page has no ``_objects`` attribute the AttributeError handler makes this
    # loop a no-op. Confirm which object table is intended.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
            except AttributeError:
                pass

    # New content given as an array: register every element with the owning
    # document and keep what ``_add_object`` returns for each.
    if isinstance(content, ArrayObject):
        content = ArrayObject(self.indirect_reference.pdf._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Deletion requested: nothing to do when there is no /Contents entry.
        if PG.CONTENTS not in self:
            return
        assert self.indirect_reference is not None
        assert self[PG.CONTENTS].indirect_reference is not None
        # Null the old object's slot (index idnum - 1), then drop the key.
        self.indirect_reference.pdf._objects[
            self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
        ] = NullObject()
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing entry exposing an indirect reference: add the content
        # as a new object of the owning document.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the existing entry's reference: the new content takes over the
        # slot of the object it replaces.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1048 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) are maintained from both pages.
    The mediabox, cropbox, etc of this page are not altered.
    By default the content stream of ``page2`` is added to the end of this
    page's content stream, so it is drawn after, or "on top" of, this page.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    merge_options = {"over": over, "expand": expand}
    self._merge_page(page2, **merge_options)

1070 

def _merge_page(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page.

    When the owning document exposes ``_add_object`` (used to detect a
    PdfWriter) the work is delegated to ``_merge_page_writer``. Otherwise new
    resource, annotation and content objects are built and stored on this
    page.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before the resource renaming.
        ctm: Optional matrix forwarded to ``_expand_mediabox`` (and to the
            writer variant).
        over: Put the ``page2`` content after this page's content if True,
            otherwise before it.
        expand: If True, grow this page's mediabox via ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(
            self.indirect_reference.pdf, "_add_object"
        ):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        pass

    new_resources = DictionaryObject()
    rename = {}
    try:
        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    except KeyError:
        original_resources = DictionaryObject()
    try:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
    except KeyError:
        page2resources = DictionaryObject()
    new_annots = ArrayObject()

    # Annotation arrays of both pages are concatenated, this page first.
    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        new, newrename = self._merge_resources(
            original_resources, page2resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets, making sure there's a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Start the merged content with a ``re`` / ``W`` / ``n`` sequence built
        # from the MERGE_CROP_BOX rectangle of page2. The operands are a real
        # list: a ``map`` iterator would be exhausted after being written
        # once, leaving ``re`` without operands on any later serialisation.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources
    self[NameObject(PG.ANNOTS)] = new_annots

1182 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources and ``/ProcSet`` entries of ``page2`` are merged into this
    page's resource dictionary, annotations of ``page2`` are cloned into the
    owning document with their ``/Rect`` (and ``/QuadPoints``) transformed,
    and the combined content is installed with ``replace_contents``.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content stream
            of ``page2`` before the resource renaming.
        ctm: Optional matrix used to transform the cloned annotations and
            forwarded to ``_expand_mediabox``.
        over: Put the ``page2`` content after this page's content if True,
            otherwise before it.
        expand: If True, grow this page's mediabox via ``_expand_mediabox``.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            # Clone into the owning document without the fields listed in
            # ``ignore_fields``; /P is set again further down.
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            # Re-normalise so the rectangle is (min x, min y, max x, max y).
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Start the merged content with a ``re`` / ``W`` / ``n`` sequence built
        # from the MERGE_CROP_BOX rectangle of page2. The operands are a real
        # list: a ``map`` iterator would be exhausted after being written
        # once, leaving ``re`` without operands on any later serialisation.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (
                        rect.left,
                        rect.bottom,
                        rect.width,
                        rect.height,
                    )
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1318 

1319 def _expand_mediabox( 

1320 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] 

1321 ) -> None: 

1322 corners1 = ( 

1323 self.mediabox.left.as_numeric(), 

1324 self.mediabox.bottom.as_numeric(), 

1325 self.mediabox.right.as_numeric(), 

1326 self.mediabox.top.as_numeric(), 

1327 ) 

1328 corners2 = ( 

1329 page2.mediabox.left.as_numeric(), 

1330 page2.mediabox.bottom.as_numeric(), 

1331 page2.mediabox.left.as_numeric(), 

1332 page2.mediabox.top.as_numeric(), 

1333 page2.mediabox.right.as_numeric(), 

1334 page2.mediabox.top.as_numeric(), 

1335 page2.mediabox.right.as_numeric(), 

1336 page2.mediabox.bottom.as_numeric(), 

1337 ) 

1338 if ctm is not None: 

1339 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] 

1340 new_x = tuple( 

1341 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] 

1342 for i in range(0, 8, 2) 

1343 ) 

1344 new_y = tuple( 

1345 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] 

1346 for i in range(0, 8, 2) 

1347 ) 

1348 else: 

1349 new_x = corners2[0:8:2] 

1350 new_y = corners2[1:8:2] 

1351 lowerleft = (min(new_x), min(new_y)) 

1352 upperright = (max(new_x), max(new_y)) 

1353 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) 

1354 upperright = ( 

1355 max(corners1[2], upperright[0]), 

1356 max(corners1[3], upperright[1]), 

1357 ) 

1358 

1359 self.mediabox.lower_left = lowerleft 

1360 self.mediabox.upper_right = upperright 

1361 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but a transformation
    matrix is applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used instead.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def apply_matrix(page2Content: Any) -> ContentStream:
        # The stream copy is created against the document of page2.
        return PageObject._add_transformation_matrix(page2Content, page2.pdf, matrix)

    self._merge_page(page2, apply_matrix, matrix, over, expand)

1393 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is scaled by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor, used for both axes.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1411 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be merged
    is rotated by applying a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1433 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Similar to :meth:`~pypdf._page.PageObject.merge_page`, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1457 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of its
            four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # if expanding the page to fit a new page, calculate the new media box size
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    top = box.top.as_numeric()
    right = box.right.as_numeric()
    # Corners in the order: lower-left, upper-left, upper-right, lower-right.
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    m = tuple(float(value) for value in matrix)
    xs = [m[0] * x + m[2] * y + m[4] for x, y in corner_points]
    ys = [m[1] * x + m[3] * y + m[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1507 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the various page boundaries (mediabox, cropbox, etc.),
    the ``/Rect`` of array-valued annotations rectangles, the viewport
    ``/BBox`` (only the first viewport when the entry is an array) and the
    contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    for box_name in ("mediabox", "cropbox", "bleedbox", "trimbox", "artbox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    # Rectangles are (x, y, x, y): alternate the horizontal and vertical factor.
    axis_factors = (sx, sy, sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for entry in annotations:
                annotation_dict = entry.get_object()
                if ADA.Rect not in annotation_dict:
                    continue
                rect = annotation_dict[ADA.Rect]
                if not isinstance(rect, ArrayObject):
                    continue
                for index, factor in enumerate(axis_factors):
                    rect[index] = FloatObject(float(rect[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        is_array = isinstance(viewport, ArrayObject)
        bbox = viewport[0]["/BBox"] if is_array else viewport["/BBox"]  # type: ignore
        scaled_bbox = RectangleObject(
            tuple(
                float(bbox[index]) * factor
                for index, factor in enumerate(axis_factors)
            )
        )
        if is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore

1561 

def scale_by(self, factor: float) -> None:
    """
    Scale a page uniformly by delegating to :meth:`scale` with the same
    factor on both axes.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1572 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale a page to the specified dimensions by applying a transformation
    matrix to its content and updating the page size.

    The factors are the requested size divided by the current mediabox
    width and height.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal = width / float(self.mediabox.width)
    vertical = height / float(self.mediabox.height)
    self.scale(horizontal, vertical)

1586 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed on to ``flate_encode``.

    Raises:
        ValueError: If the encoded stream can neither be stored through the
            content's own reference nor through ``replace_contents``.

    """
    content = self.get_contents()
    if content is None:
        return
    encoded = content.flate_encode(level)
    try:
        # Overwrite the slot of the existing content object (index idnum - 1).
        content.indirect_reference.pdf._objects[  # type: ignore
            content.indirect_reference.idnum - 1  # type: ignore
        ] = encoded
    except AttributeError:
        page_ref = self.indirect_reference
        if page_ref is None or not hasattr(page_ref.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(encoded)

1609 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page is not attached to a PDF or is not found among those pages.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        return reference.pdf.pages.index(self)
    except ValueError:
        return None

1626 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page operations and fonts for debugging.

    Returns:
        One line per content-stream operation (operator, the ``str`` items
        of the first operand for ``TJ``, and the operands' repr), a separator
        line, then for every font its name, its repr and, when available, its
        ``/Encoding`` repr and decoded ``/ToUnicode`` data. ``"No Font\\n"`` is
        appended when a ``KeyError`` occurs while listing the fonts.

    """
    parts = []
    operations = ContentStream(
        self["/Contents"].get_object(), self.pdf, "bytes"
    ).operations
    for operands, operator in operations:
        strings = (
            [item for item in operands[0] if isinstance(item, str)]
            if operator == b"TJ"
            else []
        )
        parts.append(
            operator.decode("utf-8") + " " + "".join(strings) + repr(operands) + "\n"
        )
    parts.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            parts.append(font_name + "\n")
            parts.append(repr(self[PG.RESOURCES]["/Font"][font_name]) + "\n")  # type:ignore
            # Both entries below are optional: any failure just skips them.
            try:
                parts.append(
                    repr(self[PG.RESOURCES]["/Font"][font_name]["/Encoding"])  # type:ignore
                    + "\n"
                )
            except Exception:
                pass
            try:
                parts.append(
                    self[PG.RESOURCES]["/Font"][font_name][  # type:ignore
                        "/ToUnicode"
                    ]
                    .get_data()
                    .decode()
                    + "\n"
                )
            except Exception:
                pass
    except KeyError:
        parts.append("No Font\n")
    return "".join(parts)

1664 

1665 def _get_actual_font_widths( 

1666 self, 

1667 cmap: Tuple[ 

1668 Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] 

1669 ], 

1670 text_operands: str, 

1671 font_size: float, 

1672 space_width: float 

1673 ) -> Tuple[float, float, float]: 

1674 font_widths: float = 0 

1675 font_name: str = cmap[2] 

1676 if font_name not in self._font_width_maps: 

1677 if cmap[3] is None: 

1678 font_width_map: Dict[Any, float] = {} 

1679 space_char = " " 

1680 actual_space_width: float = space_width 

1681 font_width_map["default"] = actual_space_width * 2 

1682 else: 

1683 space_char = get_actual_str_key(" ", cmap[0], cmap[1]) 

1684 font_width_map = build_font_width_map(cmap[3], space_width * 2) 

1685 actual_space_width = compute_font_width(font_width_map, space_char) 

1686 if actual_space_width == 0: 

1687 actual_space_width = space_width 

1688 self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) 

1689 font_width_map = self._font_width_maps[font_name][0] 

1690 space_char = self._font_width_maps[font_name][1] 

1691 actual_space_width = self._font_width_maps[font_name][2] 

1692 

1693 if text_operands: 

1694 for char in text_operands: 

1695 if char == space_char: 

1696 font_widths += actual_space_width 

1697 continue 

1698 font_widths += compute_font_width(font_width_map, char) 

1699 return (font_widths * font_size, space_width * font_size, font_size) 

1700 

def _handle_tj(
    self,
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    space_width: float,
    actual_str_size: Dict[str, float]
) -> Tuple[str, bool, Dict[str, float]]:
    """
    Add the text of one text-showing operation to ``text`` and record its size.

    The operands are decoded with ``get_text_operands``; depending on the
    flag it returns, the result is appended directly or merged through
    ``get_display_str``. ``actual_str_size`` is updated in place: its
    ``"space_width"`` and ``"str_height"`` entries are overwritten and the
    measured width is added to ``"str_widths"``.

    Returns:
        The updated ``(text, rtl_dir, actual_str_size)``.

    """
    decoded, is_plain_string = get_text_operands(
        operands, cm_matrix, tm_matrix, cmap, orientations
    )
    if is_plain_string:
        text += decoded
    else:
        text, rtl_dir = get_display_str(
            text,
            cm_matrix,
            tm_matrix,  # text matrix
            cmap,
            decoded,
            font_size,
            rtl_dir,
            visitor_text,
        )

    measured_width, measured_space, measured_height = self._get_actual_font_widths(
        cmap, decoded, font_size, space_width
    )
    actual_str_size["space_width"] = measured_space
    actual_str_size["str_height"] = measured_height
    actual_str_size["str_widths"] += measured_width

    return text, rtl_dir, actual_str_size

1736 

1737 def _extract_text( 

1738 self, 

1739 obj: Any, 

1740 pdf: Any, 

1741 orientations: Tuple[int, ...] = (0, 90, 180, 270), 

1742 space_width: float = 200.0, 

1743 content_key: Optional[str] = PG.CONTENTS, 

1744 visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, 

1745 visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, 

1746 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, 

1747 ) -> str: 

1748 """ 

1749 See extract_text for most arguments. 

1750 

1751 Args: 

1752 content_key: indicate the default key where to extract data 

1753 None = the object; this allows reusing the function on an XObject 

1754 default = "/Content" 

1755 

1756 """ 

1757 text: str = "" 

1758 output: str = "" 

1759 rtl_dir: bool = False # right-to-left 

1760 cmaps: Dict[ 

1761 str, 

1762 Tuple[ 

1763 str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject 

1764 ], 

1765 ] = {} 

1766 

1767 try: 

1768 objr = obj 

1769 while NameObject(PG.RESOURCES) not in objr: 

1770 # /Resources can be inherited so we look to parents 

1771 objr = objr["/Parent"].get_object() 

1772 # If no parents then no /Resources will be available, 

1773 # so an exception will be raised 

1774 resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) 

1775 except Exception: 

1776 # No resources means no text is possible (no font); we consider the 

1777 # file as not damaged, no need to check for TJ or Tj 

1778 return "" 

1779 

1780 if "/Font" in resources_dict and (font := resources_dict["/Font"]): 

1781 for f in cast(DictionaryObject, font): 

1782 cmaps[f] = build_char_map(f, space_width, obj) 

1783 cmap: Tuple[ 

1784 Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] 

1785 ] = ( 

1786 "charmap", 

1787 {}, 

1788 "NotInitialized", 

1789 None, 

1790 ) # (encoding, CMAP, font resource name, font) 

1791 

1792 try: 

1793 content = ( 

1794 obj[content_key].get_object() if isinstance(content_key, str) else obj 

1795 ) 

1796 if not isinstance(content, ContentStream): 

1797 content = ContentStream(content, pdf, "bytes") 

1798 except (AttributeError, KeyError): # no content can be extracted (certainly empty page) 

1799 return "" 

1800 # We check all strings are TextStringObjects. ByteStringObjects 

1801 # are strings where the byte->string encoding was unknown, so adding 

1802 # them to the text here would be gibberish. 

1803 

1804 cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1805 tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1806 cm_stack = [] 

1807 

1808 # Store the last modified matrices; can be an intermediate position 

1809 cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1810 tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1811 

1812 # Store the position at the beginning of building the text 

1813 memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1814 memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1815 

1816 char_scale = 1.0 

1817 space_scale = 1.0 

1818 _space_width: float = 500.0 # will be set correctly at first Tf 

1819 _actual_str_size: Dict[str, float] = { 

1820 "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0 

1821 } # will be set to string length calculation result 

1822 TL = 0.0 

1823 font_size = 12.0 # init just in case of 

1824 

1825 def compute_str_widths(str_widths: float) -> float: 

1826 return str_widths / 1000 

1827 

1828 def process_operation(operator: bytes, operands: List[Any]) -> None: 

1829 nonlocal cm_matrix, tm_matrix, cm_stack, cm_prev, tm_prev, memo_cm, memo_tm 

1830 nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap 

1831 nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size 

1832 

1833 str_widths: float = 0.0 

1834 

1835 # Table 5.4 page 405 

1836 if operator == b"BT": # Begin Text 

1837 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1838 # Flush text: 

1839 output += text 

1840 if visitor_text is not None: 

1841 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

1842 text = "" 

1843 memo_cm = cm_matrix.copy() 

1844 memo_tm = tm_matrix.copy() 

1845 return 

1846 if operator == b"ET": # End Text 

1847 # Flush text: 

1848 output += text 

1849 if visitor_text is not None: 

1850 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

1851 text = "" 

1852 memo_cm = cm_matrix.copy() 

1853 memo_tm = tm_matrix.copy() 

1854 

1855 # Table 4.7 "Graphics state operators", page 219 

1856 # cm_matrix calculation is reserved for later 

1857 elif operator == b"q": # Save graphics state 

1858 cm_stack.append( 

1859 ( 

1860 cm_matrix, 

1861 cmap, 

1862 font_size, 

1863 char_scale, 

1864 space_scale, 

1865 _space_width, 

1866 TL, 

1867 ) 

1868 ) 

1869 elif operator == b"Q": # Restore graphics state 

1870 try: 

1871 ( 

1872 cm_matrix, 

1873 cmap, 

1874 font_size, 

1875 char_scale, 

1876 space_scale, 

1877 _space_width, 

1878 TL, 

1879 ) = cm_stack.pop() 

1880 except Exception: 

1881 cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1882 elif operator == b"cm": # Modify current matrix 

1883 output += text 

1884 if visitor_text is not None: 

1885 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

1886 text = "" 

1887 try: 

1888 cm_matrix = mult( 

1889 [float(operand) for operand in operands[:6]], 

1890 cm_matrix 

1891 ) 

1892 except Exception: 

1893 cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 

1894 memo_cm = cm_matrix.copy() 

1895 memo_tm = tm_matrix.copy() 

1896 

1897 # Table 5.2 page 398 

1898 elif operator == b"Tz": # Set horizontal text scaling 

1899 char_scale = float(operands[0]) / 100 if operands else 1.0 

1900 elif operator == b"Tw": # Set word spacing 

1901 space_scale = 1.0 + float(operands[0] if operands else 0.0) 

1902 elif operator == b"TL": # Set Text Leading 

1903 scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2) 

1904 TL = float(operands[0] if operands else 0.0) * font_size * scale_x 

1905 elif operator == b"Tf": # Set font size 

1906 if text != "": 

1907 output += text # .translate(cmap) 

1908 if visitor_text is not None: 

1909 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

1910 text = "" 

1911 memo_cm = cm_matrix.copy() 

1912 memo_tm = tm_matrix.copy() 

1913 try: 

1914 # char_map_tuple: font_type, 

1915 # float(sp_width / 2), 

1916 # encoding, 

1917 # map_dict, 

1918 # font_dict (describes the font) 

1919 char_map_tuple = cmaps[operands[0]] 

1920 # current cmap: encoding, 

1921 # map_dict, 

1922 # font resource name (internal name, not the real font name), 

1923 # font_dict 

1924 cmap = ( 

1925 char_map_tuple[2], 

1926 char_map_tuple[3], 

1927 operands[0], 

1928 char_map_tuple[4], 

1929 ) 

1930 _space_width = char_map_tuple[1] 

1931 except KeyError: # font not found 

1932 cmap = ( 

1933 unknown_char_map[2], 

1934 unknown_char_map[3], 

1935 f"???{operands[0]}", 

1936 None, 

1937 ) 

1938 _space_width = unknown_char_map[1] 

1939 try: 

1940 font_size = float(operands[1]) 

1941 except Exception: 

1942 pass # keep previous size 

1943 # Table 5.5 page 406 

1944 elif operator == b"Td": # Move text position 

1945 # A special case is a translating only tm: 

1946 # tm = [1, 0, 0, 1, e, f] 

1947 # i.e. tm[4] += tx, tm[5] += ty. 

1948 tx, ty = float(operands[0]), float(operands[1]) 

1949 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] 

1950 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] 

1951 str_widths = compute_str_widths(_actual_str_size["str_widths"]) 

1952 _actual_str_size["str_widths"] = 0.0 

1953 elif operator == b"Tm": # Set text matrix 

1954 tm_matrix = [float(operand) for operand in operands[:6]] 

1955 str_widths = compute_str_widths(_actual_str_size["str_widths"]) 

1956 _actual_str_size["str_widths"] = 0.0 

1957 elif operator == b"T*": # Move to next line 

1958 tm_matrix[4] -= TL * tm_matrix[2] 

1959 tm_matrix[5] -= TL * tm_matrix[3] 

1960 str_widths = compute_str_widths(_actual_str_size["str_widths"]) 

1961 _actual_str_size["str_widths"] = 0.0 

1962 elif operator == b"Tj": # Show text 

1963 text, rtl_dir, _actual_str_size = self._handle_tj( 

1964 text, 

1965 operands, 

1966 cm_matrix, 

1967 tm_matrix, 

1968 cmap, 

1969 orientations, 

1970 font_size, 

1971 rtl_dir, 

1972 visitor_text, 

1973 _space_width, 

1974 _actual_str_size, 

1975 ) 

1976 else: 

1977 return 

1978 

1979 if operator in {b"Td", b"Tm", b"T*", b"Tj"}: 

1980 try: 

1981 text, output, cm_prev, tm_prev = crlf_space_check( 

1982 text, 

1983 (cm_prev, tm_prev), 

1984 (cm_matrix, tm_matrix), 

1985 (memo_cm, memo_tm), 

1986 cmap, 

1987 orientations, 

1988 output, 

1989 font_size, 

1990 visitor_text, 

1991 str_widths, 

1992 compute_str_widths(_actual_str_size["space_width"]), 

1993 _actual_str_size["str_height"] 

1994 ) 

1995 if text == "": 

1996 memo_cm = cm_matrix.copy() 

1997 memo_tm = tm_matrix.copy() 

1998 except OrientationNotFoundError: 

1999 return 

2000 

2001 for operands, operator in content.operations: 

2002 if visitor_operand_before is not None: 

2003 visitor_operand_before(operator, operands, cm_matrix, tm_matrix) 

2004 # Multiple operators are handled here 

2005 if operator == b"'": 

2006 process_operation(b"T*", []) 

2007 process_operation(b"Tj", operands) 

2008 elif operator == b'"': 

2009 process_operation(b"Tw", [operands[0]]) 

2010 process_operation(b"Tc", [operands[1]]) 

2011 process_operation(b"T*", []) 

2012 process_operation(b"Tj", operands[2:]) 

2013 elif operator == b"TJ": 

2014 # The space width may be smaller than the font width, so the width should be 95%. 

2015 _confirm_space_width = _space_width * 0.95 

2016 if operands: 

2017 for op in operands[0]: 

2018 if isinstance(op, (str, bytes)): 

2019 process_operation(b"Tj", [op]) 

2020 if isinstance(op, (int, float, NumberObject, FloatObject)) and ( 

2021 abs(float(op)) >= _confirm_space_width 

2022 and text 

2023 and text[-1] != " " 

2024 ): 

2025 process_operation(b"Tj", [" "]) 

2026 elif operator == b"TD": 

2027 process_operation(b"TL", [-operands[1]]) 

2028 process_operation(b"Td", operands) 

2029 elif operator == b"Do": 

2030 output += text 

2031 if visitor_text is not None: 

2032 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

2033 try: 

2034 if output[-1] != "\n": 

2035 output += "\n" 

2036 if visitor_text is not None: 

2037 visitor_text( 

2038 "\n", 

2039 memo_cm, 

2040 memo_tm, 

2041 cmap[3], 

2042 font_size, 

2043 ) 

2044 except IndexError: 

2045 pass 

2046 try: 

2047 xobj = resources_dict["/XObject"] 

2048 if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore 

2049 text = self.extract_xform_text( 

2050 xobj[operands[0]], # type: ignore 

2051 orientations, 

2052 space_width, 

2053 visitor_operand_before, 

2054 visitor_operand_after, 

2055 visitor_text, 

2056 ) 

2057 output += text 

2058 if visitor_text is not None: 

2059 visitor_text( 

2060 text, 

2061 memo_cm, 

2062 memo_tm, 

2063 cmap[3], 

2064 font_size, 

2065 ) 

2066 except Exception as exception: 

2067 logger_warning( 

2068 f"Impossible to decode XFormObject {operands[0]}: {exception}", 

2069 __name__, 

2070 ) 

2071 finally: 

2072 text = "" 

2073 memo_cm = cm_matrix.copy() 

2074 memo_tm = tm_matrix.copy() 

2075 else: 

2076 process_operation(operator, operands) 

2077 if visitor_operand_after is not None: 

2078 visitor_operand_after(operator, operands, cm_matrix, tm_matrix) 

2079 output += text # just in case 

2080 if text != "" and visitor_text is not None: 

2081 visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) 

2082 return output 

2083 

def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
    """
    Build the font table used by "layout" mode text extraction.

    Starting at this page, every node on the ``/Parent`` chain is inspected;
    for each name found in a node's ``/Font`` resources a
    ``_layout_mode.Font`` is created from the result of ``build_char_map``.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name

    """

    def _resolved(value: Any) -> Any:
        # Dereference an indirect object, or every member of an array;
        # any other value is passed through untouched.
        if isinstance(value, IndirectObject):
            return value.get_object()
        if isinstance(value, ArrayObject):
            return [member.get_object() for member in value]
        return value

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    layout_fonts: Dict[str, _layout_mode.Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            # This node carries no resources; keep climbing.
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            for name in resources["/Font"]:
                # The last element returned by build_char_map is the font
                # dictionary; everything before it is forwarded as is.
                *char_map, raw_font_dict = build_char_map(name, 200.0, self)
                resolved_font_dict = {
                    key: _resolved(value) for key, value in raw_font_dict.items()
                }
                # mypy really sucks at unpacking
                layout_fonts[name] = _layout_mode.Font(*char_map, resolved_font_dict)  # type: ignore[call-arg,arg-type]
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            # No parent left: the top of the chain has been reached.
            node = None

    return layout_fonts

2119 

def _layout_mode_text(
    self,
    space_vertically: bool = True,
    scale_weight: float = 1.25,
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
    font_height_weight: float = 1,
) -> str:
    """
    Extract the page text while keeping it close to the PDF's visual layout.

    Args:
        space_vertically: include blank lines inferred from y distance + font
            height. Defaults to True.
        scale_weight: multiplier for string length when calculating weighted
            average character width. Defaults to 1.25.
        strip_rotated: Removes text that is rotated w.r.t. to the page from
            layout mode output. Defaults to True.
        debug_path (Path | None): if supplied, must target a directory.
            Debug information for the layout mode functions is written there:
            - fonts.json: output of self._layout_mode_fonts
            - tjs.json: individual text render ops with corresponding transform matrices
            - bts.json: text render ops left justified and grouped by BT/ET operators
            - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
            Defaults to None.
        font_height_weight: multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        str: multiline string containing page text in a fixed width format that
        closely adheres to the rendered layout in the source pdf. An empty
        string is returned when no text show operations are found.

    """
    fonts = self._layout_mode_fonts()
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        # Objects json cannot encode are serialized through their "to_dict"
        # attribute when they have one, otherwise through str().
        fonts_json = json.dumps(
            fonts,
            indent=2,
            default=lambda item: getattr(item, "to_dict", str)(item),
        )
        debug_path.joinpath("fonts.json").write_text(fonts_json, "utf-8")

    content_stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    operations = iter(content_stream.operations)
    bt_groups = _layout_mode.text_show_operations(
        operations, fonts, strip_rotated, debug_path
    )
    if not bt_groups:
        return ""

    ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
    char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
    return _layout_mode.fixed_width_page(
        ty_groups, char_width, space_vertically, font_height_weight
    )

2180 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. This will be refined in the future.

    Do not rely on the order of text coming out of this function, as it
    will change if this function is made more sophisticated.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Additionally you can provide visitor methods to get informed on all
    operations and all text objects.
    For example in some PDF files this can be useful to parse tables.

    Args:
        *args: positional form of ``orientations`` and ``space_width``
            (plain mode only). Either ``(orientations[, space_width])``, or,
            when the first positional argument is a string, the third and
            fourth positional arguments are used for ``orientations`` and
            ``space_width`` and the first two are not inspected.
        orientations: list of orientations extract_text will look for
            default = (0, 90, 180, 270)
            note: currently only 0 (up),90 (turned left), 180 (upside down),
            270 (turned right)
            A single int is accepted and wrapped into a 1-tuple.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200)
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a positional argument has an unexpected type
            (plain mode only).

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported by layout mode: warn for each one given.
        given_visitors = {
            "visitor_operand_before": visitor_operand_before,
            "visitor_operand_after": visitor_operand_after,
            "visitor_text": visitor_text,
        }
        for visitor, callback in given_visitors.items():
            if callback:
                logger_warning(
                    f"Argument {visitor} is ignored in layout mode",
                    __name__,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        # A leading string shifts the positions at which orientations and
        # space_width are looked up by two.
        if isinstance(leading, str):
            orientations_at = 2
        elif isinstance(leading, (tuple, int)):
            orientations_at = 0
        else:
            raise TypeError(f"Invalid positional parameter {leading}")
        space_width_at = orientations_at + 1

        if len(args) > orientations_at:
            candidate = args[orientations_at]
            if not isinstance(candidate, (tuple, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            orientations = candidate
        if len(args) > space_width_at:
            candidate = args[space_width_at]
            if not isinstance(candidate, (float, int)):
                raise TypeError(f"Invalid positional parameter {candidate}")
            space_width = candidate

    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2316 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    All arguments are forwarded unchanged to ``self._extract_text`` together
    with ``self.pdf``; ``None`` is passed where ``extract_text`` passes
    ``PG.CONTENTS``.

    Args:
        xform: the XObject stream to extract text from
        orientations: orientations forwarded to the extraction
        space_width: force default space width (if not extracted from font; default 200)
        visitor_operand_before: forwarded as is
        visitor_operand_after: forwarded as is
        visitor_text: forwarded as is

    Returns:
        The extracted text

    """
    # NOTE(review): the default orientations (0, 90, 270, 360) differ from
    # extract_text's default (0, 90, 180, 270); this looks unintended but is
    # part of the signature, so it is kept as is - confirm before changing.
    extracted = self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
    return extracted

2351 

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The font names are gathered by ``_get_fonts_walk`` starting from this
    page's object; the unembedded fonts are all collected fonts minus the
    embedded ones.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts)

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    # Fresh, empty sets are handed to the walker, which fills and returns them.
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2367 

# Page boundary boxes. Every accessor below is produced by
# _create_rectangle_accessor from the box's own key and a tuple of other keys.
# NOTE(review): the tuple presumably lists the keys to fall back on, in order,
# when the box itself is missing (the cropbox docstring states that its default
# is the mediabox) - confirm against _create_rectangle_accessor.

# Media box: no other key is given.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

# Crop box: the media box key is the only other key given.
cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# Bleed, trim and art boxes are all given the crop box key followed by the
# media box key.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2399 

@property
def annotations(self) -> Optional["ArrayObject"]:
    """
    The annotations array of the page.

    Returns:
        The value stored under ``/Annots``, or ``None`` when the page has no
        ``/Annots`` entry.
    """
    if "/Annots" not in self:
        return None
    return cast("ArrayObject", self["/Annots"])

@annotations.setter
def annotations(self, value: Optional["ArrayObject"]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Args:
        value: the new annotations array, or ``None`` to remove the
            ``/Annots`` entry. Passing ``None`` for a page without
            annotations is a no-op.
    """
    if value is None:
        # Guard the deletion: without it, clearing the annotations of a page
        # that has none would fail on the missing key, although the requested
        # end state ("no annotations") is already reached.
        if "/Annots" in self:
            del self[NameObject("/Annots")]
    else:
        self[NameObject("/Annots")] = value

2419 

2420 

2421class _VirtualList(Sequence[PageObject]): 

2422 def __init__( 

2423 self, 

2424 length_function: Callable[[], int], 

2425 get_function: Callable[[int], PageObject], 

2426 ) -> None: 

2427 self.length_function = length_function 

2428 self.get_function = get_function 

2429 self.current = -1 

2430 

2431 def __len__(self) -> int: 

2432 return self.length_function() 

2433 

2434 @overload 

2435 def __getitem__(self, index: int) -> PageObject: 

2436 ... 

2437 

2438 @overload 

2439 def __getitem__(self, index: slice) -> Sequence[PageObject]: 

2440 ... 

2441 

2442 def __getitem__( 

2443 self, index: Union[int, slice] 

2444 ) -> Union[PageObject, Sequence[PageObject]]: 

2445 if isinstance(index, slice): 

2446 indices = range(*index.indices(len(self))) 

2447 cls = type(self) 

2448 return cls(indices.__len__, lambda idx: self[indices[idx]]) 

2449 if not isinstance(index, int): 

2450 raise TypeError("Sequence indices must be integers") 

2451 len_self = len(self) 

2452 if index < 0: 

2453 # support negative indexes 

2454 index += len_self 

2455 if not (0 <= index < len_self): 

2456 raise IndexError("Sequence index out of range") 

2457 return self.get_function(index) 

2458 

2459 def __delitem__(self, index: Union[int, slice]) -> None: 

2460 if isinstance(index, slice): 

2461 r = list(range(*index.indices(len(self)))) 

2462 # pages have to be deleted from last to first 

2463 r.sort() 

2464 r.reverse() 

2465 for p in r: 

2466 del self[p] # recursive call 

2467 return 

2468 if not isinstance(index, int): 

2469 raise TypeError("Index must be integers") 

2470 len_self = len(self) 

2471 if index < 0: 

2472 # support negative indexes 

2473 index += len_self 

2474 if not (0 <= index < len_self): 

2475 raise IndexError("Index out of range") 

2476 ind = self[index].indirect_reference 

2477 assert ind is not None 

2478 parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get( 

2479 "/Parent", None 

2480 ) 

2481 first = True 

2482 while parent is not None: 

2483 parent = cast(DictionaryObject, parent.get_object()) 

2484 try: 

2485 i = cast(ArrayObject, parent["/Kids"]).index(ind) 

2486 del cast(ArrayObject, parent["/Kids"])[i] 

2487 first = False 

2488 try: 

2489 assert ind is not None 

2490 del ind.pdf.flattened_pages[index] # case of page in a Reader 

2491 except Exception: # pragma: no cover 

2492 pass 

2493 if "/Count" in parent: 

2494 parent[NameObject("/Count")] = NumberObject( 

2495 cast(int, parent["/Count"]) - 1 

2496 ) 

2497 if len(cast(ArrayObject, parent["/Kids"])) == 0: 

2498 # No more objects in this part of this subtree 

2499 ind = parent.indirect_reference 

2500 parent = parent.get("/Parent", None) 

2501 except ValueError: # from index 

2502 if first: 

2503 raise PdfReadError(f"Page not found in page tree: {ind}") 

2504 break 

2505 

2506 def __iter__(self) -> Iterator[PageObject]: 

2507 for i in range(len(self)): 

2508 yield self[i] 

2509 

2510 def __str__(self) -> str: 

2511 p = [f"PageObject({i})" for i in range(self.length_function())] 

2512 return f"[{', '.join(p)}]" 

2513 

2514 

2515def _get_fonts_walk( 

2516 obj: DictionaryObject, 

2517 fnt: Set[str], 

2518 emb: Set[str], 

2519) -> Tuple[Set[str], Set[str]]: 

2520 """ 

2521 Get the set of all fonts and all embedded fonts. 

2522 

2523 Args: 

2524 obj: Page resources dictionary 

2525 fnt: font 

2526 emb: embedded fonts 

2527 

2528 Returns: 

2529 A tuple (fnt, emb) 

2530 

2531 If there is a key called 'BaseFont', that is a font that is used in the document. 

2532 If there is a key called 'FontName' and another key in the same dictionary object 

2533 that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is 

2534 embedded. 

2535 

2536 We create and add to two sets, fnt = fonts used and emb = fonts embedded. 

2537 

2538 """ 

2539 fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") 

2540 

2541 def process_font(f: DictionaryObject) -> None: 

2542 nonlocal fnt, emb 

2543 f = cast(DictionaryObject, f.get_object()) # to be sure 

2544 if "/BaseFont" in f: 

2545 fnt.add(cast(str, f["/BaseFont"])) 

2546 

2547 if ( 

2548 ("/CharProcs" in f) 

2549 or ( 

2550 "/FontDescriptor" in f 

2551 and any( 

2552 x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys 

2553 ) 

2554 ) 

2555 or ( 

2556 "/DescendantFonts" in f 

2557 and "/FontDescriptor" 

2558 in cast( 

2559 DictionaryObject, 

2560 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), 

2561 ) 

2562 and any( 

2563 x 

2564 in cast( 

2565 DictionaryObject, 

2566 cast( 

2567 DictionaryObject, 

2568 cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), 

2569 )["/FontDescriptor"], 

2570 ) 

2571 for x in fontkeys 

2572 ) 

2573 ) 

2574 ): 

2575 # the list comprehension ensures there is FontFile 

2576 try: 

2577 emb.add(cast(str, f["/BaseFont"])) 

2578 except KeyError: 

2579 emb.add("(" + cast(str, f["/Subtype"]) + ")") 

2580 

2581 if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]): 

2582 for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]): 

2583 process_font(f) 

2584 if "/Resources" in obj: 

2585 if "/Font" in cast(DictionaryObject, obj["/Resources"]): 

2586 for f in cast( 

2587 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"] 

2588 ).values(): 

2589 process_font(f) 

2590 if "/XObject" in cast(DictionaryObject, obj["/Resources"]): 

2591 for x in cast( 

2592 DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"] 

2593 ).values(): 

2594 _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb) 

2595 if "/Annots" in obj: 

2596 for a in cast(ArrayObject, obj["/Annots"]): 

2597 _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb) 

2598 if "/AP" in obj: 

2599 if ( 

2600 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get( 

2601 "/Type" 

2602 ) 

2603 == "/XObject" 

2604 ): 

2605 _get_fonts_walk( 

2606 cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]), 

2607 fnt, 

2608 emb, 

2609 ) 

2610 else: 

2611 for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]): 

2612 _get_fonts_walk(cast(DictionaryObject, a), fnt, emb) 

2613 return fnt, emb # return the sets for each page