Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_page.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

930 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from collections.abc import Iterable, Iterator, Sequence 

32from copy import deepcopy 

33from dataclasses import asdict, dataclass 

34from decimal import Decimal 

35from io import BytesIO 

36from pathlib import Path 

37from typing import ( 

38 Any, 

39 Callable, 

40 Literal, 

41 Optional, 

42 Union, 

43 cast, 

44 overload, 

45) 

46 

47from ._font import Font 

48from ._protocols import PdfCommonDocProtocol 

49from ._text_extraction import ( 

50 _layout_mode, 

51) 

52from ._text_extraction._text_extractor import TextExtraction 

53from ._utils import ( 

54 CompressedTransformationMatrix, 

55 TransformationMatrixType, 

56 _human_readable_bytes, 

57 deprecate, 

58 logger_warning, 

59 matrix_multiply, 

60) 

61from .constants import ( 

62 _INLINE_IMAGE_KEY_MAPPING, 

63 _INLINE_IMAGE_VALUE_MAPPING, 

64 AnnotationDictionaryAttributes, 

65 ImageAttributes, 

66) 

67from .constants import PageAttributes as PG 

68from .constants import Resources as RES 

69from .errors import PageSizeNotDefinedError, PdfReadError 

70from .generic import ( 

71 ArrayObject, 

72 ContentStream, 

73 DictionaryObject, 

74 EncodedStreamObject, 

75 FloatObject, 

76 IndirectObject, 

77 NameObject, 

78 NullObject, 

79 NumberObject, 

80 PdfObject, 

81 RectangleObject, 

82 StreamObject, 

83 is_null_or_none, 

84) 

85 

# Pillow is optional: remember whether the import worked so that the failure
# can be reported later, at the point where an image feature is actually used.
try:
    from PIL.Image import Image
except ImportError:
    Image = object  # type: ignore[assignment,misc,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    pil_not_imported = True  # error will be raised only when using images
else:
    pil_not_imported = False

MERGE_CROP_BOX = "cropbox"  # pypdf <= 3.4.0 used "trimbox"

95 

96 

def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Return the rectangle stored under ``name``, falling back to ``defaults``.

    The resolved value is converted to a ``RectangleObject`` and written back
    under ``name`` so that the next access returns it directly.

    Args:
        self: The dictionary-like object holding the boxes; ``self.pdf`` is
            used to resolve an indirect reference.
        name: Key of the requested rectangle.
        defaults: Keys tried in order when ``name`` is missing or null.

    Returns:
        The rectangle as a ``RectangleObject``.

    """
    box: Union[None, RectangleObject, ArrayObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box
    if is_null_or_none(box):
        for fallback_name in defaults:
            box = self.get(fallback_name)
            # Skip fallbacks that are null as well as missing ones, matching
            # the null-or-none check applied to the primary entry above.
            if not is_null_or_none(box):
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    if isinstance(box, ArrayObject) and (length := len(box)) > 4:
        # Over-long arrays are tolerated: warn and keep the first four values.
        logger_warning(
            "Expected four values, got %(length)d: %(retval)s",
            source=__name__,
            length=length,
            retval=box,
        )
        rectangle = RectangleObject(tuple(box[:4]))
    else:
        rectangle = RectangleObject(box)  # type: ignore[arg-type]
    _set_rectangle(self, name, rectangle)
    return rectangle

120 

121 

def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under the key ``NameObject(name)``."""
    key = NameObject(name)
    self[key] = value

124 

125 

126def _delete_rectangle(self: Any, name: str) -> None: 

127 del self[name] 

128 

129 

130def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: 

131 return property( 

132 lambda self: _get_rectangle(self, name, fallback), 

133 lambda self, value: _set_rectangle(self, name, value), 

134 lambda self: _delete_rectangle(self, name), 

135 ) 

136 

137 

class Transformation:
    """
    Represent a 2D transformation.

    A transformation between two coordinate systems is described by a 3-by-3
    matrix of the form::

        a b 0
        c d 0
        e f 1

    Only six of its elements can vary, which is why PDF usually writes it as
    the six-element array [ a b c d e f ].

    A point is transformed by a matrix multiplication::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1


    Example:
        >>> from pypdf import PdfWriter, Transformation
        >>> page = PdfWriter().add_blank_page(800, 600)
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)

    """

    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)) -> None:
        # The default is the identity transformation.
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Expand the six stored values into the full 3-by-3 form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        values = self.ctm
        first_row = (values[0], values[1], 0)
        second_row = (values[2], values[3], 0)
        third_row = (values[4], values[5], 1)
        return (first_row, second_row, third_row)

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Reduce a 3-by-3 transformation matrix to its six variable elements.

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            The tuple (a, b, c, d, e, f) taken from the first two columns.

        """
        top, middle, bottom = matrix[0], matrix[1], matrix[2]
        return (top[0], top[1], middle[0], middle[1], bottom[0], bottom[1])

    def _to_cm(self) -> str:
        # Render the matrix as a content-stream "cm" operation, four decimals each.
        formatted = " ".join(f"{self.ctm[i]:.4f}" for i in range(6))
        return formatted + " cm"

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Combine this transformation with another one.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            >>> from pypdf import PdfWriter, Transformation
            >>> height, width = 40, 50
            >>> page = PdfWriter().add_blank_page(800, 600)
            >>> op = Transformation((1, 0, 0, -1, 0, height))  # vertical mirror
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, width, 0)))  # horizontal mirror
            >>> page.add_transformation(op)

        """
        product = matrix_multiply(self.matrix, m.matrix)
        return Transformation(Transformation.compress(product))

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance

        """
        current = self.ctm
        # Only the e and f elements change; a, b, c and d are carried over.
        shifted = (
            current[0],
            current[1],
            current[2],
            current[3],
            current[4] + tx,
            current[5] + ty,
        )
        return Transformation(ctm=shifted)

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis. Defaults to ``sy`` when omitted.
            sy: The scale factor along the y-axis. Defaults to ``sx`` when omitted.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: If neither ``sx`` nor ``sy`` is given.

        """
        if sx is None:
            if sy is None:
                raise ValueError("Either sx or sy must be specified")
            sx = sy
        elif sy is None:
            sy = sx
        assert sx is not None  # for type checkers
        assert sy is not None  # for type checkers
        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        product = matrix_multiply(self.matrix, scaling)
        return Transformation(Transformation.compress(product))

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.

        """
        angle = math.radians(rotation)
        cosine = math.cos(angle)
        sine = math.sin(angle)
        turning: TransformationMatrixType = (
            (cosine, sine, 0),
            (-sine, cosine, 0),
            (0, 0, 1),
        )
        product = matrix_multiply(self.matrix, turning)
        return Transformation(Transformation.compress(product))

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: list[float], as_object: bool = False) -> list[float]:
        ...

    @overload
    def apply_on(
        self, pt: tuple[float, float], as_object: bool = False
    ) -> tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[tuple[float, float], list[float]],
        as_object: bool = False,
    ) -> Union[tuple[float, float], list[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y).
            as_object: If True, return items as FloatObject, otherwise as plain floats.

        Returns:
            The transformed point (x', y'); a list when ``pt`` is a list,
            otherwise a tuple.

        """
        convert = FloatObject if as_object else float
        x = float(pt[0])
        y = float(pt[1])
        matrix = self.ctm
        moved = (
            convert(x * matrix[0] + y * matrix[2] + matrix[4]),
            convert(x * matrix[1] + y * matrix[3] + matrix[5]),
        )
        if isinstance(pt, list):
            return list(moved)
        return moved

332 

333 

@dataclass
class ImageFile:
    """
    Image within the PDF file. *This object is not designed to be built.*

    This object should not be modified except using :func:`ImageFile.replace` to replace the image with a new one.
    """

    name: str = ""
    """
    Filename as identified within the PDF file.
    """

    data: bytes = b""
    """
    Data as bytes.
    """

    image: Optional[Image] = None
    """
    Data as PIL image.
    """

    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def replace(self, new_image: Image, **kwargs: Any) -> None:
        """
        Replace the image with a new PIL image.

        Args:
            new_image (PIL.Image.Image): The new PIL image to replace the existing image.
            **kwargs: Additional keyword arguments to pass to `Image.save()`.

        Raises:
            ImportError: If pillow is not installed.
            TypeError: If the image is inline or in a PdfReader.
            TypeError: If the image does not belong to a PdfWriter.
            TypeError: If `new_image` is not a PIL Image.

        Note:
            This method replaces the existing image with a new image.
            It is not allowed for inline images or images within a PdfReader.
            The `kwargs` parameter allows passing additional parameters
            to `Image.save()`, such as quality.

        """
        if pil_not_imported:
            raise ImportError(
                "pillow is required to do image extraction. "
                "It can be installed via 'pip install pypdf[image]'"
            )

        from ._reader import PdfReader  # noqa: PLC0415
        from .generic import DictionaryObject, PdfObject  # noqa: PLC0415
        from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

        if self.indirect_reference is None:
            raise TypeError("Cannot update an inline image.")
        # The presence of ``_id_translated`` is used to recognise a PdfWriter.
        if not hasattr(self.indirect_reference.pdf, "_id_translated"):
            raise TypeError("Cannot update an image not belonging to a PdfWriter.")
        if not isinstance(new_image, Image):
            raise TypeError("new_image shall be a PIL Image")

        # Let pillow write a one-page PDF into memory, then read its image
        # object back and store it in the slot of the image being replaced.
        b = BytesIO()
        new_image.save(b, "PDF", **kwargs)
        reader = PdfReader(b)
        page_image = reader.pages[0].images[0]
        assert page_image.indirect_reference is not None
        self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
            page_image.indirect_reference.get_object()
        )
        cast(
            PdfObject, self.indirect_reference.get_object()
        ).indirect_reference = self.indirect_reference
        # change the object attributes
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, self.indirect_reference.get_object()),
            pillow_parameters=kwargs,
        )
        assert extension is not None
        # Swap the extension. ``rpartition`` reports whether a dot was found,
        # so a name without one is kept whole; slicing with ``rfind`` (which
        # returns -1 on a miss) would drop the last character of the name.
        stem, dot, _old_extension = self.name.rpartition(".")
        self.name = (stem if dot else self.name) + extension
        self.data = byte_stream
        self.image = img

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Reuse ``__str__`` without its closing parenthesis and append the hash.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"

424 

425 

class VirtualListImages(Sequence[ImageFile]):
    """
    Provides access to images referenced within a page.
    Only one copy will be returned if the usage is used on the same page multiple times.
    See :func:`PageObject.images` for more details.

    Args:
        ids_function: Callable returning the identifiers of the images.
        get_function: Callable returning the image for one identifier.

    """

    def __init__(
        self,
        ids_function: Callable[[], list[Union[str, list[str]]]],
        get_function: Callable[[Union[str, list[str], tuple[str]]], ImageFile],
    ) -> None:
        self.ids_function = ids_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return len(self.ids_function())

    def keys(self) -> list[Union[str, list[str]]]:
        """Return the identifiers of the images."""
        return self.ids_function()

    def items(self) -> list[tuple[Union[str, list[str]], ImageFile]]:
        """Return ``(identifier, image)`` pairs for all images."""
        # Identifiers are resolved directly; going through ``self[x]`` would
        # re-evaluate ``ids_function`` for every single entry.
        return [(x, self.get_function(x)) for x in self.ids_function()]

    @overload
    def __getitem__(self, index: Union[int, str, list[str]]) -> ImageFile:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
        ...

    def __getitem__(
        self, index: Union[int, slice, str, list[str], tuple[str]]
    ) -> Union[ImageFile, Sequence[ImageFile]]:
        """
        Return the image(s) selected by ``index``.

        Args:
            index: An identifier (string, list or tuple), an integer position
                (negative values count from the end) or a slice.

        Returns:
            One ``ImageFile``, or for a slice a new instance of this class
            restricted to the selected identifiers.

        Raises:
            TypeError: If ``index`` has an unsupported type.
            IndexError: If an integer ``index`` is out of range.

        """
        # Lookups by identifier do not need the list of identifiers at all.
        if isinstance(index, (str, list, tuple)):
            return self.get_function(index)
        ids = self.ids_function()
        if isinstance(index, slice):
            selected = [ids[x] for x in range(*index.indices(len(ids)))]
            cls = type(self)
            return cls((lambda: selected), self.get_function)
        if not isinstance(index, int):
            raise TypeError("Invalid sequence indices type")
        count = len(ids)
        if index < 0:
            # support negative indexes
            index += count
        if not (0 <= index < count):
            raise IndexError("Sequence index out of range")
        return self.get_function(ids[index])

    def __iter__(self) -> Iterator[ImageFile]:
        # Fetch the identifiers once instead of once per yielded image.
        for image_id in self.ids_function():
            yield self.get_function(image_id)

    def __str__(self) -> str:
        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
        return f"[{', '.join(p)}]"

487 

488 

489class PageObject(DictionaryObject): 

490 """ 

491 PageObject represents a single page within a PDF file. 

492 

493 Typically these objects will be created by accessing the 

494 :attr:`pages<pypdf.PdfReader.pages>` property of the 

495 :class:`PdfReader<pypdf.PdfReader>` class, but it is 

496 also possible to create an empty page with the 

497 :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method. 

498 

499 Args: 

500 pdf: PDF file the page belongs to. 

501 indirect_reference: Stores the original indirect reference to 

502 this object in its source PDF 

503 

504 """ 

505 

506 original_page: "PageObject" # very local use in writer when appending 

507 

def __init__(
    self,
    pdf: Optional[PdfCommonDocProtocol] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Create the page and, when a reference is given, copy its entries.

    Args:
        pdf: PDF file the page belongs to.
        indirect_reference: Reference to the page object in its source PDF.

    """
    DictionaryObject.__init__(self)
    self.pdf = pdf
    # Filled on first use by the image helpers.
    self.inline_images: Optional[dict[str, ImageFile]] = None
    self.indirect_reference = indirect_reference
    if is_null_or_none(indirect_reference):
        return
    assert indirect_reference is not None, "mypy"
    source_page = cast(DictionaryObject, indirect_reference.get_object())
    self.update(source_page)

520 

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: the hash is built with ``DictionaryObject`` as the type marker so
    that a page yields the same result as a plain DictionaryObject.

    Returns:
        Hash considering type and value.

    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))

535 

def hash_value_data(self) -> bytes:
    """Return the parent's hash data followed by the encoded ``id()`` of this page."""
    identity = f"{id(self)}".encode()
    return super().hash_value_data() + identity

540 

@property
def user_unit(self) -> float:
    """
    A read-only positive number giving the size of user space units.

    The unit is 1/72 inch: a value of 1 means one user space unit is
    1/72 inch, a value of 3 means it is 3/72 inch. Defaults to 1 when the
    page has no such entry.
    """
    unit = self.get(PG.USER_UNIT, 1)
    return cast(float, unit)

551 

@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Return a new blank page.

    If ``width`` or ``height`` is ``None``, both are taken from the media
    box of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a size is missing and ``pdf`` is
            ``None`` or contains no page

    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference §7.7.3.3)
    page[NameObject(PG.TYPE)] = NameObject("/Page")
    page[NameObject(PG.PARENT)] = NullObject()
    page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) <= 0:
            raise PageSizeNotDefinedError
        lastpage = pdf.pages[len(pdf.pages) - 1]
        width = lastpage.mediabox.width
        height = lastpage.mediabox.height

    media_box = RectangleObject((0, 0, width, height))  # type: ignore[arg-type]
    page[NameObject(PG.MEDIABOX)] = media_box

    return page

597 

def _get_ids_image(
    self,
    obj: Optional[DictionaryObject] = None,
    ancest: Optional[list[str]] = None,
    call_stack: Optional[list[Any]] = None,
) -> list[Union[str, list[str]]]:
    """
    List the identifiers of the images reachable from ``obj``.

    Args:
        obj: Object whose ``/Resources`` are inspected; the page itself
            when ``None``.
        ancest: Names of the enclosing XObjects leading to ``obj``.
        call_stack: Indirect references already visited, used to stop on
            cyclic structures.

    Returns:
        Image XObject names (a list of names for nested ones), followed by
        the keys of the page's inline images.

    """
    if call_stack is None:
        call_stack = []
    _i = getattr(obj, "indirect_reference", None)
    if _i in call_stack:
        # Already visited: do not walk into it again.
        return []
    call_stack.append(_i)
    if self.inline_images is None:
        self.inline_images = self._get_inline_images()
    if obj is None:
        obj = self
    if ancest is None:
        ancest = []

    # Inline images are read from the page's own content stream, so only the
    # outermost call reports them. Adding them again for each nested XObject
    # would duplicate their ids in the result.
    inline_ids: list[Union[str, list[str]]] = (
        list(self.inline_images.keys()) if len(ancest) == 0 else []
    )

    if (
        PG.RESOURCES not in obj or
        is_null_or_none(resources := obj[PG.RESOURCES]) or
        RES.XOBJECT not in cast(DictionaryObject, resources)
    ):
        return inline_ids

    lst: list[Union[str, list[str]]] = []
    x_object = resources[RES.XOBJECT].get_object()  # type: ignore[index]
    for o in x_object:
        if not isinstance(x_object[o], StreamObject):
            continue
        if x_object[o][ImageAttributes.SUBTYPE] == "/Image":
            lst.append(o if len(ancest) == 0 else [*ancest, o])
        else:  # is a form with possible images inside
            lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack))
    lst.extend(inline_ids)
    return lst

635 

def _get_image(
    self,
    id: Union[str, list[str], tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Return the image identified by ``id``.

    Args:
        id: An XObject name, an inline image key of the form ``~n~``, or a
            list/tuple of names leading to a nested image.
        obj: Object whose XObject resources are searched; the page itself
            when ``None``.

    Returns:
        The matching ``ImageFile``.

    Raises:
        KeyError: If XObject resources are missing and ``id`` is not an
            inline image key.

    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    if isinstance(id, list) and len(id) == 1:
        id = id[0]

    xobjs: Optional[DictionaryObject] = None
    try:
        page_resources = cast(DictionaryObject, obj[PG.RESOURCES])
        xobjs = cast(DictionaryObject, page_resources[RES.XOBJECT])
    except KeyError as exc:
        # Missing XObject resources are acceptable only for inline images.
        if id[0] != "~" or id[-1] != "~":
            raise KeyError(
                f"Cannot access image object {id} without XObject resources"
            ) from exc

    if not isinstance(id, str):
        # in a subobject
        assert xobjs is not None
        remaining = id[1:]
        return self._get_image(remaining, cast(DictionaryObject, xobjs[id[0]]))

    if id[0] == "~" and id[-1] == "~":
        if self.inline_images is None:
            self.inline_images = self._get_inline_images()
        if self.inline_images is None:
            raise KeyError("No inline image can be found")
        return self.inline_images[id]

    assert xobjs is not None
    from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415
    xobject = xobjs[id]
    converted = _xobj_to_image(cast(DictionaryObject, xobject))
    extension, byte_stream = converted[:2]
    return ImageFile(
        name=f"{id[1:]}{extension}",
        data=byte_stream,
        image=converted[2],
        indirect_reference=xobject.indirect_reference,
    )

679 

@property
def images(self) -> VirtualListImages:
    """
    Read-only property emulating a list of images on a page.

    An image can be selected with:
      - A string (for the top object)
      - A tuple (for images within XObject forms)
      - An integer

    Examples:
      * `reader.pages[0].images[0]`        # return first image
      * `reader.pages[0].images['/I0']`    # return image '/I0'
      * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form
      * `for img in reader.pages[0].images:` # loops through all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

    * `.name` : name of the object
    * `.data` : bytes of the object
    * `.image` : PIL Image Object
    * `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage (the image has to belong to a PdfWriter):

        writer.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.

    """
    list_ids = self._get_ids_image
    fetch_image = self._get_image
    return VirtualListImages(list_ids, fetch_image)

719 

def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value used in an inline image.

    Values found in ``_INLINE_IMAGE_VALUE_MAPPING`` are replaced by the
    mapped name. Other names are looked up in the page's ``/ColorSpace``
    resources, and any other value is returned unchanged.

    Raises:
        PdfReadError: If a name has no entry in the ``/ColorSpace`` resources.

    """
    try:
        return NameObject(_INLINE_IMAGE_VALUE_MAPPING[cast(str, v)])
    except (TypeError, KeyError):
        if not isinstance(v, NameObject):
            return v
        # It is a custom name, thus we have to look in resources.
        # The only applicable case is for ColorSpace.
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
            return cast(DictionaryObject, color_spaces)[v]
        except KeyError:  # raised by either lookup
            raise PdfReadError(f"Cannot find resource entry {v} for {k}")

734 

def _get_inline_images(self) -> dict[str, ImageFile]:
    """
    Load the inline images found in the page content.

    Returns:
        The images keyed ``~0~``, ``~1~``, ... in order of appearance; empty
        when the page has no content or no inline image.

    Raises:
        PdfReadError: If a raw ``BI``, ``ID`` or ``EI`` operator is met.

    """
    content = self.get_contents()
    if is_null_or_none(content):
        return {}
    assert content is not None, "mypy"

    # First pass: collect the settings and raw data of every inline image.
    collected = []
    for operands, operator in content.operations:
        if operator == b"INLINE IMAGE":
            collected.append((operands["settings"], operands["data"]))
        elif operator in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{operator!r} operator met whereas not expected, "
                "please share use case with pypdf dev team"
            )
    if not collected:
        return {}

    from .generic._image_xobject import _xobj_to_image  # noqa: PLC0415

    # Second pass: turn each one into a stream object and decode it.
    files = {}
    for num, (settings, stream_data) in enumerate(collected):
        init = {
            "__streamdata__": stream_data,
            "/Length": len(stream_data),
        }
        for raw_key, raw_value in settings.items():
            if raw_key in {"/Length", "/L"}:  # no length is expected
                continue
            if isinstance(raw_value, list):
                value = ArrayObject(
                    [self._translate_value_inline_image(raw_key, x) for x in raw_value]
                )
            else:
                value = self._translate_value_inline_image(raw_key, raw_value)
            full_key = NameObject(_INLINE_IMAGE_KEY_MAPPING[raw_key])
            # The first value seen for a key wins.
            init.setdefault(full_key, value)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

780 

@property
def rotation(self) -> int:
    """
    The visual rotation of the page.

    This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are
    valid values. This property does not affect ``/Contents``.

    Assigned values are rounded to a multiple of 90 and reduced modulo 360.
    """
    rotate_obj = self.get(PG.ROTATE, 0)
    if isinstance(rotate_obj, int):
        return rotate_obj
    # Anything else is resolved through ``get_object()`` first.
    return rotate_obj.get_object()

@rotation.setter
def rotation(self, r: float) -> None:
    quarter_turns = (int(r) + 45) // 90
    self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)

795 

def transfer_rotation_to_content(self) -> None:
    """
    Apply the rotation of the page to the content and the media/crop/...
    boxes.

    The page rotation is reset to 0 in the process.

    It is recommended to apply this function before page merging.
    """
    angle = -self.rotation  # rotation to apply is in the otherway
    self.rotation = 0
    media_box = RectangleObject(self.mediabox)

    # Rotate around the centre of the media box ...
    centre_x = float(media_box.left + media_box.width / 2)
    centre_y = float(media_box.bottom + media_box.height / 2)
    trsf = Transformation().translate(-centre_x, -centre_y).rotate(angle)

    # ... then shift so the rotated media box starts at the origin.
    corner_a = trsf.apply_on(media_box.lower_left)
    corner_b = trsf.apply_on(media_box.upper_right)
    trsf = trsf.translate(
        -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
    )
    self.add_transformation(trsf, False)

    for box_name in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]:
        if box_name not in self:
            continue
        box = RectangleObject(self[box_name])  # type: ignore[arg-type]
        corner_a = trsf.apply_on(box.lower_left)
        corner_b = trsf.apply_on(box.upper_right)
        x_values = (corner_a[0], corner_b[0])
        y_values = (corner_a[1], corner_b[1])
        self[NameObject(box_name)] = RectangleObject(
            (min(x_values), min(y_values), max(x_values), max(y_values))
        )

830 

def rotate(self, angle: int) -> "PageObject":
    """
    Rotate a page clockwise by increments of 90 degrees.

    Args:
        angle: Angle to rotate the page. Must be an increment of 90 deg.

    Returns:
        The rotated PageObject (this same instance).

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.

    """
    if angle % 90:
        raise ValueError("Rotation angle must be a multiple of 90")
    new_rotation = self.rotation + angle
    self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
    return self

846 

def _merge_resources(
    self,
    res1: DictionaryObject,
    res2: DictionaryObject,
    resource: Any,
    new_res1: bool = True,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Merge the ``resource`` entries of ``res2`` into those of ``res1``.

    Args:
        res1: Resources receiving the entries.
        res2: Resources providing the entries.
        resource: Key of the sub-dictionary to merge.
        new_res1: If True, merge into a fresh copy of ``res1[resource]``;
            otherwise update ``res1[resource]`` in place.

    Returns:
        A tuple ``(merged, renamed)``: the merged dictionary with its
        entries in sorted order, and a mapping from the names in ``res2``
        that had to change to their new names.

    """
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        pdf = self.indirect_reference.pdf
        # expect isinstance(pdf, PdfWriter)
        is_pdf_writer = hasattr(pdf, "_add_object")
    except (AssertionError, AttributeError):
        pdf = None
        is_pdf_writer = False

    def compute_unique_key(base_key: str) -> tuple[str, bool]:
        """
        Find a key that either doesn't already exist or has the same value
        (indicated by the bool)

        Args:
            base_key: An index is added to this to get the computed key

        Returns:
            A tuple (computed key, bool) where the boolean indicates
            if there is a resource of the given computed_key with the same
            value.

        """
        wanted = page2res.raw_get(base_key)
        # TODO: a possible improvement for writer, the indirect_reference
        # cannot be found because translated

        # try the current key first (e.g. "foo"), but otherwise iterate
        # through "foo-0", "foo-1", etc. new_res can contain only finitely
        # many keys, thus this'll eventually end, even if it's been crafted
        # to be maximally annoying.
        candidate = base_key
        suffix = 0
        while candidate in new_res:
            if new_res.raw_get(candidate) == wanted:
                # there's already a resource of this name, with the exact
                # same value
                return candidate, True
            candidate = f"{base_key}-{suffix}"
            suffix += 1
        return candidate, False

    if new_res1:
        new_res = DictionaryObject()
        new_res.update(res1.get(resource, DictionaryObject()).get_object())
    else:
        new_res = cast(DictionaryObject, res1[resource])
    page2res = cast(
        DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
    )

    rename_res = {}
    for key in page2res:
        unique_key, same_value = compute_unique_key(key)
        newname = NameObject(unique_key)
        if key != unique_key:
            # we have to use a different name for this
            rename_res[key] = newname

        if same_value:
            continue
        if not is_pdf_writer:
            new_res[newname] = page2res.raw_get(key)
            continue
        new_res[newname] = page2res.raw_get(key).clone(pdf)
        # Store the clone by reference when it has one.
        try:
            new_res[newname] = new_res[newname].indirect_reference
        except AttributeError:
            pass

    # Rebuild the dictionary so that its entries come out in sorted order.
    ordered = sorted(new_res.items())
    new_res.clear()
    for entry_name, entry_value in ordered:
        new_res[entry_name] = entry_value
    return new_res, rename_res

927 

@staticmethod
def _content_stream_rename(
    stream: ContentStream,
    rename: dict[Any, Any],
    pdf: Optional[PdfCommonDocProtocol],
) -> ContentStream:
    """
    Return a content stream in which names are replaced according to ``rename``.

    Args:
        stream: The content stream to process.
        rename: Mapping from old names to new names.
        pdf: Document passed to the ``ContentStream`` constructor.

    Returns:
        ``stream`` itself when ``rename`` is empty, otherwise a new
        ``ContentStream`` with the names replaced.

    Raises:
        KeyError: If the operands of an operation are neither a list nor a dict.

    """
    if not rename:
        return stream
    renamed = ContentStream(stream, pdf)
    for operands, _operator in renamed.operations:
        if isinstance(operands, list):
            slots = enumerate(operands)
        elif isinstance(operands, dict):
            slots = operands.items()
        else:
            raise KeyError(f"Type of operands is {type(operands)}")
        for slot, operand in slots:
            if isinstance(operand, NameObject):
                operands[slot] = rename.get(operand, operand)
    return renamed

949 

@staticmethod
def _add_transformation_matrix(
    contents: Any,
    pdf: Optional[PdfCommonDocProtocol],
    ctm: CompressedTransformationMatrix,
) -> ContentStream:
    """
    Prepend a ``cm`` operation built from ``ctm`` to a content stream.

    Args:
        contents: Source object handed to ``ContentStream``.
        pdf: Document handed to ``ContentStream``.
        ctm: Matrix components; each one becomes a ``FloatObject`` operand.

    Returns:
        A new ``ContentStream`` whose first operation is the ``cm`` operator.

    """
    transformed = ContentStream(contents, pdf)
    operations = transformed.operations
    matrix_operands = [FloatObject(component) for component in ctm]
    operations.insert(0, (matrix_operands, b"cm"))
    return transformed

966 

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Return the page contents as bytes.

    Returns:
        The ``/Contents`` object as bytes, or ``None`` if it does not exist
        or resolves to a null object. For an array of streams the data of
        all entries is concatenated in array order.

    """
    if PG.CONTENTS not in self:
        return None
    obj = self[PG.CONTENTS].get_object()
    if is_null_or_none(obj):
        # Same treatment as get_contents(): a null /Contents means there is
        # nothing to return, instead of failing on a missing get_data().
        return None
    if isinstance(obj, list):
        return b"".join(x.get_object().get_data() for x in obj)
    return cast(EncodedStreamObject, obj).get_data()

981 

def get_contents(self) -> Optional[ContentStream]:
    """
    Access the page contents.

    Returns:
        A ``ContentStream`` built from the resolved ``/Contents`` entry, or
        ``None`` if the entry is missing or null.
        ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.

    """
    if PG.CONTENTS not in self:
        return None
    # A page without a usable indirect reference has no owning document.
    try:
        owner = cast(IndirectObject, self.indirect_reference).pdf
    except AttributeError:
        owner = None
    raw_contents = self[PG.CONTENTS]
    if is_null_or_none(raw_contents):
        return None
    return ContentStream(raw_contents.get_object(), owner)

1002 

def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    The early returns below (unattached page, or removing contents that do
    not exist) leave ``inline_images`` untouched; every other path resets
    it to ``None`` at the end.

    Args:
        content: new content; if None delete the content field.

    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    # Function-scope import of the writer class, used only for the
    # deprecation check below.
    from pypdf._writer import PdfWriter  # noqa: PLC0415
    if not isinstance(self.indirect_reference.pdf, PdfWriter):
        deprecate(
            "Calling `PageObject.replace_contents()` for pages not assigned to a writer is deprecated "
            "and will be removed in pypdf 7.0.0. Attach the page to the writer first or use "
            "`PdfWriter(clone_from=...)` directly. The existing approach has proved being unreliable."
        )

    writer = self.indirect_reference.pdf
    # When the current /Contents is an array, replace each referenced object
    # with a null object in the owning document.
    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        content_array = cast(ArrayObject, self[PG.CONTENTS])
        for reference in content_array:
            try:
                writer._replace_object(indirect_reference=reference.indirect_reference, obj=NullObject())
            except ValueError:
                # Occurs when called on PdfReader.
                pass

    # A new array is rebuilt from whatever _add_object() returns for each
    # of its entries.
    if isinstance(content, ArrayObject):
        content = ArrayObject(writer._add_object(obj) for obj in content)

    if is_null_or_none(content):
        # Removal requested: nothing to do when there are no contents.
        if PG.CONTENTS not in self:
            return
        assert self[PG.CONTENTS].indirect_reference is not None
        writer._replace_object(indirect_reference=self[PG.CONTENTS].indirect_reference, obj=NullObject())
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        # No existing contents object exposing an indirect reference
        # (this includes a missing /Contents): add the content as a new object.
        try:
            self[NameObject(PG.CONTENTS)] = writer._add_object(content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the indirect reference of the existing contents object.
        assert content is not None, "mypy"
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in the future may require generation management
        try:
            writer._replace_object(indirect_reference=content.indirect_reference, obj=content)
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None

1065 

def merge_page(
    self, page2: "PageObject", expand: bool = False, over: bool = True
) -> None:
    """
    Merge the content streams of two pages into one.

    Resource references (e.g. fonts) from both pages are kept. The
    mediabox, cropbox, etc. of this page are not altered unless ``expand``
    is set. With the default ``over=True`` the content of ``page2`` is
    appended after this page's content, i.e. drawn "on top" of it.

    Args:
        page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        over: set the page2 content over page1 if True (default) else under
        expand: If True, the current page dimensions will be
            expanded to accommodate the dimensions of the page to be merged.

    """
    self._merge_page(page2, expand=expand, over=over)

1087 

def _merge_page(
    self,
    page2: "PageObject",
    page2_transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page: annotations, resources and content.

    Pages whose owning document exposes ``_add_object`` are handed to
    :meth:`_merge_page_writer`; all other pages are merged here.

    Args:
        page2: The page to be merged into this one.
        page2_transformation: Optional callable applied to the content
            stream of ``page2`` before resource renaming.
        ctm: Transformation matrix forwarded to ``_expand_mediabox`` and
            ``_merge_page_writer``.
        over: Append the content of ``page2`` after this page's content if
            True, otherwise insert it before.
        expand: If True, call ``_expand_mediabox`` to grow this page's
            mediabox.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    try:
        assert isinstance(self.indirect_reference, IndirectObject)
        if hasattr(self.indirect_reference.pdf, "_add_object"):  # to detect PdfWriter
            return self._merge_page_writer(
                page2, page2_transformation, ctm, over, expand
            )
    except (AssertionError, AttributeError):
        # NOTE(review): this also covers AttributeError raised inside
        # _merge_page_writer(); the generic path below then runs as fallback.
        pass

    new_resources = DictionaryObject()
    rename: dict[str, Any] = {}
    original_resources = cast(DictionaryObject, self.get(PG.RESOURCES, DictionaryObject()).get_object())
    page2_resources = cast(DictionaryObject, page2.get(PG.RESOURCES, DictionaryObject()).get_object())
    new_annots = ArrayObject()

    for page in (self, page2):
        if PG.ANNOTS in page:
            annots = page[PG.ANNOTS]
            if isinstance(annots, ArrayObject):
                new_annots.extend(annots)
                # NOTE(review): the nesting level of this assignment could not
                # be recovered from the flattened listing - confirm against
                # the upstream file.
                self[NameObject(PG.ANNOTS)] = new_annots

    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        new, new_resource_name = self._merge_resources(
            original_resources, page2_resources, res
        )
        if new:
            new_resources[NameObject(res)] = new
            rename.update(new_resource_name)

    # Combine /ProcSet sets, making sure there is a consistent order
    new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
        sorted(
            set(
                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
            ).union(
                set(page2_resources.get(RES.PROC_SET, ArrayObject()).get_object())
            )
        )
    )

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2_content = page2.get_contents()
    if page2_content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the rectangle of page2: "re W n".
        # The operands are materialised as a list (not a one-shot iterator)
        # so that the operation can be traversed more than once.
        page2_content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2_content.operations.insert(1, ([], b"W"))
        page2_content.operations.insert(2, ([], b"n"))
        if page2_transformation is not None:
            page2_content = page2_transformation(page2_content)
        page2_content = PageObject._content_stream_rename(
            page2_content, rename, self.pdf
        )
        page2_content.isolate_graphics_state()
        if over:
            new_content_array.append(page2_content)
        else:
            new_content_array.insert(0, page2_content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(ContentStream(new_content_array, self.pdf))
    self[NameObject(PG.RESOURCES)] = new_resources

    return None

1193 

def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, which must have an indirect reference.

    Resources of ``page2`` are merged into this page's resource dictionary,
    its annotations are cloned into the owning document with ``/Rect`` and
    ``/QuadPoints`` transformed by ``ctm``, and its content is combined with
    this page's content.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before resource renaming.
        ctm: Transformation matrix applied to cloned annotations and
            forwarded to ``_expand_mediabox``.
        over: Append the content of ``page2`` after this page's content if
            True, otherwise insert it before.
        expand: If True, call ``_expand_mediabox`` to grow this page's
            mediabox.

    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    rename = {}
    for res in (
        RES.EXT_G_STATE,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.XOBJECT,
        RES.FONT,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)
    # Combine /ProcSet sets
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if not is_null_or_none(page2.get(PG.ANNOTS, None)):
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        if ctm is None:
            trsf = Transformation()
        else:
            trsf = Transformation(ctm)
        # Ensure we are working on a copy of the list. Otherwise, if both pages
        # are the same object, we might run into an infinite loop.
        for a in cast(ArrayObject, deepcopy(page2[PG.ANNOTS])):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            # Transform two opposite corners and rebuild a normalised
            # rectangle from them.
            r = cast(ArrayObject, a["/Rect"])
            pt1 = trsf.apply_on((r[0], r[1]), True)
            pt2 = trsf.apply_on((r[2], r[3]), True)
            aa[NameObject("/Rect")] = ArrayObject(
                (
                    min(pt1[0], pt2[0]),
                    min(pt1[1], pt2[1]),
                    max(pt1[0], pt2[0]),
                    max(pt1[1], pt2[1]),
                )
            )
            if "/QuadPoints" in a:
                # Only the first four points (eight numbers) are transformed.
                q = cast(ArrayObject, a["/QuadPoints"])
                aa[NameObject("/QuadPoints")] = ArrayObject(
                    trsf.apply_on((q[0], q[1]), True)
                    + trsf.apply_on((q[2], q[3]), True)
                    + trsf.apply_on((q[4], q[5]), True)
                    + trsf.apply_on((q[6], q[7]), True)
                )
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        rect = getattr(page2, MERGE_CROP_BOX)
        # Clip the merged content to the rectangle of page2: "re W n".
        # The operands are materialised as a list (not a one-shot iterator)
        # so that the operation can be traversed more than once.
        page2content.operations.insert(
            0,
            (
                [
                    FloatObject(value)
                    for value in (rect.left, rect.bottom, rect.width, rect.height)
                ],
                b"re",
            ),
        )
        page2content.operations.insert(1, ([], b"W"))
        page2content.operations.insert(2, ([], b"n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)

1331 

1332 def _expand_mediabox( 

1333 self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] 

1334 ) -> None: 

1335 corners1 = ( 

1336 self.mediabox.left.as_numeric(), 

1337 self.mediabox.bottom.as_numeric(), 

1338 self.mediabox.right.as_numeric(), 

1339 self.mediabox.top.as_numeric(), 

1340 ) 

1341 corners2 = ( 

1342 page2.mediabox.left.as_numeric(), 

1343 page2.mediabox.bottom.as_numeric(), 

1344 page2.mediabox.left.as_numeric(), 

1345 page2.mediabox.top.as_numeric(), 

1346 page2.mediabox.right.as_numeric(), 

1347 page2.mediabox.top.as_numeric(), 

1348 page2.mediabox.right.as_numeric(), 

1349 page2.mediabox.bottom.as_numeric(), 

1350 ) 

1351 if ctm is not None: 

1352 ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] 

1353 new_x = tuple( 

1354 ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] 

1355 for i in range(0, 8, 2) 

1356 ) 

1357 new_y = tuple( 

1358 ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] 

1359 for i in range(0, 8, 2) 

1360 ) 

1361 else: 

1362 new_x = corners2[0:8:2] 

1363 new_y = corners2[1:8:2] 

1364 lowerleft = (min(new_x), min(new_y)) 

1365 upperright = (max(new_x), max(new_y)) 

1366 lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) 

1367 upperright = ( 

1368 max(corners1[2], upperright[0]), 

1369 max(corners1[3], upperright[1]), 

1370 ) 

1371 

1372 self.mediabox.lower_left = lowerleft 

1373 self.mediabox.upper_right = upperright 

1374 

def merge_transformed_page(
    self,
    page2: "PageObject",
    ctm: Union[CompressedTransformationMatrix, Transformation],
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after applying a transformation matrix.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, except that the
    content stream of ``page2`` is prefixed with the given matrix.

    Args:
        page2: The page to be merged into this one.
        ctm: a 6-element tuple containing the operands of the
            transformation matrix, or a ``Transformation`` whose ``ctm``
            attribute is used.
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    def transform_page2(page2_content: Any) -> ContentStream:
        # Prefix the content of page2 with the requested matrix.
        return PageObject._add_transformation_matrix(
            page2_content, page2.pdf, matrix
        )

    self._merge_page(page2, transform_page2, matrix, over, expand)

1406 

def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, with the same
    scaling factor applied on both axes of the merged stream.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().scale(scale, scale), over, expand
    )

1424 

def merge_rotated_page(
    self,
    page2: "PageObject",
    rotation: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after rotating its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, with a rotation
    matrix applied to the merged stream.

    Args:
        page2: The page to be merged into this one.
        rotation: The angle of the rotation, in degrees
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().rotate(rotation), over, expand
    )

1446 

def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after translating its content.

    Works like :meth:`~pypdf._page.PageObject.merge_page`, with a
    translation matrix applied to the merged stream.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.

    """
    self.merge_transformed_page(
        page2, Transformation().translate(tx, ty), over, expand
    )

1470 

def add_transformation(
    self,
    ctm: Union[Transformation, CompressedTransformationMatrix],
    expand: bool = False,
) -> None:
    """
    Apply a transformation matrix to the page.

    Args:
        ctm: A 6-element tuple containing the operands of the
            transformation matrix. Alternatively, a
            :py:class:`Transformation<pypdf.Transformation>`
            object can be passed.
        expand: If True, the mediabox is replaced by the bounding box of
            its four transformed corners.

    See :doc:`/user/cropping-and-transforming`.

    """
    matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm

    stream = self.get_contents()
    if stream is not None:
        stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
        stream.isolate_graphics_state()
        self.replace_contents(stream)

    if not expand:
        return

    # Transform the four corners of the mediabox and take their bounding box.
    box = self.mediabox
    left = box.left.as_numeric()
    bottom = box.bottom.as_numeric()
    right = box.right.as_numeric()
    top = box.top.as_numeric()
    corner_points = ((left, bottom), (left, top), (right, top), (right, bottom))

    factors = tuple(float(component) for component in matrix)
    xs = [factors[0] * x + factors[2] * y + factors[4] for x, y in corner_points]
    ys = [factors[1] * x + factors[3] * y + factors[5] for x, y in corner_points]

    self.mediabox.lower_left = (min(xs), min(ys))
    self.mediabox.upper_right = (max(xs), max(ys))

1520 

def scale(self, sx: float, sy: float) -> None:
    """
    Scale the page content and its boxes by the given factors.

    A scaling matrix is applied to the content, the page boundaries
    (bleedbox, trimbox, artbox, cropbox, mediabox) are scaled, the
    ``/Rect`` of directly stored annotation rectangles is scaled, and the
    ``/BBox`` of the viewport (the first one if ``/VP`` is an array) is
    scaled.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.

    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    for box_name in ("bleedbox", "trimbox", "artbox", "cropbox", "mediabox"):
        setattr(self, box_name, getattr(self, box_name).scale(sx, sy))

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if AnnotationDictionaryAttributes.Rect not in annotation_obj:
                    continue
                rectangle = annotation_obj[AnnotationDictionaryAttributes.Rect]
                if not isinstance(rectangle, ArrayObject):
                    continue
                # x coordinates are scaled by sx, y coordinates by sy.
                for index, factor in enumerate((sx, sy, sx, sy)):
                    rectangle[index] = FloatObject(float(rectangle[index]) * factor)

    if PG.VP in self:
        viewport = self[PG.VP]
        viewport_is_array = isinstance(viewport, ArrayObject)
        if viewport_is_array:
            bbox = viewport[0]["/BBox"]
        else:
            bbox = viewport["/BBox"]  # type: ignore[index]
        scaled_bbox = RectangleObject(
            (
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy,
            )
        )
        if viewport_is_array:
            self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore[index]
                NameObject("/BBox")
            ] = scaled_bbox
        else:
            self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore[index]

1574 

def scale_by(self, factor: float) -> None:
    """
    Scale the page uniformly.

    Delegates to :meth:`scale` with the same factor for both axes.

    Args:
        factor: The scaling factor (for both X and Y axis).

    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)

1585 

def scale_to(self, width: float, height: float) -> None:
    """
    Scale the page so that its mediabox gets the requested dimensions.

    The factors are derived from the current mediabox width and height
    and then passed to :meth:`scale`.

    Args:
        width: The new width.
        height: The new height.

    """
    horizontal_factor = width / float(self.mediabox.width)
    vertical_factor = height / float(self.mediabox.height)
    self.scale(horizontal_factor, vertical_factor)

1599 

def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the page contents by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Value passed to ``flate_encode`` of the content stream.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own indirect reference nor through a document
            exposing ``_add_object``.

    """
    stream = self.get_contents()
    if stream is None:
        return
    compressed = stream.flate_encode(level)
    try:
        # Overwrite the object slot the content already occupies.
        stream.indirect_reference.pdf._objects[  # type: ignore[union-attr]
            stream.indirect_reference.idnum - 1  # type: ignore[union-attr]
        ] = compressed
    except AttributeError:
        page_reference = self.indirect_reference
        if page_reference is None or not hasattr(page_reference.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)

1622 

@property
def page_number(self) -> Optional[int]:
    """
    Read-only property which returns the page number within the PDF file.

    Returns:
        Index of this page in the ``pages`` of its document; None if the
        page has no indirect reference or is not found in that list.

    """
    reference = self.indirect_reference
    if reference is None:
        return None
    try:
        position = reference.pdf.pages.index(self)
    except ValueError:
        return None
    return int(position)

1639 

def _debug_for_extract(self) -> str:  # pragma: no cover
    """
    Build a textual dump of the page for debugging text extraction.

    Returns:
        One line per content-stream operation, a separator line, and then
        for every font of the page resources its name, its representation
        and, when available, its ``/Encoding`` and decoded ``/ToUnicode``
        data; ``"No Font"`` is appended when a ``KeyError`` occurs while
        listing the fonts.

    """
    chunks: list[str] = []
    stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
    for operands, operator in stream.operations:
        if operator == b"TJ":
            texts = [item for item in operands[0] if isinstance(item, str)]
        else:
            texts = []
        chunks.append(
            operator.decode("utf-8") + " " + "".join(texts) + repr(operands) + "\n"
        )
    chunks.append("\n=============================\n")
    try:
        for font_name in self[PG.RESOURCES]["/Font"]:  # type:ignore
            chunks.append(font_name + "\n")
            font_entry = self[PG.RESOURCES]["/Font"][font_name]  # type:ignore
            chunks.append(repr(font_entry) + "\n")
            # Encoding and ToUnicode are optional; skip whatever cannot be read.
            try:
                chunks.append(repr(font_entry["/Encoding"]) + "\n")
            except Exception:
                pass
            try:
                chunks.append(font_entry["/ToUnicode"].get_data().decode() + "\n")
            except Exception:
                pass
    except KeyError:
        chunks.append("No Font\n")
    return "".join(chunks)

1677 

def _extract_text(
    self,
    obj: DictionaryObject,
    pdf: Any,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract the text of ``obj`` by walking the operations of its content.

    See extract_text for most arguments.

    Args:
        obj: Object whose inherited resources are read and whose content
            is parsed.
        pdf: Document passed to ``ContentStream`` when the content has to
            be parsed.
        content_key: indicate the default key where to extract data
            None = the object; this allows reusing the function on an XObject
            default = ``PG.CONTENTS``
        known_ids: ``id()`` values of the form XObjects currently being
            extracted; used to detect cyclic references.

    Returns:
        The extracted text; an empty string when there are no resources
        or no content.

    """
    if known_ids is None:
        known_ids = set()

    extractor = TextExtraction()
    font_resources: dict[str, DictionaryObject] = {}
    fonts: dict[str, Font] = {}

    resources_dict = cast(
        Optional[DictionaryObject],
        obj.get_inherited(key=PG.RESOURCES, default=DictionaryObject())
    )
    if is_null_or_none(resources_dict) or not resources_dict:
        # No resources means no text is possible (no font); we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""

    if (
        "/Font" in resources_dict
        and (font_resources_dict := cast(DictionaryObject, resources_dict["/Font"]))
    ):
        for font_resource in font_resources_dict:
            # Fonts raising AttributeError/TypeError are skipped.
            try:
                font_resource_object = cast(DictionaryObject, font_resources_dict[font_resource].get_object())
                font_resources[font_resource] = font_resource_object
                fonts[font_resource] = Font.from_font_resource(font_resource_object)
                # Override space width, if applicable
                if fonts[font_resource].character_widths.get(fonts[font_resource].space_char, 0) == 0:
                    fonts[font_resource].space_width = space_width
            except (AttributeError, TypeError):
                pass

    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
        return ""
    # We check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Initialize the extractor with the necessary parameters
    extractor.initialize_extraction(orientations, visitor_text, font_resources, fonts)

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
        # Multiple operators are handled here
        if operator == b"'":
            # Handled as T* followed by Tj.
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands)
        elif operator == b'"':
            # Handled as Tw, Tc, T* and Tj with the matching operands.
            extractor.process_operation(b"Tw", [operands[0]])
            extractor.process_operation(b"Tc", [operands[1]])
            extractor.process_operation(b"T*", [])
            extractor.process_operation(b"Tj", operands[2:])
        elif operator == b"TJ":
            # The space width may be smaller than the font width, so the width should be 95%.
            _confirm_space_width = extractor._space_width * 0.95
            if operands:
                for op in operands[0]:
                    if isinstance(op, (str, bytes)):
                        extractor.process_operation(b"Tj", [op])
                    # A numeric entry that is large enough yields a single
                    # space, unless the text already ends with one.
                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                        abs(float(op)) >= _confirm_space_width
                        and extractor.text
                        and extractor.text[-1] != " "
                    ):
                        extractor.process_operation(b"Tj", [" "])
        elif operator == b"TD":
            # Handled as TL with the negated second operand, then Td.
            extractor.process_operation(b"TL", [-operands[1]])
            extractor.process_operation(b"Td", operands)
        elif operator == b"Do":
            # Flush the pending text before handling the XObject.
            extractor.output += extractor.text
            if visitor_text is not None:
                visitor_text(
                    extractor.text,
                    extractor.memo_cm,
                    extractor.memo_tm,
                    extractor.font_resource,
                    extractor.font_size,
                )
            # Make sure the output ends with a newline (an empty output
            # raises IndexError and is left as is).
            try:
                if extractor.output[-1] != "\n":
                    extractor.output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except IndexError:
                pass
            try:
                xobj = cast(DictionaryObject, resources_dict["/XObject"])
                xform = cast(EncodedStreamObject, xobj[operands[0]])
                if xform["/Subtype"] != NameObject("/Image"):
                    xform_id = id(xform)
                    if xform_id in known_ids:
                        logger_warning(
                            "Detected cyclic form XObject reference, skipping %(operand)s.",
                            source=__name__,
                            operand=operands[0]
                        )
                        text = ""
                    else:
                        # Track the XObject only while it is being extracted.
                        known_ids.add(xform_id)
                        try:
                            text = self.extract_xform_text(
                                xform,
                                orientations,
                                space_width,
                                visitor_operand_before,
                                visitor_operand_after,
                                visitor_text,
                                known_ids=known_ids,
                            )
                        finally:
                            known_ids.discard(xform_id)
                    extractor.output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            extractor.memo_cm,
                            extractor.memo_tm,
                            extractor.font_resource,
                            extractor.font_size,
                        )
            except Exception as exception:
                # Any failure while handling the XObject is only logged.
                logger_warning(
                    "Impossible to decode XFormObject %(operand)s: %(exception)s",
                    source=__name__,
                    operand=operands[0],
                    exception=exception,
                )
            finally:
                # Start a fresh text chunk after the XObject.
                extractor.text = ""
                extractor.memo_cm = extractor.cm_matrix.copy()
                extractor.memo_tm = extractor.tm_matrix.copy()
        else:
            extractor.process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
    extractor.output += extractor.text  # just in case
    if extractor.text != "" and visitor_text is not None:
        visitor_text(
            extractor.text,
            extractor.memo_cm,
            extractor.memo_tm,
            extractor.font_resource,
            extractor.font_size,
        )
    return extractor.output

1857 

def _layout_mode_fonts(self) -> dict[str, Font]:
    """
    Collect the fonts used for "layout" mode text extraction.

    The page and then each ``/Parent`` in turn are visited; fonts found
    later in that walk replace earlier entries of the same name.

    Returns:
        Dict[str, Font]: dictionary of Font instances keyed by font name

    """
    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    collected: dict[str, Font] = {}
    node: Any = self
    while node is not None:
        try:
            resources: Any = node[PG.RESOURCES]
        except KeyError:
            resources = {}
        if "/Font" in resources and self.pdf is not None:
            font_table = resources["/Font"]
            for font_name in font_table:
                collected[font_name] = Font.from_font_resource(font_table[font_name])
        # Move one level up; a node without /Parent ends the walk.
        try:
            node = node["/Parent"].get_object()
        except KeyError:
            node = None

    return collected

1883 

1884 def _layout_mode_text( 

1885 self, 

1886 space_vertically: bool = True, 

1887 scale_weight: float = 1.25, 

1888 strip_rotated: bool = True, 

1889 debug_path: Optional[Path] = None, 

1890 font_height_weight: float = 1, 

1891 ) -> str: 

1892 """ 

1893 Get text preserving fidelity to source PDF text layout. 

1894 

1895 Args: 

1896 space_vertically: include blank lines inferred from y distance + font 

1897 height. Defaults to True. 

1898 scale_weight: multiplier for string length when calculating weighted 

1899 average character width. Defaults to 1.25. 

1900 strip_rotated: Removes text that is rotated w.r.t. to the page from 

1901 layout mode output. Defaults to True. 

1902 debug_path (Path | None): if supplied, must target a directory. 

1903 creates the following files with debug information for layout mode 

1904 functions if supplied: 

1905 - fonts.json: output of self._layout_mode_fonts 

1906 - tjs.json: individual text render ops with corresponding transform matrices 

1907 - bts.json: text render ops left justified and grouped by BT/ET operators 

1908 - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) 

1909 Defaults to None. 

1910 font_height_weight: multiplier for font height when calculating 

1911 blank lines. Defaults to 1. 

1912 

1913 Returns: 

1914 str: multiline string containing page text in a fixed width format that 

1915 closely adheres to the rendered layout in the source pdf. 

1916 

1917 """ 

1918 fonts = self._layout_mode_fonts() 

1919 if debug_path: # pragma: no cover 

1920 import json # noqa: PLC0415 

1921 

1922 debug_path.joinpath("fonts.json").write_text( 

1923 json.dumps(fonts, indent=2, default=asdict), 

1924 "utf-8" 

1925 ) 

1926 

1927 ops = iter( 

1928 ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations 

1929 ) 

1930 bt_groups = _layout_mode.text_show_operations( 

1931 ops, fonts, strip_rotated, debug_path 

1932 ) 

1933 

1934 if not bt_groups: 

1935 return "" 

1936 

1937 ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path) 

1938 

1939 char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight) 

1940 

1941 return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight) 

1942 

def extract_text(
    self,
    *args: Any,
    orientations: Union[int, tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    extraction_mode: Literal["plain", "layout"] = "plain",
    **kwargs: Any,
) -> str:
    """
    Extract the text of the page by locating all text drawing commands,
    in the order they appear in the content stream.

    The quality of the result depends on the generator of the PDF file:
    it works well for some files and poorly for others. The order of the
    returned text is not guaranteed and may change when the extraction is
    refined.

    Arabic and Hebrew are extracted in the correct order.
    If required a custom RTL range of characters can be defined;
    see function set_custom_rtl.

    Visitor functions can be supplied to get informed on all operations
    and all text objects, which can for example help to parse tables.

    Args:
        *args: legacy positional parameters. Either
            ``(str, any, orientations, space_width)`` where the first two
            values are not used, or ``(orientations, space_width)``.
            Trailing values may be omitted. Only evaluated in "plain" mode.
        orientations: orientations extract_text will look for,
            default = (0, 90, 180, 270).
            note: currently only 0 (up), 90 (turned left), 180 (upside down),
            270 (turned right). A single int is accepted as well.
            Silently ignored in "layout" mode.
        space_width: force default space width
            if not extracted from font (default: 200).
            Silently ignored in "layout" mode.
        visitor_operand_before: function to be called before processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_operand_after: function to be called after processing an operation.
            It has four arguments: operator, operand-arguments,
            current transformation matrix and text matrix.
            Ignored with a warning in "layout" mode.
        visitor_text: function to be called when extracting some text at some position.
            It has five arguments: text, current transformation matrix,
            text matrix, font-dictionary and font-size.
            The font-dictionary may be None in case of unknown fonts.
            If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
            Ignored with a warning in "layout" mode.
        extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
            "layout" for experimental layout mode functionality.
            NOTE: orientations, space_width, and visitor_* parameters are NOT respected
            in "layout" mode.

    kwargs:
        layout_mode_space_vertically (bool): include blank lines inferred from
            y distance + font height. Defaults to True.
        layout_mode_scale_weight (float): multiplier for string length when calculating
            weighted average character width. Defaults to 1.25.
        layout_mode_strip_rotated (bool): layout mode does not support rotated text.
            Set to False to include rotated text anyway. If rotated text is discovered,
            layout will be degraded and a warning will result. Defaults to True.
        layout_mode_debug_path (Path | None): if supplied, must target a directory.
            creates the following files with debug information for layout mode
            functions if supplied:

              - fonts.json: output of self._layout_mode_fonts
              - tjs.json: individual text render ops with corresponding transform matrices
              - bts.json: text render ops left justified and grouped by BT/ET operators
              - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
        layout_mode_font_height_weight (float): multiplier for font height when calculating
            blank lines. Defaults to 1.

    Returns:
        The extracted text

    Raises:
        ValueError: if ``extraction_mode`` is neither "plain" nor "layout".
        TypeError: if a legacy positional parameter has an unexpected type.

    """
    if extraction_mode not in ("plain", "layout"):
        raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")

    if extraction_mode == "layout":
        # Visitors are not supported in layout mode: warn about each one
        # that has been supplied, then delegate to the layout extraction.
        supplied_visitors = (
            ("visitor_operand_before", visitor_operand_before),
            ("visitor_operand_after", visitor_operand_after),
            ("visitor_text", visitor_text),
        )
        for visitor_name, visitor in supplied_visitors:
            if visitor:
                logger_warning(
                    "Argument %(visitor)s is ignored in layout mode",
                    source=__name__,
                    visitor=visitor_name,
                )
        return self._layout_mode_text(
            space_vertically=kwargs.get("layout_mode_space_vertically", True),
            scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
            strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
            debug_path=kwargs.get("layout_mode_debug_path"),
            font_height_weight=kwargs.get("layout_mode_font_height_weight", 1),
        )

    if args:
        leading = args[0]
        if isinstance(leading, str):
            # Legacy call form: the first two positional values are not
            # used, orientations and space width follow at index 2 and 3.
            if len(args) >= 3:
                if not isinstance(args[2], (tuple, int)):
                    raise TypeError(f"Invalid positional parameter {args[2]}")
                orientations = args[2]
            if len(args) >= 4:
                if not isinstance(args[3], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[3]}")
                space_width = args[3]
        elif isinstance(leading, (tuple, int)):
            # Call form with orientations first and optional space width.
            orientations = leading
            if len(args) >= 2:
                if not isinstance(args[1], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[1]}")
                space_width = args[1]
        else:
            raise TypeError(f"Invalid positional parameter {leading}")

    # A single orientation is normalised to a one-element tuple.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self,
        self.pdf,
        orientations,
        space_width,
        PG.CONTENTS,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )

2079 

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: tuple[int, ...] = (0, 90, 270, 360),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    *,
    known_ids: Optional[set[int]] = None,
) -> str:
    """
    Extract text from an XObject.

    All parameters are handed over to ``self._extract_text`` together with
    ``self.pdf``; the XObject itself is used as the source object.

    Args:
        xform: the XObject to extract the text from
        orientations: orientations to look for, forwarded unchanged.
            NOTE(review): the default differs from the default of
            ``extract_text`` (0, 90, 180, 270): 180 is missing and 360 is
            listed instead - confirm whether this is intended.
        space_width: force default space width (if not extracted from font) (default 200)
        visitor_operand_before: forwarded unchanged; called with four arguments
        visitor_operand_after: forwarded unchanged; called with four arguments
        visitor_text: forwarded unchanged; called with five arguments
        known_ids: forwarded unchanged; ``_extract_text`` keeps the ``id()``
            of the form XObjects being processed in this set to detect
            cyclic references

    Returns:
        The extracted text

    """
    visitors = (visitor_operand_before, visitor_operand_after, visitor_text)
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,  # no content key; extract_text passes PG.CONTENTS in this position
        *visitors,
        known_ids=known_ids,
    )

2117 

def _get_fonts(self) -> tuple[set[str], set[str]]:
    """
    Get the names of embedded fonts and unembedded fonts.

    The font names are collected by ``_get_fonts_walk``, starting at the
    object returned by ``self.get_object()``.

    Returns:
        A tuple (set of embedded fonts, set of unembedded fonts); the
        unembedded fonts are all collected fonts minus the embedded ones.

    """
    page_dict = self.get_object()
    assert isinstance(page_dict, DictionaryObject)
    all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
    return embedded_fonts, all_fonts - embedded_fonts

2133 

# Page boundary boxes. Every accessor is created by
# _create_rectangle_accessor from the key of the box and a tuple of further
# box keys.
# NOTE(review): the second argument presumably lists the keys to fall back on,
# in order, when the box itself is absent (the cropbox documentation below
# names the mediabox as its default) - confirm against
# _create_rectangle_accessor.
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the boundaries of the physical medium on
which the page is intended to be displayed or printed."""

cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
"""
A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the visible region of default user
space.

When the page is displayed or printed, its contents are to be clipped
(cropped) to this rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as
:attr:`mediabox<mediabox>`.
"""

# The three remaining boxes share the same tuple of further keys:
# "/CropBox" first, then the media box.
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the region to which the contents of the
page should be clipped when output in a production environment."""

trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the intended dimensions of the finished
page after trimming."""

artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
"""A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
default user space units, defining the extent of the page's meaningful
content as intended by the page's creator."""

2165 

@property
def annotations(self) -> Optional[ArrayObject]:
    """
    The annotations array of the page.

    Returns:
        The value stored under "/Annots", or None if the page has no such
        entry.

    """
    return cast(ArrayObject, self["/Annots"]) if "/Annots" in self else None

@annotations.setter
def annotations(self, value: Optional[ArrayObject]) -> None:
    """
    Set the annotations array of the page.

    Typically you do not want to set this value, but append to it.
    If you append to it, remember to add the object first to the writer
    and only add the indirect object.

    Passing None removes an existing "/Annots" entry; nothing happens if
    the page has no such entry.
    """
    if value is not None:
        self[NameObject("/Annots")] = value
    elif "/Annots" in self:
        del self[NameObject("/Annots")]

2187 

2188 

class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages that is backed by two callables instead of stored items.

    ``length_function`` reports the current number of pages and
    ``get_function`` returns the page at a given non-negative index.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return one page, or a new virtual list for a slice.

        Raises:
            TypeError: if the index is neither an int nor a slice.
            IndexError: if the int index is out of range.

        """
        if isinstance(index, slice):
            # The slice is served lazily: the new list maps its own
            # positions onto the selected indices of this list.
            indices = range(*index.indices(len(self)))
            return type(self)(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("Sequence indices must be integers")
        size = len(self)
        # support negative indexes
        position = index + size if index < 0 else index
        if position < 0 or position >= size:
            raise IndexError("Sequence index out of range")
        return self.get_function(position)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove one page, or all pages selected by a slice, from the page tree.

        Raises:
            TypeError: if the index is neither an int nor a slice.
            IndexError: if the int index is out of range.
            PdfReadError: if the page is not listed in the "/Kids" of its
                parent.

        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for position in sorted(range(*index.indices(len(self))), reverse=True):
                del self[position]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("Index must be integers")
        size = len(self)
        if index < 0:
            # support negative indexes
            index += size
        if index < 0 or index >= size:
            raise IndexError("Index out of range")

        ind = self[index].indirect_reference
        assert ind is not None
        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
            "/Parent", None
        )
        first = True
        # Walk up the page tree: the reference is removed from the "/Kids"
        # of its parent; a parent left without kids is removed from its own
        # parent in the next round.
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            try:
                i = cast(ArrayObject, parent["/Kids"]).index(ind)
                del cast(ArrayObject, parent["/Kids"])[i]
                first = False
                try:
                    assert ind is not None
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except Exception:  # pragma: no cover
                    pass
                if "/Count" in parent:
                    parent[NameObject("/Count")] = NumberObject(
                        cast(int, parent["/Count"]) - 1
                    )
                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                    # No more objects in this part of this subtree
                    ind = parent.indirect_reference
                    parent = parent.get("/Parent", None)
            except ValueError:  # from index
                # Failing on the very first level means the page is not part
                # of the tree; on later rounds it just ends the walk.
                if first:
                    raise PdfReadError(f"Page not found in page tree: {ind}")
                break

    def __iter__(self) -> Iterator[PageObject]:
        position, size = 0, len(self)
        while position < size:
            yield self[position]
            position += 1

    def __str__(self) -> str:
        labels = ", ".join(
            f"PageObject({i})" for i in range(self.length_function())
        )
        return f"[{labels}]"

2281 

2282 

def _get_fonts_walk(
    obj: DictionaryObject,
    fnt: set[str],
    emb: set[str],
) -> tuple[set[str], set[str]]:
    """
    Get the set of all fonts and all embedded fonts.

    The fonts are looked up in the "/DR" and "/Resources" entries of ``obj``.
    The walk then recurses into the XObjects of the resources, into the
    annotations listed under "/Annots" and into the normal appearance
    ("/AP" -> "/N") of ``obj``.

    Args:
        obj: dictionary to inspect (a page, an XObject or an annotation)
        fnt: set of font names, extended in place
        emb: set of embedded font names, extended in place

    Returns:
        A tuple (fnt, emb)

    If there is a key called 'BaseFont', that is a font that is used in the document.
    A font is considered embedded if it has a '/CharProcs' entry, or if its
    font descriptor (or the font descriptor of its first descendant font)
    contains a key called 'FontFilex' (where x is null, 2, or 3).

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    """
    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")

    def has_font_file(font: DictionaryObject) -> bool:
        # True if the font descriptor of ``font`` holds a font program key.
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(key in descriptor for key in fontkeys)

    def process_font(f: DictionaryObject) -> None:
        # Register one font dictionary in ``fnt`` and, if embedded, in ``emb``.
        f = cast(DictionaryObject, f.get_object())  # to be sure
        if "/BaseFont" in f:
            fnt.add(cast(str, f["/BaseFont"]))

        embedded = "/CharProcs" in f or has_font_file(f)
        if not embedded and "/DescendantFonts" in f:
            descendants = cast(ArrayObject, f["/DescendantFonts"])
            # An empty array has no descendant font to inspect.
            if len(descendants) > 0:
                embedded = has_font_file(
                    cast(DictionaryObject, descendants[0].get_object())
                )
        if embedded:
            try:
                emb.add(cast(str, f["/BaseFont"]))
            except KeyError:
                # Fonts without a base font name are listed by their subtype.
                emb.add("(" + cast(str, f["/Subtype"]) + ")")

    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
        default_fonts = cast(
            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
        )
        # Iterate over the font dictionaries; iterating over the dictionary
        # itself would only yield the resource names.
        for f in default_fonts.values():
            process_font(f)
    if "/Resources" in obj:
        resources = cast(DictionaryObject, obj["/Resources"])
        if "/Font" in resources:
            for f in cast(DictionaryObject, resources["/Font"]).values():
                process_font(f)
        if "/XObject" in resources:
            for x in cast(DictionaryObject, resources["/XObject"]).values():
                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
    if "/Annots" in obj:
        for a in cast(ArrayObject, obj["/Annots"]):
            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
    if "/AP" in obj:
        appearance = cast(DictionaryObject, obj["/AP"])
        # A missing normal appearance is skipped instead of raising KeyError.
        normal = appearance["/N"] if "/N" in appearance else None
        if isinstance(normal, DictionaryObject):
            if normal.get("/Type") == "/XObject" or normal.get("/Subtype") == "/Form":
                # The normal appearance is a single appearance stream; the
                # "/Type" entry may be absent, so "/Subtype" is checked too.
                _get_fonts_walk(normal, fnt, emb)
            else:
                # Otherwise the entry maps appearance state names to
                # appearance streams: walk the streams, not the names.
                for state in normal.values():
                    state_obj = state.get_object()
                    if isinstance(state_obj, DictionaryObject):
                        _get_fonts_walk(state_obj, fnt, emb)
    return fnt, emb  # return the sets for each page