Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1451 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import decimal 

31import enum 

32import hashlib 

33import re 

34import struct 

35import uuid 

36from io import BytesIO, FileIO, IOBase 

37from itertools import compress 

38from pathlib import Path 

39from types import TracebackType 

40from typing import ( 

41 IO, 

42 Any, 

43 Callable, 

44 Dict, 

45 Iterable, 

46 List, 

47 Optional, 

48 Pattern, 

49 Tuple, 

50 Type, 

51 Union, 

52 cast, 

53) 

54 

55from ._cmap import _default_fonts_space_width, build_char_map_from_dict 

56from ._doc_common import DocumentInformation, PdfDocCommon 

57from ._encryption import EncryptAlgorithm, Encryption 

58from ._page import PageObject, Transformation 

59from ._page_labels import nums_clear_range, nums_insert, nums_next 

60from ._reader import PdfReader 

61from ._utils import ( 

62 StrByteType, 

63 StreamType, 

64 _get_max_pdf_version_header, 

65 deprecate, 

66 deprecate_no_replacement, 

67 deprecation_with_replacement, 

68 logger_warning, 

69) 

70from .constants import AnnotationDictionaryAttributes as AA 

71from .constants import CatalogAttributes as CA 

72from .constants import ( 

73 CatalogDictionary, 

74 GoToActionArguments, 

75 ImageType, 

76 InteractiveFormDictEntries, 

77 OutlineFontFlag, 

78 PageLabelStyle, 

79 TypFitArguments, 

80 UserAccessPermissions, 

81) 

82from .constants import Core as CO 

83from .constants import FieldDictionaryAttributes as FA 

84from .constants import PageAttributes as PG 

85from .constants import PagesAttributes as PA 

86from .constants import TrailerKeys as TK 

87from .errors import PyPdfError 

88from .generic import ( 

89 PAGE_FIT, 

90 ArrayObject, 

91 BooleanObject, 

92 ByteStringObject, 

93 ContentStream, 

94 DecodedStreamObject, 

95 Destination, 

96 DictionaryObject, 

97 EmbeddedFile, 

98 Fit, 

99 FloatObject, 

100 IndirectObject, 

101 NameObject, 

102 NullObject, 

103 NumberObject, 

104 PdfObject, 

105 RectangleObject, 

106 StreamObject, 

107 TextStringObject, 

108 TreeObject, 

109 ViewerPreferences, 

110 create_string_object, 

111 hex_to_rgb, 

112 is_null_or_none, 

113) 

114from .pagerange import PageRange, PageRangeSpec 

115from .types import ( 

116 AnnotationSubtype, 

117 BorderArrayType, 

118 LayoutType, 

119 OutlineItemType, 

120 OutlineType, 

121 PagemodeType, 

122) 

123from .xmp import XmpInformation 

124 

125ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all() 

126DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 

127 

128 

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of PDF content.

    Each plain member occupies one bit so members can be combined with ``|``.
    ``IMAGES`` is the union of the three image-related flags.
    (Presumably used to select what gets removed; see the callers.)
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

140 

141 

142def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: 

143 hash = hashlib.md5() 

144 for block in iter(lambda: stream.read(blocksize), b""): 

145 hash.update(block) 

146 return hash.hexdigest() 

147 

148 

149class PdfWriter(PdfDocCommon): 

150 """ 

151 Write a PDF file out, given pages produced by another class or through 

152 cloning a PDF file during initialization. 

153 

154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`. 

155 

156 Args: 

157 clone_from: identical to fileobj (for compatibility) 

158 

159 incremental: If true, loads the document and set the PdfWriter in incremental mode. 

160 

161 When writing incrementally, the original document is written first and new/modified 

162 content is appended. To be used for signed document/forms to keep signature valid. 

163 

164 full: If true, loads all the objects (always full if incremental = True). 

165 This parameter may allow loading large PDFs. 

166 

167 """ 

168 

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Set up the writer state and, when a source is available, clone it.

    Args:
        fileobj: Source document or output target; when it designates a
            non-empty file or stream (or is a reader) it is cloned.
        clone_from: Explicit document to clone.
        incremental: Load the document and write changes incrementally.
        full: Load all objects; the writer only stays in incremental mode
            when ``incremental`` is also true.

    Raises:
        PyPdfError: ``fileobj`` cannot be turned into a ``PdfReader`` while
            loading in incremental/full mode.

    """
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """Already translated IDs, stored per source document.
    dict[id(pdf)][(idnum, generation)]
    """

    self._info_obj: Optional[PdfObject]
    """The PDF file's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if not self.incremental:
        self._header = b"%PDF-1.3"
        producer_info = DictionaryObject(
            {NameObject("/Producer"): create_string_object("pypdf")}
        )
        self._info_obj = self._add_object(producer_info)
    else:
        # Normalise the source step by step: path -> in-memory stream -> reader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as handle:
                fileobj = BytesIO(handle.read())
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed

    def _resolve_clone_source(
        source: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        explicit: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which document (if any) has to be cloned.
        if isinstance(source, (str, Path, IO, BytesIO)) and (
            source == "" or explicit is not None
        ):
            return explicit
        usable = True
        if isinstance(source, (str, Path)):
            # A missing or empty file is not a document to clone.
            candidate = Path(str(source))
            if not candidate.exists() or candidate.stat().st_size == 0:
                usable = False
        if isinstance(source, (IOBase, BytesIO)):
            # Seeking to the end yields the stream size; the previous
            # position is restored afterwards.
            position = source.tell()
            if source.seek(0, 2) == 0:
                usable = False
            source.seek(position, 0)
        return source if usable else explicit

    clone_from = _resolve_clone_source(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is None:
        # No source document: create an empty page tree and a catalog.
        page_tree_root = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject("/Pages"),
                NameObject(PA.COUNT): NumberObject(0),
                NameObject(PA.KIDS): ArrayObject(),
            }
        )
        self._pages = self._add_object(page_tree_root)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    else:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True

    if full and not incremental:
        # "full" alone only affects loading; the writer is not incremental.
        self.incremental = False
    if isinstance(self._ID, list):
        # Store both identifier entries as byte strings.
        for position in (0, 1):
            entry = self._ID[position]
            if isinstance(entry, TextStringObject):
                self._ID[position] = ByteStringObject(entry.get_original_bytes())

293 

294 # for commonality 

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property; the writer always reports ``False``.

    The attribute exists so that the writer offers the same property as
    :class:`PdfReader<pypdf.PdfReader>`.
    """
    return False

304 

@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the dictionary stored in ``_root_object``.

    Note:
        Recommended only for read access.

    """
    catalog = self._root_object
    return catalog

315 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfReader.

    Returns:
        /Info Dictionary; None if the entry does not exist

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace or remove the "/Info" dictionary.

    Passing ``None`` blanks the slot of the current info object (best
    effort) and forgets it. Any other value has its entries copied into the
    writer's own info dictionary, which is created on demand.
    """
    if value is None:
        try:
            slot = self._info_obj.indirect_reference.idnum - 1  # type: ignore
            self._objects[slot] = None
        except (KeyError, AttributeError):
            # No registered info object to blank out.
            pass
        self._info_obj = None
        return

    if self._info_obj is None:
        self._info_obj = self._add_object(DictionaryObject())
    target = cast(DictionaryObject, self._info_obj.get_object())
    # The dictionary is emptied before the new value is resolved and copied.
    target.clear()
    target.update(cast(DictionaryObject, value.get_object()))

345 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Store ``value`` under "/Metadata" in the root object.

    ``None`` removes an existing "/Metadata" entry.
    """
    if value is not None:
        self.root_object[NameObject("/Metadata")] = value
    elif "/Metadata" in self.root_object:
        del self.root_object["/Metadata"]

    # NOTE(review): a property setter's return value is discarded by the
    # assignment syntax; the read is kept because accessing the root object's
    # property may have side effects - confirm before removing.
    return self.root_object.xmp_metadata  # type: ignore

361 

@property
def with_as_usage(self) -> bool:
    """Deprecated accessor for ``_with_as_usage``; reports the deprecation first."""
    deprecate_no_replacement("with_as_usage", "6.0")
    flag = self._with_as_usage
    return flag

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated mutator for ``_with_as_usage``; reports the deprecation first."""
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value

371 

372 def __enter__(self) -> "PdfWriter": 

373 """Store how writer is initialized by 'with'.""" 

374 c: bool = self._cloned 

375 t = self.temp_fileobj 

376 self.__init__() # type: ignore 

377 self._cloned = c 

378 self._with_as_usage = True 

379 self.fileobj = t # type: ignore 

380 return self 

381 

382 def __exit__( 

383 self, 

384 exc_type: Optional[Type[BaseException]], 

385 exc: Optional[BaseException], 

386 traceback: Optional[TracebackType], 

387 ) -> None: 

388 """Write data to the fileobj.""" 

389 if self.fileobj and not self._cloned: 

390 self.write(self.fileobj) 

391 

@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    This should be something like ``'%PDF-1.5'``. It is recommended to set
    the lowest version that supports all features which are used within the
    PDF file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding it first when a str is given."""
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

410 

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Args:
        obj: The object to register.

    Returns:
        The reference to the object. An object that already carries a
        reference into this writer is not registered again; its existing
        reference is returned.

    """
    existing = getattr(obj, "indirect_reference", None)
    if existing is not None and existing.pdf == self:
        return existing
    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        raw_contents = obj.get(PG.CONTENTS, None)
        if isinstance(raw_contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    # Object numbers are 1-based, so the new number equals the list length.
    new_reference = IndirectObject(len(self._objects), 0, self)
    obj.indirect_reference = new_reference
    return obj.indirect_reference

425 

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored under an object number or reference.

    Args:
        indirect_reference: A 1-based object number, or a reference that
            belongs to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: The reference belongs to another document.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    elif indirect_reference.pdf != self:
        raise ValueError("PDF must be self")
    else:
        idnum = indirect_reference.idnum
    stored = self._objects[idnum - 1]
    assert stored is not None, "mypy"
    return stored

438 

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot of an existing indirect object.

    Args:
        indirect_reference: A 1-based object number, or a reference that
            belongs to this writer.
        obj: The replacement. An object referenced from another document is
            cloned into this writer first.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: The reference belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    # The generation number of the object being replaced is kept.
    generation = self._objects[slot].indirect_reference.generation  # type: ignore
    foreign_reference = getattr(obj, "indirect_reference", None)
    if foreign_reference is not None and foreign_reference.pdf != self:
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, generation, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

459 

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link it into the page tree.

    Args:
        page: The page to add.
        index: Position of the page in the flattened page list.
        excluded_keys: Keys that must not be copied; the parent entry and
            "/StructParents" are always excluded.

    Returns:
        The page object that is now part of this writer.

    Raises:
        ValueError: ``page`` is not a valid page object.
        PyPdfError: More than 1000 ancestors were walked while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    skipped_keys = [*excluded_keys, PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        source_reference = source_page.indirect_reference
        del self._id_translated[id(source_reference.pdf)][  # type: ignore
            source_reference.idnum  # type: ignore
        ]
    except Exception:
        pass
    page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )
    if source_page.pdf is not None:
        # Raise the writer's header to the source document's one if needed.
        self.pdf_header = _get_max_pdf_version_header(
            self.pdf_header, source_page.pdf.pdf_header
        )
    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx < 0:
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)
    else:
        kids.insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)

    # Increment the page count of the node and of each of its ancestors.
    levels = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        levels += 1
        if levels > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page

506 

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Sets the "NeedAppearances" flag in the PDF writer.

    The "NeedAppearances" flag indicates whether the appearance dictionary
    for form fields should be automatically generated by the PDF viewer or
    if the embedded appearance should be used.

    An AcroForm dictionary is created in the root object when it is
    missing. Any exception is logged as a warning instead of being raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

539 

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ViewerPreferences object and store it in the root object.

    Returns:
        The newly created ViewerPreferences object.

    """
    preferences = ViewerPreferences()
    key = NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    self._root_object[key] = self._add_object(preferences)
    return preferences

546 

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Add a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys that are not copied to the added page.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)

571 

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or past the end appends.
        excluded_keys: Keys that are not copied to the added page.

    Returns:
        The added PageObject.

    Raises:
        ValueError: A negative ``index`` reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, index, excluded_keys)

599 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Look up the page number of the page behind a reference.

    Args:
        indirect_reference: An object number or a reference; ``None`` and
            null objects are accepted.

    Returns:
        The page number or None

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    target = indirect_reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

623 

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    blank = PageObject.create_blank_page(self, width, height)
    appended = self.add_page(blank)
    return appended

648 

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page already exists at ``index``, the
    size of that page is used. Otherwise the missing size is resolved by
    ``PageObject.create_blank_page``.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the page object that is part of this
        writer.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Only look at the page at ``index`` when it exists: the previous
    # condition, ``width is None or (height is None and index < n)``,
    # indexed ``self.pages`` out of range whenever the width was omitted.
    size_missing = width is None or height is None
    if size_missing and index < self.get_num_pages():
        reference_page = self.pages[index]
        width = reference_page.mediabox.width
        height = reference_page.mediabox.height
    blank = PageObject.create_blank_page(self, width, height)
    # Return what insert_page returns, as add_blank_page does with add_page,
    # so that the caller receives the page stored in the document.
    return self.insert_page(blank, index)

682 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the value provided by the parent class property."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    Args:
        dest: ``None`` removes the entry; a string is stored as a text
            string; a Destination contributes its destination array; a
            page is wrapped in a destination named "Opening" using
            ``PAGE_FIT``. Any other value leaves the entry untouched.

    """
    action_key = "/OpenAction"
    if dest is None:
        try:
            del self._root_object[action_key]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        self._root_object[NameObject(action_key)] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[NameObject(action_key)] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        page_reference = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        opening = Destination("Opening", page_reference, PAGE_FIT)
        self._root_object[NameObject(action_key)] = opening.dest_array

708 

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is registered under a random name in the "/JavaScript"
    entry of the root object's "/Names" dictionary; missing dictionaries
    are created.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_tree = cast(DictionaryObject, names["/JavaScript"])
    script_entries = cast(ArrayObject, javascript_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    script_entries.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_entries.append(self._add_object(action))

743 

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    embedded = EmbeddedFile._create_new(self, filename, data)
    return embedded

761 

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy pages from reader to writer. Includes an optional callback
    parameter which is invoked after pages are appended to the writer.

    ``append`` should be preferred.

    Args:
        reader: a PdfReader object whose pages are added, in order, to
            this writer.
        after_page_append:
            Callback function that is invoked after each page is appended
            to the writer. Its single parameter is a reference to the page
            just appended to the document. A value that is not callable is
            ignored.

    """
    # Copy pages from reader to writer
    for reader_page in reader.pages:
        writer_page = self.add_page(reader_page)
        # Trigger callback, pass writer page as parameter
        if callable(after_page_append):
            after_page_append(writer_page)

793 

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) to the content of a page.

    An existing array of content streams gets one more stream appended. An
    existing single stream is replaced by a new stream holding the old data,
    a newline and the new data. A page without "/Contents" receives a new
    stream holding only the new data. The page is updated in place.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """

    def _stream_with(data: bytes) -> StreamObject:
        stream = StreamObject()
        stream.set_data(data)
        return stream

    contents_key = NameObject("/Contents")
    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        page[NameObject("/Contents")] = self._add_object(
            _stream_with(new_content_data)
        )
        return

    # First resolve the existing page content. This always is an IndirectObject:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current = page[NameObject("/Contents")].get_object()

    if isinstance(current, ArrayObject):
        # Append a new StreamObject holding the new_content_data
        current.append(self._add_object(_stream_with(new_content_data)))
        page[NameObject("/Contents")] = self._add_object(current)
    elif isinstance(current, StreamObject):
        # Merge new content to existing StreamObject
        combined = current.get_data() + b"\n" + new_content_data
        page[NameObject("/Contents")] = self._add_object(_stream_with(combined))

833 

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Adds an appearance stream to the page content in the form of
    an XObject.

    The stream is registered in the page resources as ``/Fm_<object_name>``
    (sanitized), and commands drawing it at the given offset are merged
    into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    # Make the font resource available on the page, keyed by its base font.
    if font_res is not None:
        font_name = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res

    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    # Prepare XObject resource dictionary on the page
    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        # An existing entry with this name is left untouched.
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

881 

def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Rebuild the normal appearance stream (``/AP /N``) of a text/choice widget.

    Args:
        page: Page owning the widget; only used when ``flatten`` is True.
        field: Field dictionary holding the value (``/V``) and flags (``/Ff``).
        annotation: Widget annotation whose appearance is regenerated.
        font_name: Font resource name replacing the one found in ``/DA``;
            an empty string keeps the name from ``/DA``.
        font_size: Font size to use; a negative value means "take the size
            from ``/DA``", and a resulting size of 0 triggers auto-sizing.
        flatten: When True, the generated appearance is also added to the
            page through ``_add_apstream_object``.

    """
    # Calculate rectangle dimensions
    # (`rct` is the widget box moved to the origin; `_rct` keeps the page position)
    _rct = cast(RectangleObject, annotation[AA.Rect])
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    # Extract font information
    # /DA is looked up on the annotation (inherited), falling back to the AcroForm entry.
    da = annotation.get_inherited(
        AA.DA,
        cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
            AA.DA, None
        ),
    )
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    # Tokenize /DA on whitespace; the two tokens before "Tf" are font name and size.
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    font_height = (
        font_size
        if font_size >= 0
        else float(font_properties[font_properties.index("Tf") - 1])
    )
    if font_height == 0:
        # Size 0: pick a fixed height for multiline fields, else fit the box height.
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    # Write the effective size back so the rebuilt /DA string matches what is drawn.
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                cast(
                    DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
                ).get("/DR", DictionaryObject()),
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary
        dr = cast(
            Dict[Any, Any],
            cast(
                DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
            ).get("/DR", {}),
        )
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    if not is_null_or_none(font_res):
        font_res = cast(DictionaryObject, font_res.get_object())
        font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        try:  # remove width stored in -1 key
            del font_map[-1]
        except KeyError:
            pass
        # font_full_rev maps each character to the bytes written in the stream.
        font_full_rev: Dict[str, bytes]
        if isinstance(font_encoding, str):
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        # Choice field without the Combo bit: show every option, one per line.
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over entries of the previous /N stream except the ones set above.
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary
    # NOTE(review): only `None` is excluded here, while the lookup above used
    # is_null_or_none(); confirm a null object cannot reach this branch.
    if font_res is not None:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        # Reuse the object number of the existing /N stream instead of adding a new object.
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)

1035 

# Empty flag set; used as the default `flags` of update_page_form_field_values,
# where a falsy value leaves the annotation's /Ff entry untouched.
FFBITS_NUL = FA.FfBits(0)

1037 

def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the writer has no /AcroForm dictionary, or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate is passed as None: the flag was already handled above.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is treated as its own field;
        # otherwise the field data is taken from its /Parent (empty dict if absent).
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            # Match on either the qualified name or the plain /T name.
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Tuple form: only the text (first item) is stored as /V, on the widget itself.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # Unknown state names fall back to /Off.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # textbox
                if isinstance(value, tuple):
                    # value[1] is the font id and value[2] the font size.
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1159 

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Find widget annotations that are missing from the AcroForm ``/Fields``
    array and register them there.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created on
    the document root when they do not exist yet.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        # No page given: process every page and concatenate the results.
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    acro_form_key = CatalogDictionary.ACRO_FORM
    try:
        acro_form = cast(DictionaryObject, self._root_object[acro_form_key])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(acro_form_key)] = acro_form

    fields_key = InteractiveFormDictEntries.Fields
    try:
        known_fields = cast(ArrayObject, acro_form[fields_key])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(fields_key)] = known_fields

    if "/Annots" not in page:
        return reattached

    page_annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(page_annotations):
        was_reference = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        # Only widgets carrying their own field type are candidates.
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_listed:
            continue
        if not was_reference:
            # Direct objects must become indirect before they can be listed.
            page_annotations[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1209 

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if self.incremental:
        # Incremental mode: one slot per object number below the reader's /Size,
        # each filled with a replica of the reader's object when it exists.
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for i in range(len(self._objects)):
            o = reader.get_object(i + 1)
            if o is not None:
                self._objects[i] = o.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    # (hash snapshot of the freshly copied objects; 0 stands for an empty slot)
    if self.incremental:
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]
    self._flatten()
    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Non-incremental: every page becomes a direct kid of the /Pages node.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])

1248 

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: the '/Root' section, then the
    '/Info' dictionary and the '/ID' array when the reader has them.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Optional callback; once cloning is done it is called once per
            entry of the ``/Kids`` array of the writer's page tree root,
            receiving the resolved object as its single argument.

    """
    self.clone_reader_document_root(reader)

    # _info_obj was reset to None by clone_reader_document_root().
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the cloned /Info hash as the "original" one.
            info_hash = self._info.hash_bin()
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = info_hash
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        # Nothing to clone (e.g. the reader exposes no usable ``_ID``).
        pass

    if not callable(after_page_append):
        return
    pages_root = cast(DictionaryObject, self._pages.get_object())
    for kid in cast(ArrayObject, pages_root["/Kids"]):
        after_page_append(kid.get_object())

1296 

def _compute_document_identifier(self) -> ByteStringObject:
    """
    Build an identifier from the current document content.

    The PDF structure is written to an in-memory buffer and the buffer is
    passed to ``_rolling_checksum``; the UTF-8 encoded result is returned.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)  # rewind so the checksum reads from the start
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1302 

def generate_file_identifiers(self) -> None:
    """
    Set the ``/ID`` pair of the PDF that will be written.

    The only goal is uniqueness; reproducibility is not required.
    For a file that has no identifier yet, both entries get the same
    freshly computed value. Otherwise the first entry is kept and only
    the second one is recomputed, so a reader resolving a file reference
    can tell "same file" (both match) from "other version of the file"
    (only the first matches). See §14.4 "File Identifiers".
    """
    current = self._ID
    if current:
        permanent_id = current[0]
        changing_id = self._compute_document_identifier()
    else:
        permanent_id = changing_id = self._compute_document_identifier()
    self._ID = ArrayObject((permanent_id, changing_id))

1322 

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    effective_owner_password = (
        user_password if owner_password is None else owner_password
    )

    if algorithm is None:
        # Legacy selection through the boolean flag.
        chosen = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        member_name = algorithm.replace("-", "_")
        try:
            chosen = getattr(EncryptAlgorithm, member_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(chosen, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, effective_owner_password)

    previous_entry = self._encrypt_entry
    if previous_entry:
        # `encrypt` was called before: reuse the object slot of the old entry.
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry

1381 

def write_stream(self, stream: StreamType) -> None:
    """
    Write the document to an already opened stream.

    A warning is logged when the stream exposes a ``mode`` that is not
    binary. In incremental mode the reader's original bytes are copied
    first and an increment is appended only if some object is new or
    modified; otherwise the full structure, cross-reference table and
    trailer are written.

    Args:
        stream: Destination stream.

    """
    text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if text_mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    # Incremental update: original content first, then the changes (if any).
    source = self._reader.stream
    source.seek(0)
    stream.write(source.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1405 

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method
        (from a path), and the stream that was written to. A stream opened
        by this method is already closed when it is returned; a stream
        supplied by the caller is flushed and left open.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must be closed even if writing fails,
        # otherwise the handle leaks.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream

1437 

def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.

    List the objects that would be written in the increment: every
    non-empty slot that either lies beyond the recorded original hashes
    (new object) or whose current hash differs from the recorded one
    (modified object).
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    recorded_count = len(self._original_hash)
    changed = []
    for slot, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        is_new = slot >= recorded_count
        if is_new or candidate.hash_bin() != self._original_hash[slot]:
            changed.append(cast(IndirectObject, candidate).indirect_reference)
    return changed

1461 

def _write_increment(self, stream: StreamType) -> None:
    """
    Append the new/modified objects to ``stream``, followed by a
    cross-reference stream object, ``startxref`` and the ``%%EOF`` marker.

    Args:
        stream: Stream to append to; ``tell()`` is used to record offsets.

    """
    object_positions = {}
    # Each entry is a [first object number, count] pair of consecutive numbers.
    object_blocks = []
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        # Same selection as list_objects_in_increment(): new slot or changed hash.
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            # A gap in the numbering closes the current block and opens a new one.
            if idnum != current_stop:
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # The xref stream gets the object number right after the last slot.
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is only listed when it is a new object or its hash changed.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One 6-byte record per written object, matching /W [1 4 1]:
    # type 1, 4-byte big-endian offset, generation 0.
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1532 

1533 def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: 

1534 object_positions = [] 

1535 free_objects = [] 

1536 stream.write(self.pdf_header.encode() + b"\n") 

1537 stream.write(b"%\xE2\xE3\xCF\xD3\n") 

1538 

1539 for idnum, obj in enumerate(self._objects, start=1): 

1540 if obj is not None: 

1541 object_positions.append(stream.tell()) 

1542 stream.write(f"{idnum} 0 obj\n".encode()) 

1543 if self._encryption and obj != self._encrypt_entry: 

1544 obj = self._encryption.encrypt_object(obj, idnum, 0) 

1545 obj.write_to_stream(stream) 

1546 stream.write(b"\nendobj\n") 

1547 else: 

1548 object_positions.append(-1) 

1549 free_objects.append(idnum) 

1550 free_objects.append(0) # add 0 to loop in accordance with specification 

1551 return object_positions, free_objects 

1552 

1553 def _write_xref_table( 

1554 self, stream: StreamType, object_positions: List[int], free_objects: List[int] 

1555 ) -> int: 

1556 xref_location = stream.tell() 

1557 stream.write(b"xref\n") 

1558 stream.write(f"0 {len(self._objects) + 1}\n".encode()) 

1559 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) 

1560 free_idx = 1 

1561 for offset in object_positions: 

1562 if offset > 0: 

1563 stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) 

1564 else: 

1565 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) 

1566 free_idx += 1 

1567 return xref_location 

1568 

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer, ``startxref`` and the end-of-file marker.

    The trailer dictionary always carries the size and the root
    reference; the info, ID and encrypt entries are added when the
    writer has them.

    Args:
        stream: Destination stream.
        xref_location: Offset of the cross-reference table, written after
            ``startxref``.

    """
    stream.write(b"trailer\n")
    object_count = NumberObject(len(self._objects) + 1)
    root_reference = self.root_object.indirect_reference
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): object_count,
            NameObject(TK.ROOT): root_reference,
        }
    )
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1592 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Setting ``None`` removes the /Info entry; setting a dictionary clears
    the existing entries (if any) and then adds the given ones.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    current = self._info
    if current is not None:
        # Replace rather than merge: drop the existing entries first.
        current.clear()
    self.add_metadata(value)

1620 

def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is resolved (when it is a PDF object), converted with
    ``str`` and stored as a string object; the info dictionary is created
    first if the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    entries = {}
    for field_name, raw_value in list(infos.items()):
        resolved = (
            raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        )
        entries[NameObject(field_name)] = create_string_object(str(resolved))

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(entries)

1640 

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects. Objects referenced
            only from the trailer (root, info, ID, encrypt entry) are kept.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk dictionaries/arrays recursively: mark every referenced object
        # as used and redirect references to merged duplicates.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            # Duplicate: remember its reference and drop the object itself.
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # map every dropped duplicate to the first object with the same hash
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for first_ref, duplicates in self._idnum_hash.values():
        for duplicate in duplicates:
            cnv_rev[duplicate] = first_ref

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if not remove_orphans:
        return

    # Objects referenced only from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # The writer may have no /Info dictionary at all.
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    # The /Encrypt dictionary is referenced from the trailer only as well.
    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1715 

def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    This method used to resolve circular references to Page objects
    (e.g. annotations pointing back to their page) so that pages were not
    duplicated in the output. That logic has been removed: the method now
    only reports its removal through ``deprecate`` and does not use
    ``root``.

    Args:
        root: The root of the PDF object tree to sweep (unused).

    """
    removal_notice = (
        "_sweep_indirect_references has been removed, please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)

1750 

def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    This method used to map an indirect object of another PDF file to an
    indirect object of this writer. That logic has been removed: the
    method now only reports its removal through ``deprecate`` and does
    not use ``data``.

    Args:
        data: The `IndirectObject` to resolve (unused).

    Returns:
        A placeholder ``IndirectObject(0, 0, self)``.

    """
    removal_notice = (
        "_resolve_indirect_object has been removed, please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)
    placeholder = IndirectObject(0, 0, self)
    return placeholder

1783 

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Build the indirect reference pointing at ``obj`` in this writer.

    Args:
        obj: An object that is already stored in ``self._objects``.

    Returns:
        An ``IndirectObject`` with generation 0 whose id number is the
        1-based position of ``obj`` in ``self._objects``.

    """
    # Object id numbers are 1-based while the storage is 0-based.
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1789 

def get_outline_root(self) -> TreeObject:
    """
    Return the outline tree of the catalog, creating it when missing.

    If the catalog has no outlines entry, an empty ``TreeObject`` is
    registered with the writer and referenced from the catalog. If the
    entry exists but is not a ``TreeObject``, it is wrapped in one and the
    stored object is replaced by the wrapper.

    Returns:
        The outline root as a ``TreeObject``.

    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        wrapped = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, wrapped)
        outline = wrapped
    # Sanity check: the tree must be retrievable through its own reference.
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1808 

def get_threads_root(self) -> ArrayObject:
    """
    Return the catalog's thread array, creating an empty one if needed.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    catalog = self._root_object
    if CO.THREADS in catalog:
        # Entries in the catalog dictionary
        return cast(ArrayObject, catalog[CO.THREADS])

    new_threads = ArrayObject()
    catalog[NameObject(CO.THREADS)] = new_threads
    return new_threads

1827 

@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    Delegates to :meth:`get_threads_root`.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    return self.get_threads_root()

1839 

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item object into the outline tree.

    Args:
        page_destination: The outline item to insert (or a reference to
            it). If it resolves to a ``PageObject``, a ``Destination``
            named ``"page #<n>"`` targeting that page is built and
            inserted instead.
        parent: Outline node receiving the new child. Defaults to the
            outline root.
        before: Existing item whose ``indirect_reference`` is handed to
            ``insert_child`` as the insertion point; ``None`` passes no
            insertion point.
        is_open: Stored on the item under ``/%is_open%``. When ``False``
            the parent counter callback is replaced by one returning 0.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open: they were previously dropped here,
        # which always put page-based items at the outline root, opened.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1877 

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a plain mapping.

    The mapping is copied into a fresh ``TreeObject`` which is then passed
    to :meth:`add_outline_item_destination` together with the remaining
    arguments.

    Args:
        outline_item: Key/value pairs of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference of the inserted outline item.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # NOTE: an earlier version copied outline_item["/A"] into a separate
    # DictionaryObject registered with the writer and stored its reference
    # under "/A". That code was marked "currently unreachable" and disabled.
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1900 

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject`` or a reference to a page. ``None`` creates an
            item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items. Defaults to the outline root.
        before: Forwarded to :meth:`add_outline_item_destination`
            (presumably the sibling before which the item is inserted).
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional call without ``before``: every argument from
        # ``before`` onwards is shifted one slot, so re-dispatch with
        # ``before=None`` and the remaining values moved back in place.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        # NOTE(review): page_ref stays unbound (UnboundLocalError below) when
        # page_number is none of the three supported types.
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Index outside the current pages: keep the raw number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        # The GoTo action is stored as its own indirect object.
        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1978 

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always; use :meth:`add_outline_item` instead.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1983 

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destinations.

    The array returned by ``get_named_dest_root()`` is treated as a flat
    sequence of name, value pairs. The new pair is inserted in front of the
    first name that compares greater than ``title``, or appended at the end
    when there is none.

    Args:
        title: Name of the destination.
        destination: Destination array, or a reference to it.

    """
    names = self.get_named_dest_root()
    # Names sit at the even indices; their values follow at the odd ones.
    for name_pos in range(0, len(names), 2):
        if title < names[name_pos]:
            # Value first, then the name at the same index, so that the
            # name ends up right before its value.
            names.insert(name_pos, destination)
            names.insert(name_pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1997 

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    Args:
        page_destination: Object providing a ``dest_array`` attribute and
            a ``"/Title"`` entry used as the destination name.

    Returns:
        The indirect reference of the stored destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_name = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_name, dest_array_ref)
    return dest_array_ref

2008 

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination pointing at a page.

    A GoTo action dictionary targeting the page with a ``FIT_H`` fit at
    the fixed value 826 is stored in the writer and registered under
    ``title``.

    Args:
        title: Name of the destination; converted to ``TextStringObject``
            when it is not one already.
        page_number: Index into the ``/Kids`` array of the page tree root.

    Returns:
        The indirect reference of the stored action dictionary.

    """
    page_ref = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    goto_action = DictionaryObject()
    goto_action.update(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(goto_action)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, action_ref)
    return action_ref

2031 

def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Calls :meth:`remove_objects_from_page` with
    ``ObjectDeletionFlag.ALL_ANNOTATIONS`` for every page of the writer.
    """
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.ALL_ANNOTATIONS)

2036 

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    The removal itself is done page by page in
    ``_remove_annots_from_page``.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

2052 

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of one page whose subtype is in ``subtypes``.

    Args:
        page: The page, or a reference to it.
        subtypes: Annotation subtypes to delete; ``None`` deletes all
            annotations.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    idx = 0
    # The array shrinks while we walk it, so the index only advances when
    # the current entry is kept.
    while idx < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[idx]
        annotation = cast(DictionaryObject, entry.get_object())
        if subtypes is not None and cast(str, annotation["/Subtype"]) not in subtypes:
            idx += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        del page[PG.ANNOTS][idx]  # type:ignore

2070 

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation flags (``LINKS``, ``ATTACHMENTS``, ``OBJECTS_3D``,
    ``ALL_ANNOTATIONS``) are handled by deleting the matching annotations
    and returning. The other flags are handled by deleting operations from
    the page content stream and from the form XObjects it references.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward text_filters; otherwise a TEXT flag given inside a
            # list would ignore the font filter and delete all text.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators to drop, depending on the requested flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        # Delete the matching operations of one content stream in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # Remember the font selected for the following text.
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # With a text filter, text is only deleted while one of the
                # listed fonts is the current font.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        # Walk the /XObject resources of ``elt``, clean nested forms and
        # return the names of the image and form XObjects found.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2243 

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` is accepted as well
            and is treated as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits into the equally named deletion flags.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags |= ObjectDeletionFlag[flag_name]

    for page in self.pages:
        self.remove_objects_from_page(page, deletion_flags)

2267 

def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    if not font_names:
        font_names = []

    def collect_fonts(
        obj: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        # Recursively walk ``obj`` and record, per normalized font name,
        # the dictionary keys under which a /Font dictionary was found.
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    normalized,
                    {"normalized_font_name": normalized, "resource_ids": []},
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    for page in self.pages:
        text_filters = {}
        if font_names:
            # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
            # Font names need to be converted to resource names/IDs for easier removal
            page_fonts = collect_fonts(page.get("/Resources"))
            text_filters["font_ids"] = [
                resource_id
                for wanted in font_names
                if wanted in page_fonts
                for resource_id in page_fonts[wanted]["resource_ids"]
            ]
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2324 

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. No border will be
            drawn if this argument is omitted.

    Raises:
        ValueError: If ``rect`` is a string containing a token that is not
            a number.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse the documented "[ xLL yLL xUR yUR ]" form. The previous
        # NumberObject(rect) could not represent four coordinates.
        rect = RectangleObject(
            [float(token) for token in rect.strip().strip("[]").split()]
        )
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    # The URI action triggered by the link.
    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    # The link annotation itself.
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2391 

# Layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)

2401 

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the catalog's ``/PageLayout`` entry, or ``None`` when it is absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)

2407 

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one. If it
    is not one of ``_valid_layouts``, a warning is logged but the value is
    still written to the catalog.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # ', '.join(...) yields a readable comma-separated list; the
            # former {'', ''.join(...)} rendered a tuple repr instead.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2442 

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout.

    Delegates to ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    self._set_page_layout(layout)

2470 

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading it returns the result of ``_get_page_layout()``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    return self._get_page_layout()

2495 

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment delegates to _set_page_layout.
    self._set_page_layout(layout)

2499 

# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)

2508 

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the catalog's ``/PageMode`` entry, or ``None`` when it is absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)

2514 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading it returns the result of ``_get_page_mode()``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    return self._get_page_mode()

2537 

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # A value that is not yet a NameObject is converted to one; an
    # unknown name only triggers a warning and is still written.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2549 

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a
            ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        raw_dest = cast(Dict[Any, Any], to_add[NameObject("/Dest")])
        # The original author found the dict() copy necessary for reading
        # "fit_args"; it is kept as is.
        link_fit = Fit(fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"])
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        # Link the parent annotation back to its popup.
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2603 

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` given as a ``NameObject`` is
    converted. Otherwise, if the annotation has an ``/A`` action whose
    ``/D`` entry is a ``NameObject``, that entry is converted.

    Args:
        page: The page to clean, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2630 

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or any
            object offering ``seek`` and ``read``.

    Returns:
        A ``BytesIO`` holding the complete content, and the reader's
        encryption object when ``fileobj`` is a ``PdfReader`` with a
        truthy ``_encryption`` (``None`` otherwise).

    Raises:
        NotImplementedError: If ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        # A string is assumed to be a path.
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption if fileobj._encryption else None
        source = fileobj.stream
        saved_position = source.tell()
        source.seek(0)
        copied = BytesIO(source.read())
        # reset the stream to its original location
        source.seek(saved_position)
        return copied, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2666 

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    bookmark = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page range sits in the outline_item slot, i.e. the caller
        # omitted outline_item, so the positional arguments are shifted.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        bookmark = None

    self.merge(
        None,
        fileobj,
        bookmark,
        pages,
        import_outline,
        excluded_fields,
    )

2734 

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
) -> None:
    """
    Merge pages of ``fileobj`` into this writer.

    Args:
        position: Page index handed to ``insert_page`` for the first merged
            page and incremented for each following one. With ``None`` the
            pages are added through ``add_page`` instead.
        fileobj: Either an already opened document (a ``PdfDocCommon``
            instance, used as is) or anything accepted by
            ``_create_stream``, from which a non-strict ``PdfReader``
            is built.
        outline_item: If given, an outline item with this title is added,
            pointing at the first merged page; imported outlines are
            inserted below it.
        pages: ``None`` (all pages), a
            :class:`PageRange<pypdf.pagerange.PageRange>`,
            a ``(start, stop[, step])`` tuple or a list of page
            indices / page objects.
        import_outline: Set to ``False`` to skip importing the source
            document's outline.
        excluded_fields: Fields/keys to be ignored while cloning;
            if ``/Annots`` is part of the list, the annotations are ignored,
            if ``/B`` is part of the list, the articles are ignored.

    Raises:
        TypeError: ``pages`` is neither ``None``, a PageRange, a tuple
            nor a list.

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, encryption_obj = self._create_stream(fileobj)
        # Parse whatever stream _create_stream produced.
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()

    # Turn the page selection into something iterable.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, tuple):
        if len(pages) <= 3:
            pages = list(range(*pages))
        # a longer tuple is kept as is and iterated below
    elif not isinstance(pages, list):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the idnum of each source page to its clone in this writer.
    srcpages = {}
    for entry in pages:
        src_page = entry if isinstance(entry, PageObject) else reader.pages[entry]
        assert src_page.indirect_reference is not None
        # The numbers in the exclusion list mark the following key as being
        # excluded on the first level of cloning only.
        exclusions = [*list(excluded_fields), 1, "/B", 1, "/Annots"]
        if position is None:
            cloned = self.add_page(src_page, exclusions)  # type: ignore
        else:
            cloned = self.insert_page(src_page, position, exclusions)  # type: ignore
            position += 1
        srcpages[src_page.indirect_reference.idnum] = cloned
        srcpages[src_page.indirect_reference.idnum].original_page = src_page

    # Needed for the outline processing below.
    reader._named_destinations = reader.named_destinations

    arr: Any

    def _import_named_dest(dest: Any) -> None:
        """Re-target one named destination of the reader and register it."""
        arr = dest.dest_array
        if "/Names" in self._root_object:
            known_names = cast(
                List[Any],
                cast(
                    DictionaryObject,
                    cast(DictionaryObject, self._root_object["/Names"]).get(
                        "/Dests", DictionaryObject()
                    ),
                ).get("/Names", DictionaryObject()),
            )
            if dest["/Title"] in known_names:
                # already exists: should not duplicate it
                return
        target = dest["/Page"]
        if target is None or isinstance(target, NullObject):
            return
        if isinstance(target, int):
            # The destination holds a page number rather than a reference.
            try:
                p = reader.pages[target]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                pass
            return
        if target.indirect_reference.idnum in srcpages:
            arr[NumberObject(0)] = srcpages[
                target.indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _import_named_dest(dest)

    outline_item_typ: TreeObject
    if outline_item is None:
        outline_item_typ = self.get_outline_root()
    else:
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        # TODO: use before parameter
        self._insert_filtered_outline(outline, outline_item_typ, None)

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" in self._root_object:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        else:
            # Take over the reader's form dictionary without its fields.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # trslat has no entry: the field has not been copied
                    # through a page
                    pass
        except KeyError:  # /AcroForm or /Fields does not exist
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2924 

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The articles of the source thread are walked through their ``/N``
    links, starting at ``/F``. For every article whose page has a clone
    (as reported by ``_get_cloned_page``) a new article is created, linked
    to the previously created one and appended to the ``/B`` array of
    the cloned page. Walking stops when the first article is reached again.

    Args:
        thread: thread entry from the reader's array of threads
        pages: mapping handed to ``_get_cloned_page``
        reader: reader handed to ``_get_cloned_page``

    Returns:
        The added thread as an indirect reference

    """
    # clone is used to keep the link between reader and writer
    nthread = thread.clone(self, force_duplicate=True, ignore_fields=("/F",))
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        cloned_page = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if cloned_page is not None:
            if new_article is None:
                # very first article kept: it becomes the thread's /F
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # chain the new article behind the previous one (/V <-> /N)
                successor = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = successor.indirect_reference
                new_article = successor
            new_article[NameObject("/P")] = cloned_page
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            page_dict = cast("PageObject", cloned_page.get_object())
            if "/B" not in page_dict:
                page_dict[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", page_dict["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # back at the start: close the ring and leave the loop
            new_article[NameObject("/N")] = new_first.indirect_reference  # type: ignore
            new_first[NameObject("/V")] = new_article.indirect_reference  # type: ignore
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2989 

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern searched in the thread titles
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every page in ``pages`` the ``/B`` entries of its ``original_page``
    are inspected. The thread (``/T``) of each entry is added through
    ``_add_articles_thread`` when its object number is not yet present in
    the translation table of ``reader`` and ``fltr`` matches the ``/Title``
    of its ``/I`` dictionary (an empty title is used if missing).

    Args:
        fltr: Regular expression (compiled or as string); anything else
            is replaced by a pattern matching everything.
        pages: Mapping whose values carry an ``original_page`` attribute.
        reader: The reader the pages originate from.

    """
    if isinstance(fltr, str):
        matcher = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        matcher = fltr
    else:
        matcher = re.compile("")
    for cloned in pages.values():
        source_page = cloned.original_page
        for bead in source_page.get("/B", ()):
            thread = bead.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # The translation table is looked up again on every pass, as
            # adding a thread may register new entries in it.
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if matcher.search(title):
                self._add_articles_thread(thread, pages, reader)

3022 

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Look up the clone of a source page.

    Args:
        page: A page dictionary (``/Type`` equal to ``/Page``) or an
            indirect reference.
        pages: Mapping from object number to the cloned page.
        reader: Not used by the lookup itself.

    Returns:
        The ``indirect_reference`` of the entry of ``pages`` stored under
        the object number of ``page``; ``None`` if ``page`` is a
        ``NullObject``, of any other unsupported kind, or if the lookup
        fails for any reason.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # nothing to derive an object number from
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except Exception:
        return None

3039 

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations that remain meaningful in this writer.

    Annotations without any destination are always cloned. Annotations
    pointing to a named destination are kept only if that name is found in
    ``get_named_dest_root()``. Annotations pointing to an explicit
    destination array are kept only if ``_get_cloned_page`` finds a clone
    of the target page; the first array entry is then replaced by it.

    Args:
        annots: The ``/Annots`` value of the source page.
        page: Not used by this method.
        pages: Mapping handed to ``_get_cloned_page``.
        reader: Reader handed to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` with the retained annotations; empty if
        ``annots`` is ``None`` or not a list (a warning is logged
        in the latter case).

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist
    for raw_annot in annots:
        ano = cast("DictionaryObject", raw_annot.get_object())
        is_goto_action_link = (
            ano["/Subtype"] == "/Link"
            and "/A" in ano
            and cast("DictionaryObject", ano["/A"])["/S"] == "/GoTo"
            and "/Dest" not in ano
        )

        if is_goto_action_link:
            # the destination is stored inside the action dictionary
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if d is None or isinstance(d, NullObject):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
                continue
            d = cast("ArrayObject", d)
            p = self._get_cloned_page(d[0], pages, reader)
            if p is not None:
                anc = ano.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", anc["/A"])[
                    NameObject("/D")
                ] = ArrayObject([p, *d[1:]])
                outlist.append(self._add_object(anc))
            continue

        if "/Dest" not in ano:
            # no destination involved at all: plain copy
            outlist.append(self._add_object(ano.clone(self)))
            continue

        d = ano["/Dest"]
        if isinstance(d, str):
            # it is a named dest
            if str(d) in self.get_named_dest_root():
                outlist.append(ano.clone(self).indirect_reference)
            continue
        d = cast("ArrayObject", d)
        p = self._get_cloned_page(d[0], pages, reader)
        if p is not None:
            anc = ano.clone(self, ignore_fields=("/Dest",))
            anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
            outlist.append(self._add_object(anc))
    return outlist

3096 

def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is the outline root (``/Type`` equal to ``/Outlines``) or
    has no ``/Title`` is skipped and processing continues with its
    ``/First`` child. Otherwise the node and its ``/Next`` siblings are
    converted with ``reader._build_outline_item``; an item is retained if
    its page has a clone or if at least one of its children is retained.

    Args:
        node: Outline node to start from (``None`` is accepted).
        pages: Mapping handed to ``_get_cloned_page``.
        reader: Reader owning the outline.

    Returns:
        A list of destination objects; each one carries its retained
        children in ``_filtered_children``.

    """
    new_outline = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        first_child = node.get("/First", None)
        if first_child is not None:
            new_outline += self._get_filtered_outline(
                first_child.get_object(), pages, reader
            )
        return new_outline

    v: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        o = cast("Destination", reader._build_outline_item(node))
        v = self._get_cloned_page(cast("PageObject", o["/Page"]), pages, reader)
        # an item without cloned page gets a null page
        o[NameObject("/Page")] = NullObject() if v is None else v
        if "/First" in node:
            o._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            o._filtered_children = []
        keeps_page = not isinstance(o["/Page"], NullObject)
        if keeps_page or len(o._filtered_children) > 0:
            new_outline.append(o)
        node = node.get("/Next", None)
    return new_outline

3148 

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create an outline item of this writer from a destination.

    Args:
        dest: The destination providing ``/Title``, ``/Page``,
            ``dest_array`` and, through ``node``, the original item.

    Returns:
        The new ``TreeObject``, already registered through ``_add_object``.

    """
    n_ol = TreeObject()
    self._add_object(n_ol)
    n_ol[NameObject("/Title")] = TextStringObject(dest["/Title"])
    points_to_page = not isinstance(dest["/Page"], NullObject)
    source_node = dest.node
    if points_to_page:
        # prefer the original action over a plain destination array
        if source_node is not None and "/A" in source_node:
            n_ol[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            n_ol[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        n_ol[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        n_ol[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return n_ol

3167 

3168 def _insert_filtered_outline( 

3169 self, 

3170 outlines: List[Destination], 

3171 parent: Union[TreeObject, IndirectObject], 

3172 before: Union[None, TreeObject, IndirectObject] = None, 

3173 ) -> None: 

3174 for dest in outlines: 

3175 # TODO: can be improved to keep A and SE entries (ignored for the moment) 

3176 # with np=self.add_outline_item_destination(dest,parent,before) 

3177 if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest: 

3178 np = parent 

3179 else: 

3180 np = self._clone_outline(dest) 

3181 cast(TreeObject, parent.get_object()).insert_child(np, before, self) 

3182 self._insert_filtered_outline(dest._filtered_children, np, None) 

3183 

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3187 

def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search an outline item and report its position.

    Args:
        outline_item: Value compared against the ``indirect_reference``
            and against the ``/Title`` of each visited node.
        root: Node to start from; the outline root of the writer
            if ``None``.

    Returns:
        The list of sibling indexes leading to the item (levels without
        ``/Title`` do not contribute an index), or ``None`` if not found.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        matches = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if matches:
            return [index]
        if "/First" in node:
            nested = self.find_outline_item(
                outline_item, cast("OutlineType", node["/First"])
            )
            if nested:
                own_level = [index] if "/Title" in node else []
                return own_level + nested
        if "/Next" not in node:
            return None
        index += 1
        node = cast("TreeObject", node["/Next"])
    return None

3216 

def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    # Both arguments are ignored: only the deprecation helper is invoked.
    deprecation_with_replacement(
        "find_bookmark",
        "find_outline_item",
        "5.0.0",
    )

3227 

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is of an unsupported type.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        table_key = id(reader)
    elif isinstance(reader, IndirectObject):
        table_key = id(reader.pdf)
    else:
        # The message is an f-string so the offending value is reported.
        raise Exception(f"invalid parameter {reader}")
    # A reader without a table is not an error: nothing to forget.
    self._id_translated.pop(table_key, None)

3255 

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) and ``None`` pass validation unchanged.

    Raises:
        ValueError: No style and no prefix given, an index is out of
            range, or ``start`` is below one.

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")

    # Range checks, in the order in which they are reported.
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")

    start_given = start is not None and start != 0
    if start_given and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3306 

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is performed here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               With ``0`` (the default) or ``None`` no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is accepted by the signature and must be treated like the
    # "not given" value 0 instead of being handed to NumberObject.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    labels_key = NameObject(CatalogDictionary.PAGE_LABELS)
    if labels_key not in self._root_object:
        # First label ever set: create the number tree with a default
        # decimal label on page 0.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Put a default label right behind the range, unless a label already
    # starts there or the range ends on the last page.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3371 

3372 def _repr_mimebundle_( 

3373 self, 

3374 include: Union[None, Iterable[str]] = None, 

3375 exclude: Union[None, Iterable[str]] = None, 

3376 ) -> Dict[str, Any]: 

3377 """ 

3378 Integration into Jupyter Notebooks. 

3379 

3380 This method returns a dictionary that maps a mime-type to its 

3381 representation. 

3382 

3383 .. seealso:: 

3384 

3385 https://ipython.readthedocs.io/en/stable/config/integrating.html 

3386 """ 

3387 pdf_data = BytesIO() 

3388 self.write(pdf_data) 

3389 data = { 

3390 "application/pdf": pdf_data, 

3391 } 

3392 

3393 if include is not None: 

3394 # Filter representations based on include list 

3395 data = {k: v for k, v in data.items() if k in include} 

3396 

3397 if exclude is not None: 

3398 # Remove representations based on exclude list 

3399 data = {k: v for k, v in data.items() if k not in exclude} 

3400 

3401 return data 

3402 

3403 

def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into a PDF object.

    ``PdfObject`` instances are returned unchanged. Dictionaries and lists
    are converted recursively, strings starting with ``/`` become names,
    other strings become text strings and numbers become ``FloatObject``.

    Raises:
        NotImplementedError: The type of ``obj`` is not supported.

    """
    # Checked first, as PDF objects must never be converted again.
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, member in obj.items():
            converted[NameObject(name)] = _pdf_objectify(member)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(_pdf_objectify(member) for member in obj)
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3423 

3424 

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of an outline item.

    Args:
        action_ref: Stored as ``/A`` unless ``None``.
        title: Stored as ``/Title``.
        color: Stored as ``/C`` if truthy; a string is first converted
            with ``hex_to_rgb``.
        italic: Adds ``OutlineFontFlag.italic`` to ``/F``.
        bold: Adds ``OutlineFontFlag.bold`` to ``/F``.

    Returns:
        The new ``TreeObject``; ``/F`` is only present if at least one of
        ``italic`` and ``bold`` is set.

    """
    entry = TreeObject()
    if action_ref is not None:
        entry[NameObject("/A")] = action_ref
    entry.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        entry.update({NameObject("/C"): components})

    if italic or bold:
        format_flag = 0
        for enabled, flag in (
            (italic, OutlineFontFlag.italic),
            (bold, OutlineFontFlag.bold),
        ):
            if enabled:
                format_flag += flag
        entry.update({NameObject("/F"): NumberObject(format_flag)})
    return entry

3454 

3455 

def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing the text of a form field.

    Args:
        txt: The text to show; split into lines on ``\\n`` and ``\\r``.
        sel: Lines equal to one of these entries get a box drawn
            behind them.
        da: Operators emitted right after ``BT`` (and again after each box).
        font_full_rev: Maps a character to its encoded bytes; characters
            not found are encoded as UTF-16-BE.
        rct: Only ``width`` and ``height`` are read, for the clipping
            rectangle and the box width.
        font_height: Height used for the boxes; the line spacing
            is 1.4 times this value.
        y_offset: Vertical offset of the first line.

    Returns:
        The content stream. A line is written as hexadecimal string as
        soon as one of its characters is encoded on two bytes or more,
        else as literal string with ``\\``, ``(`` and ``)`` escaped.

    """
    parts: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            parts.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            parts.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            parts.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        if any(len(c) >= 2 for c in enc_line):
            parts.append(b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n")
        else:
            # Unescaped backslashes or parentheses would corrupt the literal
            # string; the backslash has to be handled first.
            literal = (
                b"".join(enc_line)
                .replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            parts.append(b"(" + literal + b") Tj\n")
    parts.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(parts)