Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1444 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import decimal 

31import enum 

32import hashlib 

33import re 

34import struct 

35import sys 

36import uuid 

37from collections.abc import Iterable, Mapping 

38from io import BytesIO, FileIO, IOBase 

39from itertools import compress 

40from pathlib import Path 

41from re import Pattern 

42from types import TracebackType 

43from typing import ( 

44 IO, 

45 Any, 

46 Callable, 

47 Optional, 

48 Union, 

49 cast, 

50) 

51 

52if sys.version_info >= (3, 11): 

53 from typing import Self 

54else: 

55 from typing_extensions import Self 

56 

57from ._doc_common import DocumentInformation, PdfDocCommon 

58from ._encryption import EncryptAlgorithm, Encryption 

59from ._page import PageObject, Transformation 

60from ._page_labels import nums_clear_range, nums_insert, nums_next 

61from ._reader import PdfReader 

62from ._utils import ( 

63 StrByteType, 

64 StreamType, 

65 _get_max_pdf_version_header, 

66 deprecate_with_replacement, 

67 deprecation_no_replacement, 

68 logger_warning, 

69) 

70from .constants import AnnotationDictionaryAttributes as AA 

71from .constants import CatalogAttributes as CA 

72from .constants import ( 

73 CatalogDictionary, 

74 GoToActionArguments, 

75 ImageType, 

76 InteractiveFormDictEntries, 

77 OutlineFontFlag, 

78 PageLabelStyle, 

79 PagesAttributes, 

80 TypFitArguments, 

81 UserAccessPermissions, 

82) 

83from .constants import Core as CO 

84from .constants import FieldDictionaryAttributes as FA 

85from .constants import PageAttributes as PG 

86from .constants import TrailerKeys as TK 

87from .errors import LimitReachedError, PdfReadError, PyPdfError 

88from .generic import ( 

89 PAGE_FIT, 

90 ArrayObject, 

91 BooleanObject, 

92 ByteStringObject, 

93 ContentStream, 

94 Destination, 

95 DictionaryObject, 

96 EmbeddedFile, 

97 Fit, 

98 FloatObject, 

99 IndirectObject, 

100 NameObject, 

101 NullObject, 

102 NumberObject, 

103 PdfObject, 

104 RectangleObject, 

105 ReferenceLink, 

106 StreamObject, 

107 TextStringObject, 

108 TreeObject, 

109 ViewerPreferences, 

110 create_string_object, 

111 extract_links, 

112 hex_to_rgb, 

113 is_null_or_none, 

114) 

115from .generic._appearance_stream import TextStreamAppearance 

116from .pagerange import PageRange, PageRangeSpec 

117from .types import ( 

118 AnnotationSubtype, 

119 BorderArrayType, 

120 LayoutType, 

121 OutlineItemType, 

122 OutlineType, 

123 PagemodeType, 

124) 

125from .xmp import XmpInformation 

126 

# Result of UserAccessPermissions.all(), evaluated once when the module is imported.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

128 

129 

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags identifying categories of PDF content.

    Members combine with ``|``. ``IMAGES`` is a ready-made combination of the
    three image-related flags. Judging by the class name, the flags select
    what to delete; the consumers are outside this excerpt.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Every image-related flag at once.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

141 

142 

143def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: 

144 hash = hashlib.md5(usedforsecurity=False) 

145 for block in iter(lambda: stream.read(blocksize), b""): 

146 hash.update(block) 

147 return hash.hexdigest() 

148 

149 

150class PdfWriter(PdfDocCommon): 

151 """ 

152 Write a PDF file out, given pages produced by another class or through 

153 cloning a PDF file during initialization. 

154 

155 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`. 

156 

157 Args: 

158 clone_from: identical to fileobj (for compatibility) 

159 

160 incremental: If true, loads the document and set the PdfWriter in incremental mode. 

161 

162 When writing incrementally, the original document is written first and new/modified 

163 content is appended. To be used for signed document/forms to keep signature valid. 

164 

165 full: If true, loads all the objects (always full if incremental = True). 

166 This parameter may allow loading large PDFs. 

167 

168 strict: If true, pypdf will raise an exception if a PDF does not follow the specification. 

169 If false, pypdf will try to be forgiving and do something reasonable, but it will log 

170 a warning message. It is a best-effort approach. 

171 

172 """ 

173 

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
    *,
    incremental_clone_object_count_limit: Optional[int] = 500_000,
    incremental_clone_object_id_limit: Optional[int] = 1_000_000,
) -> None:
    """
    Initialise the writer state and, when a source document is given, clone it.

    ``fileobj``, ``clone_from``, ``incremental``, ``full`` and ``strict`` are
    described in the class docstring.

    Args:
        incremental_clone_object_count_limit: Stored on the instance; a
            non-int value (such as ``None``) is replaced by ``sys.maxsize``.
        incremental_clone_object_id_limit: Stored on the instance; a
            non-int value (such as ``None``) is replaced by ``sys.maxsize``.

    Raises:
        PyPdfError: ``incremental`` or ``full`` is true and ``fileobj`` is
            not a path, a ``BytesIO`` or a ``PdfReader``.

    """
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    # ``full`` also takes the incremental loading path below; the flag is
    # switched back off near the end when only ``full`` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Annotation only: nothing is assigned here. The non-incremental branch
    # below assigns it; the incremental path presumably sets it while
    # cloning -- TODO confirm.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    defined by Info in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    # Security parameters.
    # A limit that is not an int (e.g. None) is replaced by sys.maxsize.
    self._incremental_clone_object_count_limit = (
        incremental_clone_object_count_limit
        if isinstance(incremental_clone_object_count_limit, int)
        else sys.maxsize
    )
    self._incremental_clone_object_id_limit = (
        incremental_clone_object_id_limit if isinstance(incremental_clone_object_id_limit, int) else sys.maxsize
    )

    if self.incremental:
        # Normalise the source step by step: a path is read fully into
        # memory, a BytesIO is wrapped in a PdfReader, anything that is
        # still not a PdfReader afterwards is rejected.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header plus an /Info dictionary that
        # names pypdf as the producer.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        """Pick the source to clone: ``clone_from``, ``fileobj`` or nothing."""
        # A path/stream ``fileobj`` that is the empty string, or that comes
        # with an explicit ``clone_from``, leaves ``clone_from`` unchanged.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        if isinstance(fileobj, (str, Path)):
            # A missing or zero-byte file is not cloned.
            fileobj_path = Path(fileobj)
            if not fileobj_path.exists() or fileobj_path.stat().st_size == 0:
                cloning = False
        elif isinstance(fileobj, (IOBase, BytesIO)):
            # Seeking to the end returns the stream length; zero means there
            # is nothing to clone. The previous position is restored.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        # Anything that is not already a reader is opened with PdfReader.
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # No source: register the empty page tree and a catalog pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # ``full`` alone only borrowed the incremental loading path above.
    if full and not incremental:
        self.incremental = False
    # Replace text-string entries of the two-element ID by byte strings
    # built from their original bytes.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

323 

324 # for commonality 

@property
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    This implementation always answers ``False``; the property exists for
    commonality with the reader-side API.
    """
    return False

334 

@property
def root_object(self) -> DictionaryObject:
    """
    The root dictionary of the document, giving direct access to the PDF structure.

    Note:
        Meant for reading; changing the structure through it is not recommended.

    """
    return self._root_object

345 

346 @property 

347 def _info(self) -> Optional[DictionaryObject]: 

348 """ 

349 Provide access to "/Info". Standardized with PdfReader. 

350 

351 Returns: 

352 /Info Dictionary; None if the entry does not exist 

353 

354 """ 

355 return ( 

356 None 

357 if self._info_obj is None 

358 else cast(DictionaryObject, self._info_obj.get_object()) 

359 ) 

360 

361 @_info.setter 

362 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None: 

363 if value is None: 

364 try: 

365 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore[union-attr] 

366 except (KeyError, AttributeError): 

367 pass 

368 self._info_obj = None 

369 else: 

370 if self._info_obj is None: 

371 self._info_obj = self._add_object(DictionaryObject()) 

372 obj = cast(DictionaryObject, self._info_obj.get_object()) 

373 obj.clear() 

374 obj.update(cast(DictionaryObject, value.get_object())) 

375 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    xmp = self.root_object.xmp_metadata
    return cast(XmpInformation, xmp)


@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """
    Set or remove the XMP (Extensible Metadata Platform) data.

    ``None`` deletes the "/Metadata" entry of the root object if present.
    Any other value is written into the metadata stream: the existing one
    when "/Metadata" is an indirect reference, otherwise a newly added
    stream that replaces whatever direct entry was there.
    """
    if value is None:
        if "/Metadata" in self.root_object:
            del self.root_object["/Metadata"]
        return

    existing = self.root_object.get("/Metadata", None)
    if isinstance(existing, IndirectObject):
        target_stream = cast(StreamObject, existing.get_object())
    else:
        # A direct (non-referenced) entry is dropped and replaced by a
        # stream registered with this writer.
        if existing is not None:
            del self.root_object["/Metadata"]
        target_stream = StreamObject()
        new_reference = self._add_object(target_stream)
        self.root_object[NameObject("/Metadata")] = new_reference

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    target_stream.set_data(payload)

404 

@property
def with_as_usage(self) -> bool:
    """Deprecated attribute: reading it only reports the deprecation."""
    # The body returns nothing itself; deprecation_no_replacement presumably
    # raises or warns (defined in ._utils) -- TODO confirm.
    deprecation_no_replacement("with_as_usage", "5.0")


@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated attribute: the assigned value is ignored."""
    deprecation_no_replacement("with_as_usage", "5.0")

412 

413 def __enter__(self) -> Self: 

414 """Store how writer is initialized by 'with'.""" 

415 c: bool = self._cloned 

416 t = self.temp_fileobj 

417 self.__init__() # type: ignore[misc] 

418 self._cloned = c 

419 self._with_as_usage = True 

420 self.fileobj = t # type: ignore[assignment] 

421 return self 

422 

423 def __exit__( 

424 self, 

425 exc_type: Optional[type[BaseException]], 

426 exc: Optional[BaseException], 

427 traceback: Optional[TracebackType], 

428 ) -> None: 

429 """Write data to the fileobj.""" 

430 if self.fileobj and not self._cloned: 

431 self.write(self.fileobj) 

432 

@property
def pdf_header(self) -> str:
    """
    Read/Write property holding the PDF header that is written.

    The value should look like ``'%PDF-1.5'``. Setting the lowest version
    that supports all features used within the PDF file is recommended.

    Note: reading `pdf_header` yields a string; writing accepts bytes or str
    """
    header_bytes = self._header
    return header_bytes.decode()


@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding a ``str`` value first."""
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

451 

452 def _add_object(self, obj: PdfObject) -> IndirectObject: 

453 if ( 

454 getattr(obj, "indirect_reference", None) is not None 

455 and obj.indirect_reference.pdf == self # type: ignore[union-attr] 

456 ): 

457 return obj.indirect_reference # type: ignore[return-value] 

458 # check for /Contents in Pages (/Contents in annotations are strings) 

459 if isinstance(obj, DictionaryObject) and isinstance( 

460 obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject) 

461 ): 

462 obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS]) 

463 self._objects.append(obj) 

464 obj.indirect_reference = IndirectObject(len(self._objects), 0, self) 

465 return obj.indirect_reference 

466 

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored under an object number or indirect reference.

    Args:
        indirect_reference: A 1-based object number, or a reference that
            belongs to this writer.

    Raises:
        ValueError: The reference belongs to another document.
        PdfReadError: The slot for that number holds ``None``.

    """
    if isinstance(indirect_reference, int):
        object_number = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        object_number = indirect_reference.idnum
    found = self._objects[object_number - 1]
    if found is None:
        raise PdfReadError(f"Object {indirect_reference!r} not found!")
    return found

480 

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot of an existing indirect object.

    The generation number of the object currently in the slot is kept. An
    ``obj`` that carries a reference into another document is cloned into
    this writer before being stored.

    Args:
        indirect_reference: 1-based object number, or a reference that
            belongs to this writer.
        obj: The replacement object.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: The reference belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    generation = self._objects[slot].indirect_reference.generation  # type: ignore[union-attr]
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf != self:  # type: ignore[union-attr]
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, generation, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

501 

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link the clone into the page tree.

    Args:
        page: The page to add; must be a ``PageObject`` whose type entry
            equals ``CO.PAGE``.
        index: Position in the flattened page list, passed to
            ``_get_page_in_node`` to find the tree node and the position
            inside it.
        excluded_keys: Keys left out when cloning; the parent key and
            "/StructParents" are always added to them.

    Returns:
        The cloned page that now belongs to this writer.

    Raises:
        ValueError: ``page`` is not a valid page object.
        PyPdfError: More than 1000 ancestors were visited while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    page_org = page
    excluded_keys = list(excluded_keys)
    excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore[union-attr]
            page_org.indirect_reference.idnum  # type: ignore[union-attr]
        ]
    except Exception:
        # Best effort: the page may have no reference or no translation entry.
        pass

    page = cast(
        "PageObject", page_org.clone(self, False, excluded_keys).get_object()
    )
    if page_org.pdf is not None:
        # Update the writer's header from its own and the source document's header.
        other = page_org.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

    node, idx = self._get_page_in_node(index)
    page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

    # A non-negative ``idx`` is a position inside the node's kids; a negative
    # one means "append".
    if idx >= 0:
        cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
        self.flattened_pages.append(page)
    # Increment the page count of the node and of every ancestor; the
    # counter bounds the walk in case the parent chain never ends.
    current: Optional[PdfObject] = node
    recurse = 0
    while not is_null_or_none(current):
        assert current is not None  # for mypy; guarded by is_null_or_none
        node_dict = cast(DictionaryObject, current.get_object())
        node_dict[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node_dict[PagesAttributes.COUNT]) + 1)
        current = node_dict.get(PagesAttributes.PARENT, None)
        recurse += 1
        if recurse > 1000:
            raise PyPdfError("Too many recursive calls!")

    if page_org.pdf is not None:
        # the page may contain links to other pages, and those other
        # pages may or may not already be added. we store the
        # information we need, so that we can resolve the references
        # later.
        self._unresolved_links.extend(extract_links(page, page_org))
        self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference

    return page

561 

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag in the document's AcroForm dictionary.

    The flag tells a PDF viewer whether it should generate the appearance
    of form fields itself or use the embedded appearance. A missing
    AcroForm dictionary is created first.

    Args:
        state: The value to store in the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        acro_form_key = CatalogDictionary.ACRO_FORM
        # make sure the AcroForm tree exists
        if acro_form_key not in self._root_object:
            self._root_object[NameObject(acro_form_key)] = self._add_object(DictionaryObject())

        acro_form = cast(DictionaryObject, self._root_object[acro_form_key])
        acro_form[NameObject(InteractiveFormDictEntries.NeedAppearances)] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        # Failures are reported as a warning instead of being raised.
        logger_warning(
            "set_need_appearances_writer(%(state)s) catch : %(exc)s",
            source=__name__,
            state=state,
            exc=exc,
        )

597 

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create an empty viewer preferences object and attach it to the root object.

    Returns:
        The newly created ``ViewerPreferences`` instance.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences

604 

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page to the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page usually comes from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Passed on unchanged to ``_add_page``.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)

629 

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page usually comes from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or beyond the number of
            pages appends the page.
        excluded_keys: Passed on unchanged when the page is added.

    Returns:
        The added PageObject.

    Raises:
        ValueError: A negative ``index`` reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    position = index + page_count if index < 0 else index
    if position < 0:
        raise ValueError("Invalid index value")
    if position >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, position, excluded_keys)

657 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page a reference points to.

    Args:
        indirect_reference: An object number, an indirect reference, or a
            null/None value. An int is turned into a reference into this
            writer with generation 0.

    Returns:
        The ``page_number`` of the referenced object when it is a
        ``PageObject``; otherwise None.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

681 

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    Without an explicit page size, the size of the last page is used.

    Args:
        width: Width of the new page in default user space units.
        height: Height of the new page in default user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))

706 

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    A dimension that is ``None`` or not positive is taken from the media box
    of the page currently at ``index`` (capped to the last page). That page
    is only looked up when a dimension is actually missing and the document
    has pages, so a page with explicit dimensions can be inserted into an
    empty document.

    Args:
        width: The width of the new page in default user space units.
        height: The height of the new page in default user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
        IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]
    """
    num_pages = self.get_num_pages()
    if abs(index) > num_pages:
        raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

    width_missing = width is None or width <= 0
    height_missing = height is None or height <= 0
    if width_missing or height_missing:
        if num_pages > 0:
            # Use the chosen index, but do not exceed the available pages
            fixed_index = min(index, num_pages - 1)
            mediabox = self.pages[fixed_index].mediabox
            if width_missing:
                width = mediabox.width
            if height_missing:
                height = mediabox.height
        else:
            # No page to copy a size from. Hand the missing dimension on as
            # None; create_blank_page presumably reports the undefined page
            # size (see Raises) -- TODO confirm against PageObject.
            if width_missing:
                width = None
            if height_missing:
                height = None

    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page

746 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the value of the parent class's ``open_destination`` property."""
    return super().open_destination


@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    ``None`` removes the entry, a string is stored as a text string, a
    ``Destination`` contributes its destination array, and a page is
    wrapped in a destination named "Opening" that uses ``PAGE_FIT``.
    Values of any other type leave the entry untouched.
    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            # Nothing to remove.
            pass
        return
    if isinstance(dest, str):
        self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[NameObject("/OpenAction")] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        # A page without a reference is represented by a null object.
        page_target = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        opening = Destination("Opening", page_target, PAGE_FIT)
        self._root_object[NameObject("/OpenAction")] = opening.dest_array

772 

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is stored as a JavaScript action and listed, under a
    randomly generated name, in the "/JavaScript" names of the root object.

    Args:
        javascript: Your JavaScript.

    Example:
        This will launch the print window when the PDF is opened.

        >>> from pypdf import PdfWriter
        >>> output = PdfWriter()
        >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    name_tree = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in name_tree:
        name_tree[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    javascript_names = cast(DictionaryObject, name_tree["/JavaScript"])
    scripts = cast(ArrayObject, javascript_names["/Names"])
    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    scripts.append(create_string_object(str(uuid.uuid4())))

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    scripts.append(self._add_object(action))

811 

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    embedded_file = EmbeddedFile._create_new(self, filename, data)
    return embedded_file

829 

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to the end of this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been added. Its
            single argument is the page as it now exists in the writer.
            A value that is not callable is ignored.

    """
    source_pages = reader.pages
    notify = callable(after_page_append)
    for position in range(len(source_pages)):
        appended_page = self.add_page(source_pages[position])
        # Hand the writer-side page (not the reader's page) to the callback
        if notify:
            after_page_append(appended_page)

861 

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (given as bytes) to the content of a page.

    A page without "/Contents" receives a new stream. An array of streams
    gets one more stream appended. A single stream is replaced by a new
    stream holding the old data, a newline and the new data.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    contents_key = NameObject("/Contents")
    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        fresh_stream = StreamObject()
        fresh_stream.set_data(new_content_data)
        page[contents_key] = self._add_object(fresh_stream)
        return

    # First resolve the existing page content. This always is an IndirectObject:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current_content = page[contents_key].get_object()

    if isinstance(current_content, ArrayObject):
        # Create a new StreamObject for the new_content_data
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        current_content.append(self._add_object(extra_stream))
        page[contents_key] = self._add_object(current_content)
    if isinstance(current_content, StreamObject):
        # Merge new content to existing StreamObject
        combined = current_content.get_data() + b"\n" + new_content_data
        combined_stream = StreamObject()
        combined_stream.set_data(combined)
        page[contents_key] = self._add_object(combined_stream)

900 

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
) -> None:
    """
    Draw an appearance stream on a page by registering it as an XObject.

    The stream is added to this writer, published in the page resources
    under ``/Fm_<object_name>`` (sanitized), and a ``q ... Do Q`` sequence
    translated by the given offsets is merged into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])
    # Registering the stream with this writer yields a reference that is
    # managed by *this* writer.
    stream_ref = self._add_object(appearance_stream_obj)
    resource_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    xobject_table = cast(DictionaryObject, resources["/XObject"])

    if resource_name in xobject_table:
        # The existing entry is kept; the freshly added stream is not linked.
        logger_warning(
            "XObject %(xobject_name)r already added to page resources. This might be an issue.",
            source=__name__,
            xobject_name=resource_name,
        )
    else:
        xobject_table[resource_name] = stream_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{resource_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

939 

# Field-flags value with no bits set; used as the default for the ``flags``
# parameter of ``update_page_form_field_values``.
FFBITS_NUL = FA.FfBits(0)

941 

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `list[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the writer's catalog has no /AcroForm dictionary, or
            that dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate was already applied above, hence None here.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", source=__name__)
        return

    for annotation in page[PG.ANNOTS]:  # type: ignore[attr-defined]
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is its own field; otherwise the
        # field data is looked up on the /Parent entry.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            # Start from a clean slate for every matched field. Without this
            # reset, a field whose type is handled by none of the branches
            # below would reuse the stream produced for a previous field and
            # flatten it at the wrong position under the wrong name.
            appearance_stream_obj: Optional[StreamObject] = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in (ap := cast(DictionaryObject, annotation[AA.AP])):
                    ap[NameObject("/N")] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Reuse the object number of the existing normal
                    # appearance so references to it stay valid.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore[index]
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", source=__name__)

            if appearance_stream_obj and flatten:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])

1083 

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Find widget annotations that are missing from the AcroForm ``/Fields``
    array and register them there.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created when
    absent. Widgets stored directly (not by reference) in ``/Annots`` are
    first added to the writer and replaced by their reference.

    Args:
        page: page to analyze.
              If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached.extend(self.reattach_fields(single_page))
        return reattached

    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        known_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = known_fields

    if "/Annots" not in page:
        return reattached

    page_annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(page_annotations):
        stored_by_reference = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        is_field_widget = widget.get("/Subtype", "") == "/Widget" and "/FT" in widget
        if not is_field_widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_listed:
            continue
        if not stored_by_reference:
            page_annotations[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1133 

1134 def _collect_incremental_clone_object_ids(self, reader: PdfReader) -> list[int]: 

1135 object_ids: set[int] = set() 

1136 for xref_entry in reader.xref.values(): 

1137 object_ids.update(filter(None, xref_entry)) 

1138 object_ids.update(filter(None, reader.xref_objStm)) 

1139 

1140 object_count = len(object_ids) 

1141 if object_count > self._incremental_clone_object_count_limit: 

1142 raise LimitReachedError( 

1143 f"Incremental clone object count {object_count} exceeds " 

1144 f"maximum allowed count {self._incremental_clone_object_count_limit}." 

1145 ) 

1146 

1147 max_object_id = max(object_ids, default=0) 

1148 if max_object_id > self._incremental_clone_object_id_limit: 

1149 raise LimitReachedError( 

1150 f"Incremental clone object ID {max_object_id} exceeds " 

1151 f"maximum allowed ID {self._incremental_clone_object_id_limit}." 

1152 ) 

1153 

1154 return sorted(object_ids) 

1155 

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    In incremental mode every object listed in the reader's cross-reference
    data is replicated at its original object number, and a hash of each
    object is recorded so that later modifications can be detected. In
    non-incremental mode the object list is emptied, the catalog is cloned,
    and the page tree is flattened into a single ``/Kids`` array.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: If the writer is strict and more objects were created
            than the reader's trailer ``/Size`` declares, or if flattening
            the page tree fails with an index error.

    """
    self._info_obj = None
    if self.incremental:
        object_ids = self._collect_incremental_clone_object_ids(reader)
        # Object numbers are 1-based; size the list to hold the highest one.
        self._objects = [None] * (object_ids[-1] if object_ids else 0)
        for object_id in object_ids:
            reader_object = reader.get_object(object_id)
            if reader_object is not None:
                self._objects[object_id - 1] = reader_object.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    trailer_size = cast(int, reader.trailer["/Size"])
    if len(self._objects) > trailer_size:
        if self.strict:
            raise PdfReadError(
                f"Object count {len(self._objects)} exceeds defined trailer size {trailer_size}"
            )
        logger_warning(
            "Object count %(object_count)d exceeds defined trailer size %(trailer_size)d",
            source=__name__,
            object_count=len(self._objects),
            trailer_size=trailer_size,
        )

    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]

    try:
        self._flatten()
    except IndexError as exc:
        # Keep the original traceback attached for easier diagnosis.
        raise PdfReadError("Got index error while flattening.") from exc

    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # All pages become direct children of the root pages node.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])

1212 

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: the '/Root', '/Info' and '/ID'
    sections of the pdf are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function that is invoked once for every page found
            in the cloned ``/Kids`` array. The single parameter of the
            callback is a reference to the page just appended to the
            document.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned info dictionary as its
            # "original" state.
            info_hash = self._info.hash_bin()
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = info_hash
        else:
            info_copy = DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            self._info_obj = self._add_object(info_copy)
    # When the reader has no info, _info_obj stays None as set by
    # clone_reader_document_root().

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if not callable(after_page_append):
        return
    pages_node = cast(DictionaryObject, self._pages.get_object())
    for kid in cast(ArrayObject, pages_node["/Kids"]):
        after_page_append(kid.get_object())

1260 

def _compute_document_identifier(self) -> ByteStringObject:
    """
    Derive an identifier from the current document body.

    The PDF structure is written to an in-memory buffer and the rolling
    checksum of that buffer, UTF-8 encoded, is returned.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1266 

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".

    An existing first identifier is kept and only the second one is
    recomputed; without an existing ``/ID`` both entries get the same
    newly computed value.
    """
    current = self._ID
    if current:
        identifiers = (current[0], self._compute_document_identifier())
    else:
        computed = self._compute_document_identifier()
        identifiers = (computed, computed)
    self._ID = ArrayObject(identifiers)

1286 

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        NotImplementedError: If the writer works in incremental mode.
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    if self.incremental:
        raise NotImplementedError("Encrypting incremental PDF files is currently not supported.")

    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        try:
            # "AES-256-R5" -> attribute name "AES_256_R5"
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError as exc:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported") from exc
    else:
        alg = EncryptAlgorithm.RC4_128
        if not use_128bit:
            alg = EncryptAlgorithm.RC4_40
    # The first file identifier takes part in the key derivation.
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    # in case call `encrypt` again
    entry = self._encryption.write_entry(user_password, owner_password, strict=self.strict)
    if self._encrypt_entry:
        # replace old encrypt_entry
        assert self._encrypt_entry.indirect_reference is not None
        entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry

1348 

1349 def _resolve_links(self) -> None: 

1350 """Patch up links that were added to the document earlier, to 

1351 make sure they still point to the same pages. 

1352 """ 

1353 for (new_link, old_link) in self._unresolved_links: 

1354 old_page = old_link.find_referenced_page() 

1355 if not old_page: 

1356 continue 

1357 new_page = self._merged_in_pages.get(old_page) 

1358 if new_page is None: 

1359 continue 

1360 new_link.patch_reference(self, new_page) 

1361 

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only if some object is new or modified. Otherwise
    the full structure, cross-reference table and trailer are written.

    Args:
        stream: Writable binary stream. A warning is logged when the stream
            exposes a ``mode`` attribute that lacks ``"b"``.
    """
    looks_like_text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if looks_like_text_mode:
        logger_warning(
            "File <%(stream_name)s> to write to is not in binary mode. "
            "It may not be written to correctly.",
            source=__name__,
            stream_name=stream.name,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1383 

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO). The boolean is True when this method opened
        the file itself from a path (the returned stream is then already
        closed), and False when the caller supplied the stream (it is
        flushed but left open).

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must not leak, even when writing fails.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream

1415 

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Report which objects would be written in the increment: those added
    after the original hashes were recorded, and those whose current hash
    differs from the recorded one.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    recorded_hashes = self._original_hash
    recorded_count = len(recorded_hashes)
    changed = []
    for position, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        if position < recorded_count and candidate.hash_bin() == recorded_hashes[position]:
            # Present in the original document and unchanged.
            continue
        changed.append(cast(IndirectObject, candidate).indirect_reference)
    return changed

1439 

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an incremental update to ``stream``.

    Every object that is new or whose hash differs from the recorded
    original hash is written, followed by a cross-reference stream object
    covering exactly those objects, ``startxref`` and ``%%EOF``.

    Args:
        stream: Stream to append to; its current position (``tell()``) is
            used as the byte offset of each written object.
    """
    # object number -> byte offset in the stream
    object_positions = {}
    # [first object number, count] pairs for runs of consecutive object
    # numbers; flattened into the /Index array below
    object_blocks = []
    # Sentinels: no run has been started yet.
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            if idnum != current_stop:
                # Not contiguous with the current run: close that run (if
                # any) and start a new one at this object number.
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            # current_stop is exclusive: one past the last number in the run
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    # close the last run
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # the cross-reference stream takes the next object number after the
    # current object list
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        # field widths in bytes; must match the ">BIB" packing below
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is only emitted when the info object is new or has changed.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore[union-attr]
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore[union-attr]
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    # link back to the cross-reference section of the original document
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One 6-byte entry per written object: type 1, 4-byte big-endian
    # offset, generation 0. The "I" format limits offsets to 32 bits.
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1510 

1511 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]: 

1512 object_positions = [] 

1513 free_objects = [] 

1514 stream.write(self.pdf_header.encode() + b"\n") 

1515 stream.write(b"%\xE2\xE3\xCF\xD3\n") 

1516 

1517 for idnum, obj in enumerate(self._objects, start=1): 

1518 if obj is not None: 

1519 object_positions.append(stream.tell()) 

1520 stream.write(f"{idnum} 0 obj\n".encode()) 

1521 if self._encryption and obj != self._encrypt_entry: 

1522 obj = self._encryption.encrypt_object(obj, idnum, 0) 

1523 obj.write_to_stream(stream) 

1524 stream.write(b"\nendobj\n") 

1525 else: 

1526 object_positions.append(-1) 

1527 free_objects.append(idnum) 

1528 free_objects.append(0) # add 0 to loop in accordance with specification 

1529 return object_positions, free_objects 

1530 

1531 def _write_xref_table( 

1532 self, stream: StreamType, object_positions: list[int], free_objects: list[int] 

1533 ) -> int: 

1534 xref_location = stream.tell() 

1535 stream.write(b"xref\n") 

1536 stream.write(f"0 {len(self._objects) + 1}\n".encode()) 

1537 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) 

1538 free_idx = 1 

1539 for offset in object_positions: 

1540 if offset > 0: 

1541 stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) 

1542 else: 

1543 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) 

1544 free_idx += 1 

1545 return xref_location 

1546 

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Emit the trailer dictionary followed by ``startxref`` and ``%%EOF``.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    ``/Size`` and ``/Root`` are always written; ``/Info``, ``/ID`` and
    ``/Encrypt`` are added only when the writer has them.

    Args:
        stream: Stream to write to.
        xref_location: Byte offset of the cross-reference table.
    """
    stream.write(b"trailer\n")
    entries = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    info = self._info
    if info is not None:
        entries[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        entries[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        entries[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    entries.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1570 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # None removes the information dictionary altogether.
    if value is None:
        self._info = None
        return

    # Otherwise the existing entries are discarded before the new ones
    # are added.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1598 

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Merge custom entries into the document information dictionary.

    Every value is resolved (when it is a PDF object), converted with
    ``str`` and stored as a PDF string object under its key. The
    information dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for name, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        text = create_string_object(str(resolved))
        converted[NameObject(name)] = text

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1618 

# Sentinel used as the default of the deprecated ``remove_identicals`` and
# ``remove_orphans`` parameters of ``compress_identical_objects`` so the
# method can tell whether the caller passed them.
_UNSET = object()

1620 

def compress_identical_objects(
    self,
    remove_identicals: Any = _UNSET,
    remove_orphans: Any = _UNSET,
    *,
    remove_duplicates: bool = True,
    remove_unreferenced: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Deprecated.
        remove_orphans: Deprecated.
        remove_duplicates: Remove duplicate objects.
        remove_unreferenced: Remove unreferenced objects.

    """
    # The sentinel is compared by identity so that arbitrary argument
    # values cannot interfere through a custom ``__eq__``.
    if remove_identicals is not self._UNSET:
        deprecate_with_replacement("remove_identicals", "remove_duplicates", "7.0.0")
        assert isinstance(remove_identicals, bool)
        remove_duplicates = remove_identicals
    if remove_orphans is not self._UNSET:
        deprecate_with_replacement("remove_orphans", "remove_unreferenced", "7.0.0")
        assert isinstance(remove_orphans, bool)
        remove_unreferenced = remove_orphans

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        """Redirect references to merged objects and record what is referenced."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore[assignment]
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                unreferenced[v.idnum - 1] = False
                if v in crossref:
                    kept = crossref[v]
                    obj[k] = kept
                    # The kept object is what is referenced from now on.
                    # Without marking it, an object reachable only through
                    # its removed duplicates would be purged below.
                    unreferenced[kept.idnum - 1] = False
            else:
                # Direct value: the filtering on DictionaryObject and
                # ArrayObject is performed by the recursive call itself.
                replace_in_obj(v, crossref)

    # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
    self._idnum_hash = {}
    unreferenced = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_duplicates and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if remove_unreferenced:
        # Objects referenced only from the trailer are never seen by the
        # traversal above and must be protected explicitly.
        unreferenced[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

        if not is_null_or_none(self._info):
            unreferenced[self._info.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

        try:
            unreferenced[self._ID.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]
        except AttributeError:
            pass

        # The encryption dictionary is also referenced only from the trailer.
        encrypt_ref = getattr(
            getattr(self, "_encrypt_entry", None), "indirect_reference", None
        )
        if encrypt_ref is not None:
            unreferenced[encrypt_ref.idnum - 1] = False

        for i in compress(range(len(self._objects)), unreferenced):
            self._objects[i] = None

1710 

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Build the indirect reference of an object stored in this writer.

    Args:
        obj: An object present in the writer's object list; ``list.index``
            raises ``ValueError`` when it is not.

    Returns:
        A generation-0 reference whose number is the object's 1-based
        position in the object list.
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1716 

def get_outline_root(self) -> TreeObject:
    """
    Return the root of the outline tree, creating it when missing.

    When the catalog has no outlines entry, an empty tree is registered
    with the writer and linked from the catalog. An existing entry that is
    not yet a ``TreeObject`` is converted and replaces the stored object.

    Returns:
        The outline root as a ``TreeObject``.
    """
    if CO.OUTLINES not in self._root_object:
        fresh_root = TreeObject()
        fresh_root.update({})
        fresh_ref = self._add_object(fresh_root)
        self._root_object[NameObject(CO.OUTLINES)] = fresh_ref
        return fresh_root

    # Entries in the catalog dictionary
    root = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(root, TreeObject):
        converted = TreeObject(root)
        self._replace_object(root.indirect_reference.idnum, converted)
        root = converted
    # Sanity check: the root must be retrievable through its own number.
    number = self._objects.index(root) + 1
    root_ref = IndirectObject(number, 0, self)
    assert root_ref.get_object() == root
    return root

1735 

def get_threads_root(self) -> ArrayObject:
    """
    Return the catalog's list of threads, creating an empty one if absent.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        # Store a fresh array in the catalog so later calls return the same one.
        new_threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = new_threads
        return new_threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1754 

@property
def threads(self) -> ArrayObject:
    """
    Read-only access to the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary holding an ``/F`` key and, optionally,
    thread information under the ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list

1766 

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item (or a page, wrapped in a destination) into the outline.

    Args:
        page_destination: The outline item to insert. A ``PageObject`` is
            converted to a ``Destination`` titled ``page #<n>`` first.
        parent: Outline node to insert under; the outline root when ``None``.
        before: Sibling to insert in front of; appended when ``None``.
        is_open: Whether the item is shown expanded. Stored on the item under
            ``/%is_open%`` and used to select the parent counter callback.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward the placement arguments; they were previously dropped, so a
        # page was always added to the outline root, expanded, at the end.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        # A closed item passes a callback that always returns 0.
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1804 

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a plain mapping.

    The mapping is copied into a new ``TreeObject`` which is then inserted
    through :meth:`add_outline_item_destination`.

    Args:
        outline_item: Key/value pairs of the outline item.
        parent: Outline node to insert under.
        before: Sibling to insert in front of.
        is_open: Whether the item is shown expanded.

    Returns:
        The indirect reference of the inserted outline item.

    """
    item_tree = TreeObject()
    item_tree.update(outline_item)
    # NOTE: copying an "/A" action into its own indirect object is currently
    # disabled here (the code was marked as unreachable).
    inserted_ref = self.add_outline_item_destination(
        item_tree, parent, before, is_open
    )
    return inserted_ref

1827 

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            ``None`` creates an item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Sibling outline item in front of which the new item is
            inserted; the item is appended when ``None``.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Whether the outline item is shown expanded.

    Returns:
        The added outline item as an indirect object.

    """
    # Start at None so an unsupported page_number type reaches the warning
    # below instead of raising UnboundLocalError.
    page_ref: Union[None, NullObject, IndirectObject, NumberObject] = None
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional layout (no ``before``): shift every argument one
        # slot to the right and retry.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown page index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                "can not find reference of page %(page_number)s",
                source=__name__,
                page_number=page_number,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1906 

def add_outline(self) -> None:
    """
    Placeholder that always fails; adding a whole outline is not supported.

    Raises:
        NotImplementedError: Always.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1911 

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``(title, destination)`` pair into the named destinations list.

    The list returned by ``get_named_dest_root()`` is a flat sequence of
    alternating names and destinations. The pair is inserted in front of the
    first name that compares greater than ``title``, or appended at the end.

    Args:
        title: Name of the destination.
        destination: The destination (or a reference to it) stored under the name.

    """
    names = self.get_named_dest_root()
    # Names sit at the even positions; destinations follow at the odd ones.
    for name_index in range(0, len(names), 2):
        if title < names[name_index]:
            # Insert the value first so the name ends up in front of it.
            names.insert(name_index, destination)
            names.insert(name_index, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1925 

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    The destination's ``dest_array`` is added to the writer and recorded in
    the named destinations list under the destination's ``/Title``.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title`` entry.

    Returns:
        The indirect reference of the stored destination array.

    """
    array_ref = self._add_object(page_destination.dest_array)  # type: ignore[attr-defined]
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore[index]
    self.add_named_destination_array(dest_title, array_ref)
    return array_ref

1936 

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination pointing at a page.

    A GoTo dictionary targeting the page with a horizontal fit at the fixed
    top coordinate 826 is added to the writer and recorded under ``title``.

    Args:
        title: Name of the destination; converted to ``TextStringObject`` if needed.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The indirect reference of the stored destination dictionary.

    """
    kids = self.get_object(self._pages)[PagesAttributes.KIDS]  # type: ignore[index]
    target_page = kids[page_number]
    target_array = ArrayObject(
        [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    goto = DictionaryObject()
    goto.update(
        {
            NameObject(GoToActionArguments.D): target_array,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    goto_ref = self._add_object(goto)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, goto_ref)
    return goto_ref

1959 

def remove_links(self) -> None:
    """Strip links and every other annotation from all pages of this output."""
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, everything)

1964 

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Delete annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            Pass ``subtypes=None`` to remove all annotations.

    """
    strip = self._remove_annots_from_page
    for current_page in self.pages:
        strip(current_page, subtypes)

1980 

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of one page that match ``subtypes``.

    Args:
        page: The page (or a reference to it) to clean.
        subtypes: Annotation subtypes to delete; ``None`` deletes all of them.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    index = 0
    while index < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[index]
        annotation = cast(DictionaryObject, entry.get_object())
        if subtypes is not None and cast(str, annotation["/Subtype"]) not in subtypes:
            # Not selected: keep it and move to the next entry.
            index += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        # The following entries shift down, so the index stays where it is.
        del page[PG.ANNOTS][index]  # type:ignore

1998 

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Pass the filters on so a list of flags honours them like a single flag.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-type flags are handled on the /Annots array and return
    # immediately; only the first matching flag is processed.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped for the selected flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = [
            b"w", b"J", b"j", b"M", b"d", b"i",
            b"W", b"W*",
            b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
            b"m", b"l", b"c", b"v", b"y", b"h", b"re",
            b"sh"
        ]
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        # Clean the XObjects (images / nested forms) first, then the page's
        # own content stream.
        images, forms = self._remove_objects_from_page__clean_forms(
            elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters,
        )

        self._remove_objects_from_page__clean(
            content=content, images=images, forms=forms,
            jump_operators=jump_operators, to_delete=to_delete,
            text_filters=text_filters
        )
        page.replace_contents(content)
    # NOTE(review): the declared return type is None but this path returns a
    # pair of empty lists; kept unchanged for compatibility.
    return [], []  # type: ignore[return-value]

2063 

2064 def _remove_objects_from_page__clean( 

2065 self, 

2066 content: ContentStream, 

2067 images: list[str], 

2068 forms: list[str], 

2069 jump_operators: list[bytes], 

2070 to_delete: ObjectDeletionFlag, 

2071 text_filters: Optional[dict[str, Any]] = None, 

2072 ) -> None: 

2073 font_id = None 

2074 font_ids_to_delete = [] 

2075 if text_filters and to_delete & ObjectDeletionFlag.TEXT: 

2076 font_ids_to_delete = text_filters.get("font_ids", []) 

2077 

2078 i = 0 

2079 while i < len(content.operations): 

2080 operands, operator = content.operations[i] 

2081 if operator == b"Tf": 

2082 font_id = operands[0] 

2083 if ( 

2084 ( 

2085 operator == b"INLINE IMAGE" 

2086 and (to_delete & ObjectDeletionFlag.INLINE_IMAGES) 

2087 ) 

2088 or (operator in jump_operators) 

2089 or ( 

2090 operator == b"Do" 

2091 and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES) 

2092 and (operands[0] in images) 

2093 ) 

2094 ): 

2095 if ( 

2096 not to_delete & ObjectDeletionFlag.TEXT 

2097 or (to_delete & ObjectDeletionFlag.TEXT and not text_filters) 

2098 or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete) 

2099 ): 

2100 del content.operations[i] 

2101 else: 

2102 i += 1 

2103 else: 

2104 i += 1 

2105 content.get_data() # this ensures ._data is rebuilt from the .operations 

2106 

def _remove_objects_from_page__clean_forms(
    self,
    elt: DictionaryObject,
    stack: list[DictionaryObject],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> tuple[list[str], list[str]]:
    """
    Clean the XObjects referenced from ``elt`` and, for streams, ``elt`` itself.

    Walks ``elt["/Resources"]["/XObject"]``: image XObjects are replaced by a
    ``NullObject`` when XObject images are being deleted, and form XObjects
    are converted to ``ContentStream`` and cleaned recursively. When ``elt``
    is a stream, its own operations are cleaned at the end.

    Args:
        elt: Page or form XObject whose resources are inspected.
        stack: Elements already being processed, used to stop recursion.
        jump_operators: Operators to drop, passed on to the cleaning helper.
        to_delete: Flags describing what is being deleted.
        text_filters: Optional text restrictions, passed on unchanged.

    Returns:
        A pair ``(images, forms)`` with the XObject names classified as images
        to remove and as forms.

    """
    # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
    if (elt in stack) or (
        hasattr(elt, "indirect_reference") and any(
            elt.indirect_reference == getattr(x, "indirect_reference", -1)
            for x in stack
        )
    ):
        # to prevent infinite looping
        return [], []  # pragma: no cover
    try:
        d = cast(
            dict[Any, Any],
            cast(DictionaryObject, elt["/Resources"])["/XObject"],
        )
    except KeyError:
        # No resources or no XObjects: nothing to scan.
        d = {}
    images = []
    forms = []
    for k, v in d.items():
        o = v.get_object()
        try:
            # Replacement for the XObject; None means "leave it untouched".
            content: Any = None
            if (
                to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                and o["/Subtype"] == "/Image"
            ):
                content = NullObject()  # to delete the image keeping the entry
                images.append(k)
            if o["/Subtype"] == "/Form":
                forms.append(k)
                if isinstance(o, ContentStream):
                    content = o
                else:
                    # Build a ContentStream from the form and copy its entries,
                    # except the three encoding-related keys filtered out below.
                    content = ContentStream(o, self)
                    content.update(
                        {
                            k1: v1
                            for k1, v1 in o.items()
                            if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                        }
                    )
                    try:
                        content.indirect_reference = o.indirect_reference
                    except AttributeError:  # pragma: no cover
                        pass
                # Record the current element so the recursion guard above can see it.
                stack.append(elt)

                # clean subforms
                self._remove_objects_from_page__clean_forms(
                    elt=content, stack=stack, jump_operators=jump_operators, to_delete=to_delete,
                    text_filters=text_filters,
                )
            if content is not None:
                if isinstance(v, IndirectObject):
                    # Overwrite the stored object so existing references see the new content.
                    self._objects[v.idnum - 1] = content
                else:
                    # should only occur in a PDF not respecting PDF spec
                    # where streams must be indirected.
                    d[k] = self._add_object(content)  # pragma: no cover
        except (TypeError, KeyError):
            # Entries without a usable "/Subtype" are skipped.
            pass
    for im in images:
        del d[im]  # for clean-up
    if isinstance(elt, StreamObject):  # for /Form
        if not isinstance(elt, ContentStream):  # pragma: no cover
            e = ContentStream(elt, self)
            e.update(elt.items())
            elt = e
        # clean the content
        self._remove_objects_from_page__clean(
            content=elt, images=images, forms=forms, jump_operators=jump_operators,
            to_delete=to_delete, text_filters=text_filters
        )
    return images, forms

2189 

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Delete images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types)

    """
    # A boolean argument is treated as "all images".
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate each selected ImageType member to the same-named deletion flag.
    deletion_flags = ObjectDeletionFlag.NONE
    for member_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member_name]:
            deletion_flags |= ObjectDeletionFlag[member_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)

2213 

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Delete text from every page of the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    if not font_names:
        font_names = []

    # Recursively walk an object tree and gather, per normalized font name,
    # the resource keys under which that font is registered.
    def collect_fonts(
        obj: Any,
        font_info: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        if font_info is None:
            font_info = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = font_info.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                font_info = collect_fonts(obj[child_key], font_info, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                font_info = collect_fonts(child, font_info)
        return font_info

    for page in self.pages:
        text_filters = {}
        if font_names:
            # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
            # Font names need to be converted to resource names/IDs for easier removal
            page_fonts = collect_fonts(page.get("/Resources"))
            matching_ids = []
            for requested in font_names:
                if requested in page_fonts:
                    matching_ids.extend(page_fonts[requested]["resource_ids"])
            text_filters["font_ids"] = matching_ids
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2270 

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. If omitted, the
            border array ``[2 2 2]`` is written.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore[index]
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into its four coordinates. Wrapping the
        # whole string in a NumberObject, as before, did not give a rectangle.
        coordinates = rect.strip().strip("[]").split()
        rect = RectangleObject(tuple(float(value) for value in coordinates))
    elif isinstance(rect, RectangleObject):
        pass
    else:
        rect = RectangleObject(rect)

    # URI action triggered by the link.
    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    # The link annotation itself.
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2337 

# Page layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)

2347 

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the catalog's ``/PageLayout`` entry, or ``None`` when it is not set."""
    try:
        layout = cast(LayoutType, self._root_object["/PageLayout"])
    except KeyError:
        layout = None
    return layout

2353 

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is still
    written to the catalog.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            logger_warning(
                "Layout should be one of: %(layouts)s",
                source=__name__,
                # A readable comma-separated list, as the page mode setter uses
                # (this used to be a set holding one concatenated string).
                layouts=", ".join(self._valid_layouts),
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2389 

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Choose the page layout used when the document is opened.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)

2417 

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    The page layout of the document, or ``None`` when none is set.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same helper as set_page_layout().
    apply_layout = self._set_page_layout
    apply_layout(layout)

2446 

# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)

2455 

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the catalog's ``/PageMode`` entry, or ``None`` when it is not set."""
    try:
        mode = cast(PagemodeType, self._root_object["/PageMode"])
    except KeyError:
        mode = None
    return mode

2461 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The page mode of the document, or ``None`` when none is set.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # A NameObject is stored as is; anything else is checked against the known
    # modes (warning only) and then converted.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                "Mode should be one of: %(modes)s",
                source=__name__,
                modes=", ".join(self._valid_modes),
            )
        mode_name: NameObject = NameObject(mode)
    else:
        mode_name = mode
    self._root_object.update({NameObject("/PageMode"): mode_name})

2498 

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Attach one new annotation to a page.

    The annotation must be newly created; it cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    # Make sure the page has an annotation array to append to.
    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = to_add.get("/Subtype") == "/Link" and "/Dest" in to_add
    if is_internal_link:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        # NOTE: "fit_args" is read through a plain dict copy, as the original
        # code does (its author noted not knowing why this is needed).
        link_fit = Fit(fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"])
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    # A popup is also recorded on its parent annotation.
    if to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add:
        parent_annotation = cast(DictionaryObject, to_add["/Parent"].get_object())
        parent_annotation[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2552 

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Normalize the annotations of a page.

    Currently: destinations given as ``NameObject`` (either in ``/Dest`` or
    in the ``/D`` entry of the ``/A`` action) are converted to
    ``TextStringObject``, as required for the names/dests list.

    Args:
        page: The page, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annotation_ref in page.get("/Annots", []):
        annotation = annotation_ref.get_object()
        direct_dest = annotation.get("/Dest", None)
        action = annotation.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annotation[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        # No named /Dest: look at the destination held by the action.
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2579 

2580 def _create_stream( 

2581 self, fileobj: Union[Path, StrByteType, PdfReader] 

2582 ) -> tuple[IOBase, Optional[Encryption]]: 

2583 # If the fileobj parameter is a string, assume it is a path 

2584 # and create a file object at that location. If it is a file, 

2585 # copy the file's contents into a BytesIO stream object; if 

2586 # it is a PdfReader, copy that reader's stream into a 

2587 # BytesIO stream. 

2588 # If fileobj is none of the above types, it is not modified 

2589 encryption_obj = None 

2590 stream: IOBase 

2591 if isinstance(fileobj, (str, Path)): 

2592 with FileIO(fileobj, "rb") as f: 

2593 stream = BytesIO(f.read()) 

2594 elif isinstance(fileobj, PdfReader): 

2595 if fileobj._encryption: 

2596 encryption_obj = fileobj._encryption 

2597 orig_tell = fileobj.stream.tell() 

2598 fileobj.stream.seek(0) 

2599 stream = BytesIO(fileobj.stream.read()) 

2600 

2601 # reset the stream to its original location 

2602 fileobj.stream.seek(orig_tell) 

2603 elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): 

2604 fileobj.seek(0) 

2605 filecontent = fileobj.read() 

2606 stream = BytesIO(filecontent) 

2607 else: 

2608 raise NotImplementedError( 

2609 "Merging requires an object that PdfReader can parse. " 

2610 "Typically, that is a Path or a string representing a Path, " 

2611 "a file object, or an object implementing .seek and .read. " 

2612 "Passing a PdfReader directly works as well." 

2613 ) 

2614 return stream, encryption_obj 

2615 

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Concatenate pages onto the end of the file.

    Same as the :meth:`merge()<merge>` method, but no insertion position has
    to be given.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    outline_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection was passed where the outline title goes: treat it
        # as ``pages`` and shift the following positional arguments.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2683 

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    After the pages are copied, the named destinations, the outline, the
    annotations, the ``/AcroForm`` fields and the article threads of the
    source document are processed for the copied pages.

    Args:
        position: The *page number* handed to ``insert_page`` for the first
            merged page; it is incremented for every further page.
            ``None`` appends the pages with ``add_page`` instead.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        # no selection: take every page of the source document
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        # (start, stop[, step]) is expanded exactly like range()
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )
    # NOTE(review): a tuple with more than three items matches none of the
    # branches above and is iterated below as given - confirm this is intended.

    # Maps the object number of each source page to its copy in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore[list-item]
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore[list-item]
            )
            position += 1
        # keep the source page reachable from its copy; read again below
        # when annotations and articles are processed
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    for dest in reader._named_destinations.values():
        self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages)

    # Outline node under which the imported outline entries are inserted.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # new outline entry pointing at the first merged page
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and not is_null_or_none(_ro["/AcroForm"]):
        if "/AcroForm" not in self._root_object:
            # first form merged: copy the source /AcroForm without its
            # /Fields; the field list is rebuilt below
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # source object number -> object number in this writer
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore[index]
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        # NOTE(review): the nesting of this assignment is not visible in the
        # extracted text; placed after the try block so that the field list
        # is attached on every path - confirm against the original file.
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2839 

def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Register one named destination of the source document in this writer.

    Nothing is added when the title is already listed among this writer's
    named destinations, when the destination has no page, or when its page
    is not part of ``srcpages``.

    Args:
        dest: Named destination read from ``reader``.
        reader: Document the destination comes from.
        srcpages: Maps the object number of a source page to its copy.

    """
    arr: Any = dest.dest_array

    if "/Names" in self._root_object:
        title = dest["/Title"]
        names_root = cast(DictionaryObject, self._root_object["/Names"])
        dests_root = cast(
            DictionaryObject, names_root.get("/Dests", DictionaryObject())
        )
        if title in cast(list[Any], dests_root.get("/Names", DictionaryObject())):
            # already exists: should not duplicate it
            return

    target = dest["/Page"]
    if target is None or isinstance(target, NullObject):
        return

    if isinstance(target, int):
        # the page reference is a page number normally not a PDF Reference
        # page numbers as int are normally accepted only in external goto
        try:
            source_page = reader.pages[target]
        except IndexError:
            return
        assert source_page.indirect_reference is not None
        try:
            arr[NumberObject(0)] = NumberObject(
                srcpages[source_page.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], arr)
        except KeyError:
            pass
        return

    source_id = target.indirect_reference.idnum
    if source_id in srcpages:
        arr[NumberObject(0)] = srcpages[source_id].indirect_reference
        self.add_named_destination_array(dest["/Title"], arr)

2873 

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry. The chain of
    articles reachable from ``/F`` through ``/N`` is then walked, and a new
    article is created for every one whose page has a copy in ``pages``.
    The new articles are linked to each other with ``/N`` and ``/V`` and
    are registered in the ``/B`` array of their page.

    Args:
        thread: Thread dictionary of the source document.
        pages: Mapping passed to ``_get_cloned_page`` to find the copy of
            a source page.
        reader: Source document, passed on to ``_get_cloned_page``.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first article that is kept: it becomes the thread's /F
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # further article: /V points back to the previous one and
                # the previous one gets its /N set to the new article
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # back at the start of the source chain: close the new chain
            # into a ring and stop.
            # NOTE(review): if no article was kept, new_article is still
            # None and new_first is unbound here - confirm callers only
            # pass threads with at least one article on a copied page.
            new_article[NameObject("/N")] = new_first.indirect_reference  # type: ignore[index]
            new_first[NameObject("/V")] = new_article.indirect_reference  # type: ignore[union-attr]
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2938 

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern (compiled or as text) searched in the thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    The ``/B`` entries of the original page of every page in ``pages`` are
    scanned. A thread is added when it has no entry yet in the translation
    table of ``reader`` and ``fltr`` matches the ``/Title`` of its ``/I``
    dictionary.

    Args:
        fltr: Regular expression, compiled or as a string. Any other
            value is replaced by the empty pattern.
        pages: Copied pages; each must carry ``original_page``.
        reader: Document the pages were copied from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")

    for copied_page in pages.values():
        source_page = copied_page.original_page
        for bead_ref in source_page.get("/B", ()):
            bead = bead_ref.get_object()
            if is_null_or_none(bead):
                continue
            thread = bead.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # thread already has a counterpart in this writer
                continue
            thread_title = (thread.get("/I", {})).get("/Title", "")
            if fltr.search(thread_title):
                self._add_articles_thread(thread, pages, reader)

2974 

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the copy of a source page.

    Args:
        page: Source page, given as a page dictionary (``/Type`` equal to
            ``/Page``) or as an indirect reference.
        pages: Maps the object number of a source page to its copy.
        reader: Not used by this method.

    Returns:
        The indirect reference of the copy, or ``None`` when ``page`` is
        ``None``, a ``NullObject``, not a page, or has no copy in ``pages``.

    """
    if page is None or isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # Neither a page dictionary nor a reference. Previously this case
        # returned None only because the unbound local was swallowed by a
        # blanket exception handler.
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore[union-attr]
    except (KeyError, AttributeError):
        # KeyError: the page was not copied.
        # AttributeError: the page dictionary has no indirect reference.
        return None

2991 

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations of a source page into this writer.

    Annotations without an internal destination are cloned as they are.
    Annotations that point to a page (through ``/Dest`` or through the
    ``/D`` entry of a ``/GoTo`` action) are only kept when the target page
    has a copy in ``pages``; the destination is then rewritten to that
    copy. Annotations pointing to a named destination are only kept when
    the name is known to this writer.

    Args:
        annots: The ``/Annots`` value of the source page; an indirect
            reference is resolved first.
        page: Not used in the body.
        pages: Mapping passed to ``_get_cloned_page``.
        reader: Source document, passed to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` with the references of the kept annotations;
        empty when ``annots`` is ``None`` or not a list.

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(
            "Expected list of annotations, got %(annots)s of type %(annots_type)s.",
            source=__name__,
            annots=annots,
            annots_type=annots.__class__.__name__,
        )
        return outlist
    for an in annots:
        ano = cast("DictionaryObject", an.get_object())
        # First branch: everything except a /Link whose target is given
        # only by a /GoTo action (no /Dest entry).
        if (
            ano["/Subtype"] != "/Link"  # type: ignore[comparison-overlap]
            or "/A" not in ano
            or cast("DictionaryObject", ano["/A"])["/S"] != "/GoTo"  # type: ignore[comparison-overlap]
            or "/Dest" in ano
        ):
            if "/Dest" not in ano:
                # no destination to translate: plain clone
                outlist.append(self._add_object(ano.clone(self)))
            else:
                d = ano["/Dest"]
                if isinstance(d, str):
                    # it is a named dest
                    if str(d) in self.get_named_dest_root():
                        outlist.append(ano.clone(self).indirect_reference)
                else:
                    # explicit destination array: first item is the page
                    d = cast("ArrayObject", d)
                    p = self._get_cloned_page(d[0], pages, reader)
                    if p is not None:
                        anc = ano.clone(self, ignore_fields=("/Dest",))
                        anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
                        outlist.append(self._add_object(anc))
        else:
            # /Link with a /GoTo action: the destination is the action's /D
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if is_null_or_none(d):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
            else:
                d = cast("ArrayObject", d)
                p = self._get_cloned_page(d[0], pages, reader)
                if p is not None:
                    anc = ano.clone(self, ignore_fields=("/D",))
                    cast("DictionaryObject", anc["/A"])[
                        NameObject("/D")
                    ] = ArrayObject([p, *d[1:]])
                    outlist.append(self._add_object(anc))
    return outlist

3053 

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    Args:
        node: Outline root or outline item of the source document (object
            or reference); ``None`` and null objects give an empty result.
        pages: Mapping passed to ``_get_cloned_page`` to find the copy of
            a source page.
        reader: Source document; used to build the outline items.

    Returns:
        A list of destination objects. Each has its ``/Page`` replaced by
        the reference of the copied page (or a ``NullObject``) and carries
        its kept children in ``_filtered_children``. Items with neither a
        copied page nor kept children are left out.

    """
    new_outline = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()
    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # container without a title of its own: descend to its first child
        node = node.get("/First", None)
        if node is not None:
            node = node.get_object()
            new_outline += self._get_filtered_outline(node, pages, reader)
    else:
        v: Union[None, IndirectObject, NullObject]
        # walk the siblings through /Next
        while node is not None:
            node = node.get_object()
            o = cast("Destination", reader._build_outline_item(node))
            v = self._get_cloned_page(cast("PageObject", o["/Page"]), pages, reader)
            if v is None:
                v = NullObject()
            o[NameObject("/Page")] = v
            if "/First" in node:
                o._filtered_children = self._get_filtered_outline(
                    node["/First"], pages, reader
                )
            else:
                o._filtered_children = []
            if (
                not isinstance(o["/Page"], NullObject)
                or len(o._filtered_children) > 0
            ):
                new_outline.append(o)
            node = node.get("/Next", None)
    return new_outline

3105 

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in this writer from a filtered destination.

    The title is always copied. When the destination has a page, the
    ``/A`` action of the original node is cloned if present, otherwise the
    destination array is used as ``/Dest``. ``/F`` and ``/C`` are taken
    from the original node, defaulting to ``0`` and black.

    Args:
        dest: Destination to copy.

    Returns:
        The new outline item, already registered as an object of this writer.

    """
    cloned_item = TreeObject()
    self._add_object(cloned_item)
    cloned_item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    has_page = not isinstance(dest["/Page"], NullObject)
    if has_page:
        if source_node is not None and "/A" in source_node:
            cloned_item[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            cloned_item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        cloned_item[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        cloned_item[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return cloned_item

3124 

3125 def _insert_filtered_outline( 

3126 self, 

3127 outlines: list[Destination], 

3128 parent: Union[TreeObject, IndirectObject], 

3129 before: Union[None, TreeObject, IndirectObject] = None, 

3130 ) -> None: 

3131 for dest in outlines: 

3132 # TODO: can be improved to keep A and SE entries (ignored for the moment) 

3133 # with np=self.add_outline_item_destination(dest,parent,before) 

3134 if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest: 

3135 np = parent 

3136 else: 

3137 np = self._clone_outline(dest) 

3138 cast(TreeObject, parent.get_object()).insert_child(np, before, self) 

3139 self._insert_filtered_outline(dest._filtered_children, np, None) 

3140 

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3144 

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position as a list of indexes.

    An item matches when its indirect reference or its ``/Title`` equals
    ``outline_item``.

    Args:
        outline_item: Reference or title to look for.
        root: Node at which the search starts; the outline root of this
            writer when ``None``.

    Returns:
        One index per outline level leading to the item, or ``None`` when
        it is not found.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        matches = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if matches:
            return [index]
        if "/First" in node:
            nested = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if nested:
                # a node without title does not count as a level
                own_level = [index] if "/Title" in node else []
                return own_level + nested
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover

3174 

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither ``None``, a ``PdfReader`` nor an
            ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
        return

    if isinstance(reader, PdfReader):
        table_key = id(reader)
    elif isinstance(reader, IndirectObject):
        table_key = id(reader.pdf)
    else:
        # f-string: the message previously contained the literal text
        # "{reader}" instead of the offending value
        raise Exception(f"invalid parameter {reader}")

    # a reader without a table is silently accepted
    self._id_translated.pop(table_key, None)

3202 

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` and ``None`` are accepted by the checks below without
               an error.

    Raises:
        ValueError: Neither style nor prefix is given, the page range is
            invalid, or ``start`` is below one (and not ``0``).

    """
    # The checks are evaluated in this order; the first failing one is reported.
    if style is None and prefix is None:
        problem: Optional[str] = "At least one of style and prefix must be given"
    elif page_index_from < 0:
        problem = "page_index_from must be greater or equal than 0"
    elif page_index_to < page_index_from:
        problem = "page_index_to must be greater or equal than page_index_from"
    elif page_index_to >= len(self.pages):
        problem = "page_index_to exceeds number of pages"
    elif start is not None and start != 0 and start < 1:
        problem = "If given, start must be greater or equal than one"
    else:
        problem = None

    if problem is not None:
        raise ValueError(problem)

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3253 

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` or ``None`` no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``start`` is Optional: None must be skipped like 0, otherwise it
    # would be handed to NumberObject().
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no page labels yet: start with the default label at page 0
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # pages after the range fall back to the default label unless a label
    # already starts right behind the range
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3318 

3319 def _repr_mimebundle_( 

3320 self, 

3321 include: Union[None, Iterable[str]] = None, 

3322 exclude: Union[None, Iterable[str]] = None, 

3323 ) -> dict[str, Any]: 

3324 """ 

3325 Integration into Jupyter Notebooks. 

3326 

3327 This method returns a dictionary that maps a mime-type to its 

3328 representation. 

3329 

3330 .. seealso:: 

3331 

3332 https://ipython.readthedocs.io/en/stable/config/integrating.html 

3333 """ 

3334 pdf_data = BytesIO() 

3335 self.write(pdf_data) 

3336 data = { 

3337 "application/pdf": pdf_data, 

3338 } 

3339 

3340 if include is not None: 

3341 # Filter representations based on include list 

3342 data = {k: v for k, v in data.items() if k in include} 

3343 

3344 if exclude is not None: 

3345 # Remove representations based on exclude list 

3346 data = {k: v for k, v in data.items() if k not in exclude} 

3347 

3348 return data 

3349 

3350 

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the corresponding PDF object.

    ``PdfObject`` instances are returned unchanged. Dictionaries and lists
    are converted recursively, strings starting with ``/`` become names,
    other strings become text strings, and floats and ints become
    ``FloatObject``.

    Raises:
        NotImplementedError: The type of ``obj`` is not supported.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3370 

3371 

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: Stored as ``/A`` when not ``None``.
        title: Stored as ``/Title``.
        color: Stored as ``/C``; a string is converted with ``hex_to_rgb``.
            Nothing is stored for a falsy value.
        italic: Adds the italic flag to ``/F``.
        bold: Adds the bold flag to ``/F``.

    Returns:
        The new outline item; it is not yet added to any document.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item[NameObject("/Title")] = create_string_object(title)

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        item[NameObject("/C")] = ArrayObject([FloatObject(c) for c in rgb])

    if italic or bold:
        wanted_flags = (
            (italic, OutlineFontFlag.italic),
            (bold, OutlineFontFlag.bold),
        )
        format_flag = sum(flag for wanted, flag in wanted_flags if wanted)
        item[NameObject("/F")] = NumberObject(format_flag)
    return item