Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 20%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1481 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import decimal 

31import enum 

32import hashlib 

33import re 

34import struct 

35import uuid 

36from collections.abc import Iterable, Mapping 

37from io import BytesIO, FileIO, IOBase 

38from itertools import compress 

39from pathlib import Path 

40from re import Pattern 

41from types import TracebackType 

42from typing import ( 

43 IO, 

44 Any, 

45 Callable, 

46 Optional, 

47 Union, 

48 cast, 

49) 

50 

51from ._cmap import _default_fonts_space_width, build_char_map_from_dict 

52from ._doc_common import DocumentInformation, PdfDocCommon 

53from ._encryption import EncryptAlgorithm, Encryption 

54from ._page import PageObject, Transformation 

55from ._page_labels import nums_clear_range, nums_insert, nums_next 

56from ._reader import PdfReader 

57from ._utils import ( 

58 StrByteType, 

59 StreamType, 

60 _get_max_pdf_version_header, 

61 deprecation_no_replacement, 

62 logger_warning, 

63) 

64from .constants import AnnotationDictionaryAttributes as AA 

65from .constants import CatalogAttributes as CA 

66from .constants import ( 

67 CatalogDictionary, 

68 GoToActionArguments, 

69 ImageType, 

70 InteractiveFormDictEntries, 

71 OutlineFontFlag, 

72 PageLabelStyle, 

73 PagesAttributes, 

74 TypFitArguments, 

75 UserAccessPermissions, 

76) 

77from .constants import Core as CO 

78from .constants import FieldDictionaryAttributes as FA 

79from .constants import PageAttributes as PG 

80from .constants import TrailerKeys as TK 

81from .errors import PdfReadError, PyPdfError 

82from .generic import ( 

83 PAGE_FIT, 

84 ArrayObject, 

85 BooleanObject, 

86 ByteStringObject, 

87 ContentStream, 

88 DecodedStreamObject, 

89 Destination, 

90 DictionaryObject, 

91 EmbeddedFile, 

92 Fit, 

93 FloatObject, 

94 IndirectObject, 

95 NameObject, 

96 NullObject, 

97 NumberObject, 

98 PdfObject, 

99 RectangleObject, 

100 ReferenceLink, 

101 StreamObject, 

102 TextStringObject, 

103 TreeObject, 

104 ViewerPreferences, 

105 create_string_object, 

106 extract_links, 

107 hex_to_rgb, 

108 is_null_or_none, 

109) 

110from .pagerange import PageRange, PageRangeSpec 

111from .types import ( 

112 AnnotationSubtype, 

113 BorderArrayType, 

114 LayoutType, 

115 OutlineItemType, 

116 OutlineType, 

117 PagemodeType, 

118) 

119from .xmp import XmpInformation 

120 

121ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all() 

122DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 

123 

124 

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of PDF objects.

    Each basic member occupies its own bit so members can be OR-ed together.
    Judging by the class name these select what is to be deleted; the
    consumers of the flag are not part of this block.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite member: the union of the three image-related bits above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

136 

137 

138def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: 

139 hash = hashlib.md5(usedforsecurity=False) 

140 for block in iter(lambda: stream.read(blocksize), b""): 

141 hash.update(block) 

142 return hash.hexdigest() 

143 

144 

145class PdfWriter(PdfDocCommon): 

146 """ 

147 Write a PDF file out, given pages produced by another class or through 

148 cloning a PDF file during initialization. 

149 

150 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`. 

151 

152 Args: 

153 clone_from: identical to fileobj (for compatibility) 

154 

155 incremental: If true, loads the document and sets the PdfWriter in incremental mode. 

156 

157 When writing incrementally, the original document is written first and new/modified 

158 content is appended. To be used for signed document/forms to keep signature valid. 

159 

160 full: If true, loads all the objects (always full if incremental = True). 

161 This parameter may allow loading large PDFs. 

162 

163 strict: If true, pypdf will raise an exception if a PDF does not follow the specification. 

164 If false, pypdf will try to be forgiving and do something reasonable, but it will log 

165 a warning message. It is a best-effort approach. 

166 

167 """ 

168 

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
) -> None:
    # Overview of the construction steps:
    #   1. initialise the bookkeeping attributes;
    #   2. in incremental/full mode wrap the input in a PdfReader, otherwise
    #      start with a fresh /Info dictionary;
    #   3. either clone an existing document or build an empty page tree
    #      and catalog;
    #   4. normalise text-string entries of the /ID array to byte strings.
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    # ``full`` goes through the same loading path as ``incremental``; the
    # flag is reset further down when only ``full`` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    self._info_obj: Optional[PdfObject]
    """The PDF file's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    if self.incremental:
        # Paths are read fully into memory, then wrapped in a reader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which source (if any) the writer should be cloned from.
        # An explicit ``clone_from`` wins; otherwise ``fileobj`` is used
        # unless it is the empty default, a missing/empty file, or an
        # empty stream.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        if isinstance(fileobj, (IOBase, BytesIO)):
            # Probe the stream size, then restore the original position.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    if full and not incremental:
        self.incremental = False
    if isinstance(self._ID, list):
        # Convert text-string identifiers to byte strings. Only the first
        # two entries are touched (as before); bounding the loop by the
        # actual length avoids an IndexError on an /ID array with fewer
        # than two entries.
        for position in range(min(2, len(self._ID))):
            entry = self._ID[position]
            if isinstance(entry, TextStringObject):
                self._ID[position] = ByteStringObject(entry.get_original_bytes())

306 

307 # for commonality 

@property
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    This implementation unconditionally reports ``False``.

    Note (kept from the shared property contract): if the value is true it
    stays true even after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    return False

317 

@property
def root_object(self) -> DictionaryObject:
    """
    Expose the writer's root dictionary (``self._root_object``) directly.

    Note:
        Recommended only for read access.

    """
    catalog = self._root_object
    return catalog

328 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Give access to "/Info", mirroring the PdfReader attribute of the same name.

    Returns:
        The dictionary obtained by resolving ``self._info_obj``, or None
        when no info object is set.

    """
    if self._info_obj is None:
        return None
    resolved = self._info_obj.get_object()
    return cast(DictionaryObject, resolved)

343 

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace or remove the "/Info" dictionary.

    Args:
        value: ``None`` drops the current info object (its slot in
            ``self._objects`` is set to None on a best-effort basis);
            otherwise the existing info dictionary, created on demand, is
            emptied and refilled from ``value.get_object()``.

    """
    if value is not None:
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        target = cast(DictionaryObject, self._info_obj.get_object())
        target.clear()
        target.update(cast(DictionaryObject, value.get_object()))
        return

    try:
        # Fails harmlessly (AttributeError) when there is no info object.
        self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
    except (KeyError, AttributeError):
        pass
    self._info_obj = None

358 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    catalog = self.root_object
    return cast(XmpInformation, catalog.xmp_metadata)

363 

@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """
    Set, replace or remove the XMP (Extensible Metadata Platform) data.

    Args:
        value: ``None`` removes "/Metadata" from the root object.
            An :class:`XmpInformation` contributes the data of its stream;
            any other value is stored as given.

    """
    catalog = self.root_object
    if value is None:
        if "/Metadata" in catalog:
            del catalog["/Metadata"]
        return

    current = catalog.get("/Metadata", None)
    if isinstance(current, IndirectObject):
        # Reuse the stream that the existing reference points to.
        target_stream = cast(StreamObject, current.get_object())
    else:
        # Missing or direct entry: register a new stream and reference it.
        if current is not None:
            del catalog["/Metadata"]
        target_stream = StreamObject()
        new_reference = self._add_object(target_stream)
        catalog[NameObject("/Metadata")] = new_reference

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    target_stream.set_data(payload)

387 

@property
def with_as_usage(self) -> bool:
    """
    Deprecated accessor for ``self._with_as_usage``.

    ``deprecation_no_replacement("with_as_usage", "5.0")`` is invoked before
    the stored value is returned.
    """
    deprecation_no_replacement("with_as_usage", "5.0")
    flag = self._with_as_usage
    return flag

392 

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """
    Deprecated mutator for ``self._with_as_usage``.

    ``deprecation_no_replacement("with_as_usage", "5.0")`` is invoked before
    the new value is stored.
    """
    deprecation_no_replacement("with_as_usage", "5.0")
    self._with_as_usage = value

397 

398 def __enter__(self) -> "PdfWriter": 

399 """Store how writer is initialized by 'with'.""" 

400 c: bool = self._cloned 

401 t = self.temp_fileobj 

402 self.__init__() # type: ignore 

403 self._cloned = c 

404 self._with_as_usage = True 

405 self.fileobj = t # type: ignore 

406 return self 

407 

408 def __exit__( 

409 self, 

410 exc_type: Optional[type[BaseException]], 

411 exc: Optional[BaseException], 

412 traceback: Optional[TracebackType], 

413 ) -> None: 

414 """Write data to the fileobj.""" 

415 if self.fileobj and not self._cloned: 

416 self.write(self.fileobj) 

417 

@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    A typical value is ``'%PDF-1.5'``. The recommendation is to use the
    lowest version that supports every feature used within the PDF file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    raw_header = self._header
    return raw_header.decode()

430 

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header; a ``str`` is encoded to bytes first, bytes are kept as given."""
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

436 

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Args:
        obj: The object to register.

    Returns:
        The reference to ``obj``. An object whose ``indirect_reference``
        already belongs to this writer is not registered again; its
        existing reference is returned.

    """
    known_ref = getattr(obj, "indirect_reference", None)
    if known_ref is not None and obj.indirect_reference.pdf == self:  # type: ignore
        return obj.indirect_reference  # type: ignore

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        direct_contents = obj.get(PG.CONTENTS, None)
        if isinstance(direct_contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference

451 

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an object of this writer.

    Args:
        indirect_reference: Either a 1-based object number or an
            IndirectObject belonging to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the IndirectObject belongs to another document.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    elif indirect_reference.pdf != self:
        raise ValueError("PDF must be self")
    else:
        idnum = indirect_reference.idnum
    found = self._objects[idnum - 1]
    assert found is not None, "mypy"
    return found

464 

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot of an already registered object.

    Args:
        indirect_reference: 1-based object number, or an IndirectObject
            of this writer, identifying the slot.
        obj: The replacement. If it carries a reference to a different
            document, a clone made for this writer is stored instead.

    Returns:
        The object that now occupies the slot.

    Raises:
        ValueError: If the IndirectObject belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    else:
        idnum = indirect_reference
    slot = idnum - 1
    # The generation number of the previous occupant is kept.
    generation = self._objects[slot].indirect_reference.generation  # type: ignore

    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and obj.indirect_reference.pdf != self:  # type: ignore
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(idnum, generation, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

485 

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and hook it into the page tree.

    Args:
        page: The page to add; must be a PageObject whose type entry is
            the page type.
        index: Position handed to ``_get_page_in_node`` to find the
            parent node and the position within it.
        excluded_keys: Keys not to copy while cloning. The parent key and
            "/StructParents" are always excluded in addition.

    Returns:
        The cloned page as stored in this writer.

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If more than 1000 ancestors are walked while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    skip_keys = [*excluded_keys, PagesAttributes.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(source_page.indirect_reference.pdf)][  # type: ignore
            source_page.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        pass

    new_page = cast(
        "PageObject", source_page.clone(self, False, skip_keys).get_object()
    )
    if source_page.pdf is not None:
        # Raise this writer's header to the higher of the two versions.
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, source_header)

    node, position_in_node = self._get_page_in_node(index)
    new_page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

    if position_in_node >= 0:
        cast(ArrayObject, node[PagesAttributes.KIDS]).insert(
            position_in_node, new_page.indirect_reference
        )
        self.flattened_pages.insert(index, new_page)
    else:
        cast(ArrayObject, node[PagesAttributes.KIDS]).append(new_page.indirect_reference)
        self.flattened_pages.append(new_page)

    # Increment the page count of the parent node and of every ancestor.
    depth = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
        node = node.get(PagesAttributes.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        depth += 1
        if depth > 1000:
            raise PyPdfError("Too many recursive calls!")

    if source_page.pdf is not None:
        # the page may contain links to other pages, and those other
        # pages may or may not already be added. we store the
        # information we need, so that we can resolve the references
        # later.
        self._unresolved_links.extend(extract_links(new_page, source_page))
        self._merged_in_pages[source_page.indirect_reference] = new_page.indirect_reference

    return new_page

543 

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag in the writer's AcroForm dictionary.

    The flag tells whether the appearance dictionary of form fields is to
    be generated by the PDF viewer or whether the embedded appearance is
    to be used. An AcroForm dictionary is created when the root object has
    none. Any exception raised on the way is logged as a warning instead of
    being propagated.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in self._root_object:
            empty_form_ref = self._add_object(DictionaryObject())
            self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = empty_form_ref

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

576 

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ViewerPreferences object and attach it to the root object.

    Returns:
        The newly created ViewerPreferences; an existing entry under the
        same key in the root object is overwritten.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences

583 

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys forwarded to ``_add_page`` as keys to exclude.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)

608 

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative
            values count from the end; values at or beyond the current
            number of pages append the page.
        excluded_keys: Keys forwarded to ``_add_page`` as keys to exclude.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative ``index`` reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index < page_count:
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)

636 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Resolve a reference to the number of the page it points to.

    Provided so that the writer offers the same function as PdfReader.

    Args:
        indirect_reference: ``None``/null, an object number of this
            writer, or an IndirectObject.

    Returns:
        The page number or None (null input, or the referenced object is
        not a PageObject).

    """
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        # A bare number is taken as an object of this writer, generation 0.
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    target = indirect_reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

660 

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    blank = PageObject.create_blank_page(self, width, height)
    appended = self.add_page(blank)
    return appended

685 

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page currently exists at ``index``,
    both dimensions are taken from that page. Otherwise the given values
    are handed to ``PageObject.create_blank_page`` unchanged, which is
    expected to resolve a missing size (see "Raises").

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the object returned by
        :meth:`insert_page`.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # The parentheses matter: without them a missing width indexed
    # ``self.pages`` even when ``index`` was past the last page, raising
    # IndexError instead of letting create_blank_page resolve the size.
    if (width is None or height is None) and index < self.get_num_pages():
        reference_page = self.pages[index]
        width = reference_page.mediabox.width
        height = reference_page.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # Return what insert_page() hands back (the page stored in the writer),
    # consistent with add_blank_page().
    return self.insert_page(page, index)

719 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the open destination as provided by the parent class property."""
    inherited_value = super().open_destination
    return inherited_value

725 

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    Args:
        dest: ``None`` removes the entry (a missing entry is ignored).
            A ``str`` is stored as a text string, a Destination as its
            destination array, and a PageObject as the destination array
            of a page-fit destination named "Opening" pointing at that
            page. Any other type leaves the root object untouched.

    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        action = dest.dest_array
    elif isinstance(dest, PageObject):
        page_ref = dest.indirect_reference
        if page_ref is None:
            # A page without a reference is represented by a null object.
            page_ref = NullObject()
        action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        return
    self._root_object[NameObject("/OpenAction")] = action

745 

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    # Names / JavaScript preferred to be able to add multiple scripts
    root = self._root_object
    if "/Names" not in root:
        root[NameObject(CA.NAMES)] = DictionaryObject()
    name_tree = cast(DictionaryObject, root[CA.NAMES])

    if "/JavaScript" not in name_tree:
        name_tree[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_entry = cast(DictionaryObject, name_tree["/JavaScript"])
    script_entries = cast(ArrayObject, js_entry["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_entries.append(create_string_object(str(uuid.uuid4())))

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    # The array alternates between a name and the reference to its action.
    script_entries.append(self._add_object(action))

780 

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    embedded = EmbeddedFile._create_new(self, filename, data)
    return embedded

798 

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: a PdfReader object whose pages are added to this writer
            one by one through :meth:`add_page`.
        after_page_append:
            Optional callback invoked after each page is appended. Its
            single parameter is the page as returned by :meth:`add_page`,
            i.e. the page just appended to the document.

    """
    page_total = len(reader.pages)
    for position in range(page_total):
        appended_page = self.add_page(reader.pages[position])
        # Non-callable values (including the default None) are skipped.
        if callable(after_page_append):
            after_page_append(appended_page)

830 

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Combine the page's existing content stream(s) with new content bytes.

    Three situations are handled: no "/Contents" entry (a new stream is
    created), an array of streams (a new stream is appended to the array),
    and a single stream (a new stream holding the old data, a newline and
    the new data replaces it).

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    contents_key = NameObject("/Contents")

    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        fresh_stream = StreamObject()
        fresh_stream.set_data(new_content_data)
        page[NameObject("/Contents")] = self._add_object(fresh_stream)
        return

    # First resolve the existing page content. This always is an IndirectObject:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current = page[contents_key].get_object()

    if isinstance(current, ArrayObject):
        # Create a new StreamObject for the new_content_data
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        current.append(self._add_object(extra_stream))
        page[NameObject("/Contents")] = self._add_object(current)
    if isinstance(current, StreamObject):
        # Merge new content to existing StreamObject
        combined_stream = StreamObject()
        combined_stream.set_data(current.get_data() + b"\n" + new_content_data)
        page[NameObject("/Contents")] = self._add_object(combined_stream)

869 

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Add an appearance stream to the page content as an XObject.

    The stream is registered in the page's "/XObject" resources under the
    sanitized name ``/Fm_<object_name>`` (an existing entry of that name is
    kept and a warning is logged), and drawing commands that translate by
    the offsets and paint the XObject are merged into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given). It is
            added to the page's "/Font" resources under its "/BaseFont"
            name unless that name is already present.
    """
    # Prepare XObject resource dictionary on the page
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    if font_res is not None:
        base_font = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if base_font not in page_fonts:
            page_fonts[NameObject(base_font)] = font_res

    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    stream_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = stream_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

917 

def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Rebuild the normal appearance stream (/AP /N) of a field widget.

    Args:
        page: Page that receives the appearance stream when ``flatten``
            is set; otherwise unused here.
        field: Dictionary from which the value (/V), field type and field
            flags are read.
        annotation: Widget annotation whose rectangle, /DA, /DR and /AP
            entries are read and whose /AP /N entry is (re)written.
        font_name: When non-empty, replaces the font name found in the
            default appearance string.
        font_size: Used as font height when >= 0; otherwise the size from
            the default appearance string is used. A resulting height of
            0 is replaced (see below).
        flatten: When true, the generated stream is also added to the
            page content through ``_add_apstream_object``.
    """
    # Calculate rectangle dimensions
    _rct = cast(RectangleObject, annotation[AA.Rect])
    # Same size as the widget rectangle, but anchored at the origin.
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    # Extract font information
    # The default appearance string is looked up on the annotation (with
    # inheritance), falling back to the one of the /AcroForm dictionary.
    da = annotation.get_inherited(
        AA.DA,
        cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
            AA.DA, None
        ),
    )
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    # Tokenize on whitespace; the two tokens before "Tf" are the font name
    # and the font size.
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    # NOTE(review): ``index("Tf")`` raises ValueError when the string holds
    # no "Tf" token - confirm that callers never pass such a /DA.
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    font_height = (
        font_size
        if font_size >= 0
        else float(font_properties[font_properties.index("Tf") - 1])
    )
    # A height of 0 is replaced by a fixed default for multiline fields and
    # by the rectangle height minus 2 otherwise.
    if font_height == 0:
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                cast(
                    DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
                ).get("/DR", DictionaryObject()),
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary
        dr = cast(
            dict[Any, Any],
            cast(
                DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
            ).get("/DR", {}),
        )
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    if not is_null_or_none(font_res):
        font_res = cast(DictionaryObject, font_res.get_object())
        _font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        try:  # remove width stored in -1 key
            del font_map[-1]
        except KeyError:
            pass
        # Reverse mapping (character -> bytes) handed to the appearance
        # stream generator.
        font_full_rev: dict[str, bytes]
        if isinstance(font_encoding, str):
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    # Choice fields without the Combo flag show every option, one per line.
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    # Carry over every other entry of an existing normal appearance.
    if AA.AP in annotation:
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary
    # NOTE(review): the lookup above tests ``is_null_or_none`` while this
    # tests ``is not None`` - confirm a null font object cannot reach here.
    if font_res is not None:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        # Reuse the object number of the existing stream so that references
        # to it now resolve to the new stream.
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        # Place the stream at the lower-left corner of the widget rectangle.
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)

1071 

# Flag value with no bit set; used as the default of the ``flags`` parameter
# of ``update_page_form_field_values``.
FFBITS_NUL = FA.FfBits(0)

1073 

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `list[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the writer has no /AcroForm dictionary, or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        # Recurse per page; ``None`` is passed for auto_regenerate because
        # the flag has already been handled above.
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is treated as its own field;
        # otherwise the field data is taken from its parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            # Match on either the qualified name or the partial name (/T).
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # NOTE(review): the tuple form writes /V to the widget,
                    # the other forms write it to the parent - confirm
                    # this is intended.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # A value without a matching normal appearance falls back to /Off.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # textbox
                if isinstance(value, tuple):
                    # Tuple form: (text, font id, font size).
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1195 

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Look for orphan fields among a page's annotations and reattach
    them into the Fields structure.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        # No page given: process every page and concatenate the results.
        for current_page in self.pages:
            reattached += self.reattach_fields(current_page)
        return reattached

    # Fetch /AcroForm and its /Fields array, creating whichever is missing.
    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached

    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # Direct objects must be registered before they can be referenced.
            annotations[position] = self._add_object(widget)
        fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1245 

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader's document root, with all its sub-elements
    (pages, threads, outlines,...), into this writer. For partial
    insertion, ``append`` should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        # Incremental mode mirrors the reader's object table slot by slot.
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for idnum in range(1, len(self._objects) + 1):
            source_obj = reader.get_object(idnum)
            if source_obj is not None:
                self._objects[idnum - 1] = source_obj.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    if len(self._objects) > cast(int, reader.trailer["/Size"]):
        size_message = (
            f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}"
        )
        if self.strict:
            raise PdfReadError(size_message)
        logger_warning(size_message, __name__)

    # must be done here before rewriting
    if self.incremental:
        hashes = []
        for stored in self._objects:
            hashes.append(stored.hash_bin() if stored is not None else 0)
        self._original_hash = hashes

    try:
        self._flatten()
    except IndexError:
        raise PdfReadError("Got index error while flattening.")

    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_ref = cast(IndirectObject, flat_page.indirect_reference)
        self._replace_object(page_ref.idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Point the page tree root directly at the flattened pages.
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )

1298 

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF file reader, copying the '/Root',
    '/Info' and '/ID' sections of the pdf.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function that is invoked after each page is appended to
            the writer. Signature includes a reference to the appended page
            (delegates to append_pages_from_reader). The single parameter of
            the callback is a reference to the page just appended to the
            document.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    # When the reader has no info dictionary, _info_obj stays None as set
    # by clone_reader_document_root().
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned info object as its original hash.
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if not callable(after_page_append):
        return
    pages_root = cast(DictionaryObject, self._pages.get_object())
    for kid in cast(ArrayObject, pages_root["/Kids"]):
        after_page_append(kid.get_object())

1346 

def _compute_document_identifier(self) -> ByteStringObject:
    """Return an identifier computed from the serialized document body."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1352 

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".
    """
    existing = self._ID
    if existing:
        # Keep the first identifier, refresh the second one.
        identifiers = (existing[0], self._compute_document_identifier())
    else:
        fresh = self._compute_document_identifier()
        identifiers = (fresh, fresh)
    self._ID = ArrayObject(identifiers)

1372 

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        chosen = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256" style names map onto EncryptAlgorithm attribute names.
        try:
            chosen = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(chosen, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, owner_password)

    # `encrypt` may be called more than once: in that case the new entry
    # takes over the object slot of the previous one.
    previous_entry = self._encrypt_entry
    if previous_entry:
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry

1431 

1432 def _resolve_links(self) -> None: 

1433 """Patch up links that were added to the document earlier, to 

1434 make sure they still point to the same pages. 

1435 """ 

1436 for (new_link, old_link) in self._unresolved_links: 

1437 old_page = old_link.find_referenced_page() 

1438 if not old_page: 

1439 continue 

1440 new_page = self._merged_in_pages.get(old_page) 

1441 if new_page is None: 

1442 continue 

1443 new_link.patch_reference(self, new_page) 

1444 

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and
    the increment is appended only when objects were added or modified;
    otherwise body, cross-reference table and trailer are written.

    Args:
        stream: Destination stream. A warning is logged when it exposes
            a ``mode`` attribute that does not contain ``"b"``.
    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    source = self._reader.stream
    source.seek(0)
    stream.write(source.read(-1))
    if self.list_objects_in_increment():
        self._write_increment(stream)  # writes objs, xref stream and startxref

1465 

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened (and closed) by
        this method, and the stream that was written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    my_file = isinstance(stream, (str, Path))
    if my_file:
        stream = FileIO(stream, "wb")
        # The file is owned by this method: release the handle even when
        # serialization fails, instead of leaking it.
        try:
            self.write_stream(stream)
        finally:
            stream.close()
    else:
        # Caller-owned streams are flushed but left open.
        self.write_stream(stream)
        stream.flush()

    return my_file, stream

1497 

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Lists the new or modified objects that will be written in the
    increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    known_count = len(self._original_hash)
    changed = []
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        # Objects beyond the original table are new; the others count as
        # modified only when their hash differs from the recorded one.
        if index < known_count and obj.hash_bin() == self._original_hash[index]:
            continue
        changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1521 

def _write_increment(self, stream: StreamType) -> None:
    """
    Append the new/modified objects, a cross-reference stream and the
    ``startxref``/``%%EOF`` footer to ``stream``.

    An object is written when it lies beyond the recorded original hashes
    or when its current hash differs from the recorded one.

    Args:
        stream: Destination stream; ``tell()`` is used to record offsets.
    """
    object_positions = {}
    # Runs of consecutive object numbers as [first, count] pairs; they are
    # flattened into the /Index array of the xref stream.
    object_blocks = []
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            # A gap in the numbering closes the current run and opens a new one.
            if idnum != current_stop:
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    # Close the last run.
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # The xref stream takes the first object number after the existing ones.
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is only emitted when the info object is new or has changed.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One entry per written object, packed as 1 + 4 + 1 bytes to match /W:
    # the constant 1, the byte offset, and the constant 0.
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1592 

1593 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]: 

1594 object_positions = [] 

1595 free_objects = [] 

1596 stream.write(self.pdf_header.encode() + b"\n") 

1597 stream.write(b"%\xE2\xE3\xCF\xD3\n") 

1598 

1599 for idnum, obj in enumerate(self._objects, start=1): 

1600 if obj is not None: 

1601 object_positions.append(stream.tell()) 

1602 stream.write(f"{idnum} 0 obj\n".encode()) 

1603 if self._encryption and obj != self._encrypt_entry: 

1604 obj = self._encryption.encrypt_object(obj, idnum, 0) 

1605 obj.write_to_stream(stream) 

1606 stream.write(b"\nendobj\n") 

1607 else: 

1608 object_positions.append(-1) 

1609 free_objects.append(idnum) 

1610 free_objects.append(0) # add 0 to loop in accordance with specification 

1611 return object_positions, free_objects 

1612 

1613 def _write_xref_table( 

1614 self, stream: StreamType, object_positions: list[int], free_objects: list[int] 

1615 ) -> int: 

1616 xref_location = stream.tell() 

1617 stream.write(b"xref\n") 

1618 stream.write(f"0 {len(self._objects) + 1}\n".encode()) 

1619 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) 

1620 free_idx = 1 

1621 for offset in object_positions: 

1622 if offset > 0: 

1623 stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) 

1624 else: 

1625 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) 

1626 free_idx += 1 

1627 return xref_location 

1628 

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Destination stream.
        xref_location: Offset of the cross-reference table, written
            after ``startxref``.
    """
    stream.write(b"trailer\n")
    mandatory_entries = {
        NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(TK.ROOT): self.root_object.indirect_reference,
    }
    trailer = DictionaryObject(mandatory_entries)
    # Optional entries are only emitted when the writer holds them.
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1652 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    When assigned, the value is a dict with the entries to be set;
    assigning ``None`` removes the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    # Reading is delegated unchanged to the parent class implementation.
    return super().metadata

1667 

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # ``None`` drops the information dictionary altogether.
    if value is None:
        self._info = None
        return

    # Otherwise the existing entries are replaced, not merged.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1680 

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is resolved if it is a PDF object, converted with
    ``str()`` and stored as a PDF string; the information dictionary is
    created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for name, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        converted[NameObject(name)] = create_string_object(str(resolved))

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1700 

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    An object counts as an orphan when no dictionary or array stored in
    the writer references it. The document root, the information
    dictionary, the file identifier and the encryption dictionary are
    never treated as orphans.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        # Only dictionaries and arrays can hold references; anything else
        # is a leaf and ends the recursion.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                # A dangling reference (object number outside the table)
                # must not abort the whole compression.
                if 0 < v.idnum <= len(orphans):
                    orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                replace_in_obj(v, crossref)

    def keep(holder: Any) -> None:
        # Protect the slot of ``holder`` when it is registered in the writer.
        reference = getattr(holder, "indirect_reference", None)
        idnum = getattr(reference, "idnum", None)
        if isinstance(idnum, int) and 0 < idnum <= len(orphans):
            orphans[idnum - 1] = False

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    # (this pass also records which objects are referenced at all)
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        return

    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
    # The objects below are referenced from the trailer only, and any of
    # them may be absent (e.g. after ``metadata = None``).
    keep(self._info)
    keep(self._ID)
    keep(getattr(self, "_encrypt_entry", None))

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1775 

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Build the indirect reference pointing at ``obj`` inside this writer.

    Args:
        obj: An object already stored in the writer's object list.

    Returns:
        An ``IndirectObject`` (generation 0) whose number is the 1-based
        position of ``obj`` in the object list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to the same object.
    assert reference.get_object() == obj
    return reference

1781 

def get_outline_root(self) -> TreeObject:
    """
    Return the outline tree of the catalog, creating it when absent.

    An existing ``/Outlines`` entry that is not a ``TreeObject`` is wrapped
    in one, and the stored object is replaced by the wrapper.

    Returns:
        The outline root as a ``TreeObject``.

    """
    if CO.OUTLINES not in self._root_object:
        # No outline yet: register an empty tree and link it from the catalog.
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        wrapped = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, wrapped)
        outline = wrapped
    # Consistency check: the tree must be retrievable from the object list.
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1800 

def get_threads_root(self) -> ArrayObject:
    """
    Return the catalog's list of threads, creating an empty one if needed.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        # First access: store a fresh array directly in the catalog.
        threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = threads
        return threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1819 

@property
def threads(self) -> ArrayObject:
    """
    The list of threads, exposed as a read-only property.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every entry is a dictionary carrying an ``/F`` key and, optionally,
    thread information under ``/I`` or ``/Metadata``.
    """
    threads_root = self.get_threads_root()
    return threads_root

1831 

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item (or a page wrapped into one) in the outline tree.

    Args:
        page_destination: The outline item to insert. A ``PageObject`` is
            first converted into a ``Destination`` titled
            ``"page #<page_number>"`` using ``Fit.fit()``.
        parent: Outline item under which the new item is inserted;
            the outline root is used when ``None``.
        before: Sibling in front of which the item is inserted, if any.
        is_open: Stored under ``/%is_open%``; when true the parent counters
            are updated through ``inc_parent_counter_outline``.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open so that a page passed directly is
        # placed exactly where the caller asked for.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1869 

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a plain dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then handed to :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping holding the outline item entries.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference returned by
        :meth:`add_outline_item_destination`.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # NOTE: a disabled snippet used to sit here (kept as a string literal
    # marked "code currently unreachable"); it would have copied the "/A"
    # action dictionary into its own indirect object. It never executed.
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1892 

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject`` or an indirect reference. ``None`` creates an
            item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination`.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional call: shift the arguments to the current
        # signature and start over.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown index: keep the raw number as destination target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        destination = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )
        go_to_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): destination.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(go_to_action)

    new_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )
    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(new_item, parent, before, is_open)

1970 

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always; the message points callers to
            :meth:`add_outline_item`.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1975 

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register ``destination`` under ``title`` in the named destinations list.

    The list returned by ``get_named_dest_root`` alternates names and
    destinations. The new pair is inserted before the first name that
    compares greater than ``title``, or appended at the end.

    Args:
        title: Name of the destination.
        destination: The destination (or a reference to it) to register.

    """
    named_dest = self.get_named_dest_root()
    # Names live at even positions, each followed by its destination.
    for position in range(0, len(named_dest), 2):
        if title < named_dest[position]:
            # Insert the destination first so the name ends up in front of it.
            named_dest.insert(position, destination)
            named_dest.insert(position, TextStringObject(title))
            return
    named_dest.extend([TextStringObject(title), destination])

1989 

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title``
            entry; the title is used as the destination name.

    Returns:
        The indirect reference of the stored destination array.

    """
    dest_array = page_destination.dest_array  # type: ignore
    page_destination_ref = self._add_object(dest_array)
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, page_destination_ref)
    return page_destination_ref

2000 

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination that jumps to a page.

    The destination is a ``/GoTo`` action targeting the page with a
    ``FIT_H`` fit and the fixed value 826.

    Args:
        title: Name of the destination; converted to a
            ``TextStringObject`` when it is not one already.
        page_number: Index of the target page in the ``/Kids`` array of
            the page tree root.

    Returns:
        The indirect reference of the created action dictionary.

    """
    kids = self.get_object(self._pages)[PagesAttributes.KIDS]  # type: ignore
    page_ref = kids[page_number]

    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    dest = DictionaryObject()
    dest.update(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    dest_ref = self._add_object(dest)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, dest_ref)
    return dest_ref

2023 

def remove_links(self) -> None:
    """Remove links and annotations from this output."""
    # ALL_ANNOTATIONS covers links as well as every other annotation.
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for page in self.pages:
        self.remove_objects_from_page(page, everything)

2028 

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    if isinstance(subtypes, str):
        # A bare string would otherwise be tested with ``in`` as a
        # substring match instead of an exact subtype comparison.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # Materialize once so a one-shot iterable is not exhausted
        # after the first page.
        subtypes = tuple(subtypes)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

2044 

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of ``page`` whose subtype is listed in ``subtypes``.

    Args:
        page: The page (or a reference to it) to clean up.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    index = 0
    # The array shrinks while we walk it, so the index only advances when
    # the current annotation is kept.
    while index < len(cast(ArrayObject, page[PG.ANNOTS])):
        annot = cast(ArrayObject, page[PG.ANNOTS])[index]
        annot_obj = cast(DictionaryObject, annot.get_object())
        selected = subtypes is None or cast(str, annot_obj["/Subtype"]) in subtypes
        if not selected:
            index += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        del page[PG.ANNOTS][index]  # type:ignore

2062 

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation-related flags (links, attachments, 3D objects, all
    annotations) are handled first and return right away; the remaining
    flags rewrite the page content stream and its form XObjects.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag. Each element of a list or
            tuple is processed in turn with the same ``text_filters``.
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Keep the text filters, otherwise a filtered text removal
            # would silently turn into the removal of all text.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped by ``clean``.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: list[str],
        forms: list[str],
        text_filters: Optional[dict[str, Any]] = None
    ) -> None:
        # Drop the selected operations from ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # Remember the font currently selected in the stream.
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # When removing text with filters, only operations drawn
                # with one of the listed fonts are deleted.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: list[DictionaryObject]
    ) -> tuple[list[str], list[str]]:
        # Clean the XObjects of ``elt`` recursively and return the names of
        # the images and forms found in its resources.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2235 

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A plain ``bool`` is treated as
            ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits into the matching ObjectDeletionFlag bits;
    # both enums use the same member names for these three categories.
    deletion_flags = ObjectDeletionFlag.NONE
    for member_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member_name]:
            deletion_flags |= ObjectDeletionFlag[member_name]

    for page in self.pages:
        self.remove_objects_from_page(page, deletion_flags)

2259 

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """

    # Recursively loop through page objects to gather font info
    def collect_fonts(
        node: Any,
        found: Optional[dict[str, Any]] = None,
        resource_key: Optional[str] = None
    ) -> dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                short_name = base_font.lstrip("/").split("+")[-1]
                if short_name not in found:
                    found[short_name] = {
                        "normalized_font_name": short_name,
                        "resource_ids": [],
                    }
                known_ids = found[short_name]["resource_ids"]
                if resource_key not in known_ids:
                    known_ids.append(resource_key)
            for child_key in node:
                found = collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                found = collect_fonts(child, found)
        return found

    if not font_names:
        font_names = []

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if font_names:
            font_info = collect_fonts(page.get("/Resources"))
            # Add relevant resource names for removal
            for font_name in font_names:
                if font_name in font_info:
                    resource_ids_to_remove.extend(font_info[font_name]["resource_ids"])

        # An empty filter dictionary means "remove all text".
        text_filters = {"font_ids": resource_ids_to_remove} if font_names else {}
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2316 

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted, the
            border array ``[2, 2, 2]`` is written.

    Raises:
        ValueError: If ``rect`` is a string containing a non-numeric value.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse the documented "[ xLL yLL xUR yUR ]" form. A single
        # NumberObject cannot hold four coordinates.
        coordinates = (
            rect.replace("[", " ").replace("]", " ").replace(",", " ").split()
        )
        rect = RectangleObject([float(value) for value in coordinates])  # type: ignore
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2383 

# Layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout", "/SinglePage", "/OneColumn",
    "/TwoColumnLeft", "/TwoColumnRight",
    "/TwoPageLeft", "/TwoPageRight",
)

2393 

def _get_page_layout(self) -> Optional[LayoutType]:
    """
    Read the ``/PageLayout`` entry of the catalog.

    Returns:
        The stored layout, or ``None`` when the entry is missing.

    """
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)

2399 

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; a
    warning is logged first when it is not one of the valid layouts, but
    the value is stored anyway.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Join the names with ", " so the message lists them plainly
            # (same wording style as the page mode warning).
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2434 

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout.

    Public entry point; the work is delegated to ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    requested_layout = layout
    self._set_page_layout(requested_layout)

2462 

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading returns the stored layout (``None`` when unset); assigning
    stores a new layout.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same helper as set_page_layout().
    self._set_page_layout(layout)

2491 

# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone", "/UseOutlines", "/UseThumbs",
    "/FullScreen", "/UseOC", "/UseAttachments",
)

2500 

def _get_page_mode(self) -> Optional[PagemodeType]:
    """
    Read the ``/PageMode`` entry of the catalog.

    Returns:
        The stored page mode, or ``None`` when the entry is missing.

    """
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)

2506 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading returns the stored mode (``None`` when unset). Assigning a
    value that is not one of the valid modes logs a warning but still
    stores it.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    mode_name: NameObject
    if not isinstance(mode, NameObject):
        # Plain strings are validated (warning only) and then wrapped.
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode_name = NameObject(mode)
    else:
        mode_name = mode
    self._root_object.update({NameObject("/PageMode"): mode_name})

2541 

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a
            ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    # Make sure the page has an annotation array to append to.
    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = to_add.get("/Subtype") == "/Link" and "/Dest" in to_add
    if is_internal_link:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        link_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )  # the dict() copy is kept as-is from the original implementation
        dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = dest.dest_array

    annotation_ref = self._add_object(to_add)
    page.annotations.append(annotation_ref)

    # A popup is also registered on its parent annotation.
    if to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add:
        parent_annotation = cast(DictionaryObject, to_add["/Parent"].get_object())
        parent_annotation[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2595 

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` given as a name is converted; otherwise
    the ``/D`` entry of its ``/A`` action is converted when it is a name.

    Args:
        page: The page, or a reference to it, to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annotation_ref in page.get("/Annots", []):
        annotation = annotation_ref.get_object()
        destination = annotation.get("/Dest", None)
        action = annotation.get("/A", None)
        if isinstance(destination, NameObject):
            annotation[NameObject("/Dest")] = TextStringObject(destination)
            continue
        if action is None:
            continue
        action = action.get_object()
        destination = action.get("/D", None)
        if isinstance(destination, NameObject):
            action[NameObject("/D")] = TextStringObject(destination)
    return page

2622 

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader`` or an object
            providing ``seek`` and ``read``.

    Returns:
        A ``BytesIO`` holding the whole content and, for an encrypted
        ``PdfReader``, its encryption object (``None`` otherwise).

    Raises:
        NotImplementedError: If ``fileobj`` is none of the supported kinds.

    """
    # A string or Path is assumed to be a file location.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption or None
        source = fileobj.stream
        orig_tell = source.tell()
        source.seek(0)
        stream: IOBase = BytesIO(source.read())
        # reset the stream to its original location
        source.seek(orig_tell)
        return stream, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        # Generic file-like object: read it from the start.
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2658 

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection was passed in the ``outline_item`` slot: the
        # following positional arguments are shifted by one place.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_item = None
    # otherwise ``outline_item`` is forwarded unchanged

    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )

2726 

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge pages of ``fileobj`` into this document at a given page position.

    Steps performed, in order: clone the selected pages, re-create the
    named destinations that point to cloned pages, optionally add an
    outline item for the merged file, import the filtered outline,
    re-create annotations, register AcroForm fields and import article
    threads.

    Args:
        position: Page index at which the first page is inserted; each
            following page goes one index further. ``None`` appends.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file, or a reader.
        outline_item: Optional title of an outline item (aka 'bookmark')
            pointing to the first merged page.
        pages: A :class:`PageRange<pypdf.pagerange.PageRange>`,
            a ``(start, stop[, step])`` tuple or a list of pages
            restricting which source pages are merged.
        import_outline: Set to ``False`` to skip importing the source
            document's outline.
        excluded_fields: Fields/keys to be ignored while cloning;
            ``/Annots`` skips the annotations, ``/B`` skips the articles.

    Raises:
        TypeError: ``pages`` is neither ``None``, a ``PageRange``, a list
            nor a tuple.

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Parse the in-memory copy produced by _create_stream.
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()

    # Normalise ``pages`` into something iterable of indices / PageObjects.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the source page's object number to its clone in this writer.
    srcpages = {}
    for page in pages:
        pg = page if isinstance(page, PageObject) else reader.pages[page]
        assert pg.indirect_reference is not None
        # numbers in the exclude list identifies that the exclusion is
        # only applicable to 1st level of cloning
        first_level_exclusions = [*list(excluded_fields), 1, "/B", 1, "/Annots"]
        if position is None:
            cloned = self.add_page(pg, first_level_exclusions)  # type: ignore
        else:
            cloned = self.insert_page(pg, position, first_level_exclusions)  # type: ignore
            position += 1
        srcpages[pg.indirect_reference.idnum] = cloned
        cloned.original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    def _process_named_dests(dest: Any) -> None:
        """Re-create ``dest`` in the writer when its page has been cloned."""
        arr = dest.dest_array
        if "/Names" in self._root_object:
            title = dest["/Title"]
            names_root = cast(DictionaryObject, self._root_object["/Names"])
            dests_root = cast(
                DictionaryObject, names_root.get("/Dests", DictionaryObject())
            )
            if title in cast(list[Any], dests_root.get("/Names", DictionaryObject())):
                # already exists: should not duplicate it
                return
        target = dest["/Page"]
        if target is None or isinstance(target, NullObject):
            return
        if isinstance(target, int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[target]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                pass
            return
        if target.indirect_reference.idnum in srcpages:
            arr[NumberObject(0)] = srcpages[
                target.indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Parent under which the imported outline is attached.
    outline_item_typ: TreeObject
    if outline_item is None:
        outline_item_typ = self.get_outline_root()
    else:
        first_cloned_page = next(iter(srcpages.values()))
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                first_cloned_page.indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" in self._root_object:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        else:
            # Clone the source form dictionary without its field list;
            # the fields are added selectively below.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2916 

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The source thread's beads are walked through their ``/N`` links,
    starting at ``/F``. A new bead is created in the writer for every bead
    whose page (``/P``) has a clone in ``pages``; the new beads are linked
    through ``/N`` and ``/V`` and appended to the ``/B`` array of the
    cloned page.

    Args:
        thread: Thread dictionary of the source document.
        pages: Mapping from source page object number to the cloned page.
        reader: Source document the thread belongs to.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # First retained bead: it becomes the thread's /F entry.
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # Chain a new bead behind the previous one (/V back, /N forward).
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source ring: close the new ring.
            # When no bead was retained there is nothing to close; the
            # unguarded code failed here on ``None`` / an unbound name.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2981 

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern searched in the thread title (/I -> /Title)
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every bead listed in the ``/B`` array of the original of each
    cloned page, the owning thread (``/T``) is imported when it has no
    entry in the translation table of ``reader`` yet and its title matches
    ``fltr``.

    Args:
        fltr: Regular expression, either compiled or as a string. Any
            other value is replaced by the empty pattern.
        pages: Mapping from source page object number to the cloned page;
            each cloned page exposes its source as ``original_page``.
        reader: Source document the pages come from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")
    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thr = bead.get_object().get("/T")
            if thr is None:
                continue
            thr = thr.get_object()
            if thr.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thr.get("/I", {})).get("/Title", "")
            if fltr.search(title):
                self._add_articles_thread(thr, pages, reader)

3014 

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the clone of ``page``, if there is one.

    Args:
        page: Source page, given as a page dictionary (``/Type`` equal to
            ``/Page``) or as an indirect reference.
        pages: Mapping from source page object number to the cloned page.
        reader: Source document; not used by this method.

    Returns:
        The indirect reference of the cloned page, or ``None`` when
        ``page`` is null, of an unsupported kind, or not part of ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # Explicit instead of leaving the reference unbound and relying on
        # a blanket ``except`` to swallow the resulting UnboundLocalError.
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except (AttributeError, KeyError):
        # AttributeError: the page dictionary has no indirect reference;
        # KeyError: the page has not been cloned.
        return None

3031 

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations of a source page that remain valid in the writer.

    Annotations without any destination are cloned as is. Annotations
    with a destination (a direct ``/Dest`` entry or the ``/D`` entry of a
    ``/GoTo`` action on a link) are kept only when the destination is a
    named destination known to the writer, or an explicit destination
    whose page has been cloned; in the latter case the page reference is
    replaced by the cloned page.

    Args:
        annots: ``/Annots`` value of the source page.
        page: Cloned page; not used by this method.
        pages: Mapping from source page object number to the cloned page.
        reader: Source document.

    Returns:
        An array of the annotations to attach to the cloned page.

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist

    for entry in annots:
        ano = cast("DictionaryObject", entry.get_object())
        targets_through_goto_action = not (
            ano["/Subtype"] != "/Link"
            or "/A" not in ano
            or cast("DictionaryObject", ano["/A"])["/S"] != "/GoTo"
            or "/Dest" in ano
        )

        if targets_through_goto_action:
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if d is None or isinstance(d, NullObject):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
                continue
            d = cast("ArrayObject", d)
            p = self._get_cloned_page(d[0], pages, reader)
            if p is not None:
                anc = ano.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", anc["/A"])[
                    NameObject("/D")
                ] = ArrayObject([p, *d[1:]])
                outlist.append(self._add_object(anc))
            continue

        if "/Dest" not in ano:
            # No destination involved: keep the annotation as is.
            outlist.append(self._add_object(ano.clone(self)))
            continue

        d = ano["/Dest"]
        if isinstance(d, str):
            # it is a named dest
            if str(d) in self.get_named_dest_root():
                outlist.append(ano.clone(self).indirect_reference)
            continue
        d = cast("ArrayObject", d)
        p = self._get_cloned_page(d[0], pages, reader)
        if p is not None:
            anc = ano.clone(self, ignore_fields=("/Dest",))
            anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
            outlist.append(self._add_object(anc))
    return outlist

3088 

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is kept when its page has been cloned or when at least one of
    its descendants is kept. The kept descendants of an item are stored
    in its ``_filtered_children`` attribute.

    Args:
        node: Outline root, or the first item of a sibling chain
            (direct object, indirect reference or ``None``).
        pages: Mapping from source page object number to the cloned page.
        reader: Source document, used to build the destination objects.

    Returns:
        A list of destination objects.

    """
    new_outline = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # Container node: only its children are of interest.
        first_child = node.get("/First", None)
        if first_child is not None:
            new_outline.extend(
                self._get_filtered_outline(first_child.get_object(), pages, reader)
            )
        return new_outline

    # Regular item: walk the sibling chain through /Next.
    cloned_ref: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if cloned_ref is None:
            cloned_ref = NullObject()
        item[NameObject("/Page")] = cloned_ref
        item._filtered_children = (
            self._get_filtered_outline(node["/First"], pages, reader)
            if "/First" in node
            else []
        )
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            new_outline.append(item)
        node = node.get("/Next", None)
    return new_outline

3140 

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in the writer from a destination.

    Args:
        dest: Destination describing the source outline item; its
            ``node`` attribute, when set, provides the ``/A``, ``/F``
            and ``/C`` entries of the source item.

    Returns:
        The new outline item, already registered in the writer.

    """
    n_ol = TreeObject()
    self._add_object(n_ol)
    n_ol[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        # An action on the source item takes precedence over the
        # explicit destination array.
        if source_node is not None and "/A" in source_node:
            n_ol[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            n_ol[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        n_ol[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        n_ol[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return n_ol

3159 

def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert a filtered outline tree below ``parent``.

    Args:
        outlines: Items to insert; the children of each item are read
            from its ``_filtered_children`` attribute.
        parent: Outline node receiving the items.
        before: Passed to ``insert_child`` for the items of this level;
            nested levels always use ``None``.

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = dest.get("/Type", "") == "/Outlines" or "/Title" not in dest
        if is_container:
            # Nothing to create: the children attach directly to ``parent``.
            np = parent
        else:
            np = self._clone_outline(dest)
            cast(TreeObject, parent.get_object()).insert_child(np, before, self)
        self._insert_filtered_outline(dest._filtered_children, np, None)

3175 

def close(self) -> None:
    """Do nothing; present only for API harmonization."""
    return None

3179 

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position as a list of indexes.

    A node matches when its indirect reference or its ``/Title`` equals
    ``outline_item``. The search is depth first, following ``/First``
    for children and ``/Next`` for siblings.

    Args:
        outline_item: Value compared with each node's indirect reference
            and title.
        root: Node to start from; defaults to the outline root.

    Returns:
        One sibling index per level leading to the match (levels whose
        node has no ``/Title`` contribute no index), or ``None`` when
        nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        ):
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                own_level = [index] if "/Title" in node else []
                return own_level + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None

3208 

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is of an unsupported type.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # Tables are keyed by id() of the source document; a missing
        # entry is not an error.
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # The message was missing its f-string prefix and printed the
        # placeholder "{reader}" literally.
        raise Exception(f"invalid parameter {reader}")

3236 

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (inclusive) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               The default ``0`` (like ``None``) means "not specified":
               it passes validation although it is below 1.

    Raises:
        ValueError: Neither ``style`` nor ``prefix`` is given, the page
            range is invalid, or ``start`` is negative.

    """
    if style is None and prefix is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")
    # 0 and None both stand for "no explicit start" and are accepted.
    if start is not None and start != 0 and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3287 

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages, without validating the arguments.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (inclusive) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) or ``None``: no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is permitted by the signature and by set_page_label();
    # it used to be passed on to NumberObject(None).
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # No label tree yet: create one whose first range uses the
        # default decimal style.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    # presumably drops the labels starting inside the new range; see nums_clear_range
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Pages following the range fall back to the default label unless a
    # label already starts right after the range.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3352 

def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    The document is written to an in-memory buffer, which is returned
    under the ``application/pdf`` mime-type.

    Args:
        include: When given, only the mime-types it contains are kept.
        exclude: When given, the mime-types it contains are removed.

    Returns:
        A dictionary that maps a mime-type to its representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    buffer = BytesIO()
    self.write(buffer)
    bundle = {"application/pdf": buffer}

    if include is not None:
        # Filter representations based on include list
        bundle = {
            mime: payload for mime, payload in bundle.items() if mime in include
        }

    if exclude is not None:
        # Remove representations based on exclude list
        bundle = {
            mime: payload for mime, payload in bundle.items() if mime not in exclude
        }

    return bundle

3383 

3384 

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object, recursively.

    PDF objects are returned unchanged. Dictionaries become
    ``DictionaryObject`` (keys turned into names), strings starting with
    ``/`` become ``NameObject`` and other strings ``TextStringObject``,
    floats and ints become ``FloatObject`` and lists ``ArrayObject``.

    Raises:
        NotImplementedError: ``obj`` is of none of the supported types.

    """
    # Must stay first: PDF objects may themselves be dict/str/float/list
    # subclasses -- TODO confirm against the generic object classes.
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for key in obj:
            converted[NameObject(key)] = _pdf_objectify(obj[key])
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject([_pdf_objectify(item) for item in obj])
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3404 

3405 

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item dictionary.

    Args:
        action_ref: Stored as ``/A`` when not ``None``.
        title: Stored as ``/Title``.
        color: RGB triple or a string converted with ``hex_to_rgb``;
            stored as ``/C``. Falsy values add no ``/C`` entry.
        italic: Adds the italic flag to ``/F``.
        bold: Adds the bold flag to ``/F``.

    Returns:
        The new outline item; it is not registered in any writer here.

    """
    outline_item = TreeObject()
    if action_ref is not None:
        outline_item[NameObject("/A")] = action_ref

    title_entry = {NameObject("/Title"): create_string_object(title)}
    outline_item.update(title_entry)

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        color_entry = {NameObject("/C"): ArrayObject([FloatObject(c) for c in rgb])}
        outline_item.update(color_entry)

    # /F is only written when at least one style flag is requested.
    format_flag = 0
    if italic:
        format_flag += OutlineFontFlag.italic
    if bold:
        format_flag += OutlineFontFlag.bold
    if italic or bold:
        outline_item.update({NameObject("/F"): NumberObject(format_flag)})
    return outline_item

3435 

3436 

def generate_appearance_stream(
    txt: str,
    sel: list[str],
    da: str,
    font_full_rev: dict[str, bytes],
    rct: "RectangleObject",
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing the text of a form field.

    Args:
        txt: Text to show; ``\\n`` and ``\\r`` both separate lines.
        sel: Lines to highlight; a matching line gets a grey lined box.
        da: Default appearance string, emitted at the start of the text
            object and again after every highlight box.
        font_full_rev: Mapping from character to its encoded bytes;
            characters without an entry are encoded as UTF-16-BE.
        rct: Field rectangle; only ``width`` and ``height`` are read.
        font_height: Font size; the line pitch is ``1.4 * font_height``.
        y_offset: Vertical position of the first line.

    Returns:
        The content stream as bytes.

    """
    chunks = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            chunks.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            chunks.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            chunks.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: list[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        if any(len(c) >= 2 for c in enc_line):
            # Multi-byte codes present: emit the line as a hex string.
            chunks.append(b"<" + b"".join(enc_line).hex().encode() + b"> Tj\n")
        else:
            # Literal string: backslash and parentheses must be escaped,
            # otherwise text such as "a)" ends the string early.
            literal = (
                b"".join(enc_line)
                .replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            chunks.append(b"(" + literal + b") Tj\n")
    chunks.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(chunks)