Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1466 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import decimal 

31import enum 

32import hashlib 

33import re 

34import struct 

35import uuid 

36from io import BytesIO, FileIO, IOBase 

37from itertools import compress 

38from pathlib import Path 

39from types import TracebackType 

40from typing import ( 

41 IO, 

42 Any, 

43 Callable, 

44 Dict, 

45 Iterable, 

46 List, 

47 Optional, 

48 Pattern, 

49 Tuple, 

50 Type, 

51 Union, 

52 cast, 

53) 

54 

55from ._cmap import _default_fonts_space_width, build_char_map_from_dict 

56from ._doc_common import DocumentInformation, PdfDocCommon 

57from ._encryption import EncryptAlgorithm, Encryption 

58from ._page import PageObject, Transformation 

59from ._page_labels import nums_clear_range, nums_insert, nums_next 

60from ._reader import PdfReader 

61from ._utils import ( 

62 StrByteType, 

63 StreamType, 

64 _get_max_pdf_version_header, 

65 deprecate, 

66 deprecate_no_replacement, 

67 deprecation_with_replacement, 

68 logger_warning, 

69) 

70from .constants import AnnotationDictionaryAttributes as AA 

71from .constants import CatalogAttributes as CA 

72from .constants import ( 

73 CatalogDictionary, 

74 FileSpecificationDictionaryEntries, 

75 GoToActionArguments, 

76 ImageType, 

77 InteractiveFormDictEntries, 

78 OutlineFontFlag, 

79 PageLabelStyle, 

80 TypFitArguments, 

81 UserAccessPermissions, 

82) 

83from .constants import Core as CO 

84from .constants import FieldDictionaryAttributes as FA 

85from .constants import PageAttributes as PG 

86from .constants import PagesAttributes as PA 

87from .constants import TrailerKeys as TK 

88from .errors import PyPdfError 

89from .generic import ( 

90 PAGE_FIT, 

91 ArrayObject, 

92 BooleanObject, 

93 ByteStringObject, 

94 ContentStream, 

95 DecodedStreamObject, 

96 Destination, 

97 DictionaryObject, 

98 Fit, 

99 FloatObject, 

100 IndirectObject, 

101 NameObject, 

102 NullObject, 

103 NumberObject, 

104 PdfObject, 

105 RectangleObject, 

106 StreamObject, 

107 TextStringObject, 

108 TreeObject, 

109 ViewerPreferences, 

110 create_string_object, 

111 hex_to_rgb, 

112 is_null_or_none, 

113) 

114from .pagerange import PageRange, PageRangeSpec 

115from .types import ( 

116 AnnotationSubtype, 

117 BorderArrayType, 

118 LayoutType, 

119 OutlineItemType, 

120 OutlineType, 

121 PagemodeType, 

122) 

123from .xmp import XmpInformation 

124 

125ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all() 

126DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 

127 

128 

class ObjectDeletionFlag(enum.IntFlag):
    """
    Integer bit flags, one bit per category of PDF content.

    Members can be combined with ``|``; ``IMAGES`` is the union of the
    three image-related flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite value covering every image flag above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

140 

141 

def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Compute the MD5 hex digest of a stream, reading it chunk by chunk.

    Reading starts at the stream's current position and continues until
    ``read`` returns ``b""``.

    Args:
        stream: Binary stream to consume.
        blocksize: Number of bytes requested on every read.

    Returns:
        The hexadecimal MD5 digest of all bytes read.

    """
    digest = hashlib.md5()
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()

147 

148 

149class PdfWriter(PdfDocCommon): 

150 """ 

151 Write a PDF file out, given pages produced by another class or through 

152 cloning a PDF file during initialization. 

153 

154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`. 

155 

156 Args: 

157 clone_from: identical to fileobj (for compatibility) 

158 

159 incremental: If true, loads the document and set the PdfWriter in incremental mode. 

160 

161 When writing incrementally, the original document is written first and new/modified 

162 content is appended. To be used for signed document/forms to keep signature valid. 

163 

164 full: If true, loads all the objects (always full if incremental = True). 

165 This parameter may allow loading large PDFs. 

166 

167 """ 

168 

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Create a writer that is either empty or cloned from an existing PDF.

    Args:
        fileobj: Source to clone from, or the empty string (default) for
            a new, empty document.
        clone_from: Alternative source to clone from.
        incremental: Load the source through a PdfReader and keep the
            writer in incremental mode.
        full: Load the source through the same path as ``incremental``;
            the ``incremental`` attribute is reset to False afterwards
            unless ``incremental`` was also requested.

    Raises:
        PyPdfError: In incremental/full mode, if ``fileobj`` cannot be
            turned into a PdfReader.

    """
    # `full` reuses the incremental loading branch below; the attribute is
    # corrected at the end of this method when only `full` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Declared only; assigned in the non-incremental branch below.
    # presumably set during cloning in the incremental case — TODO confirm
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if self.incremental:
        # Normalize the input step by step: path -> in-memory bytes ->
        # PdfReader. Anything that is not a PdfReader afterwards is rejected.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header and an /Info dictionary that
        # only names the producer.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which of the two arguments (if any) is the source to clone.
        # An empty-string fileobj, or an explicit clone_from next to a
        # str/Path/IO/BytesIO fileobj, means clone_from wins unchanged.
        # NOTE(review): with fileobj=None this guard is skipped and the
        # function ends up returning fileobj (None), so a supplied
        # clone_from appears to be ignored — confirm whether intended.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A path that does not exist or points to an empty file is not cloned.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        # An empty stream is not cloned; the stream position is restored
        # after probing its size.
        if isinstance(fileobj, (IOBase, BytesIO)):
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        # Wrap anything that is not already a reader, then copy the document.
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # Nothing to clone: register the empty page tree and a catalog
        # pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # `full` alone only borrowed the incremental loading path above.
    if full and not incremental:
        self.incremental = False
    # Replace text-string ID entries by byte strings built from their
    # original bytes.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

293 

294 # for commonality 

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property mirroring the reader-side attribute.

    The writer always reports ``False`` here, whatever encryption settings
    have been configured on it.
    """
    return False

304 

@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the dictionary stored as ``_root_object``.

    Note:
        Recommended only for read access.

    """
    catalog = self._root_object
    return catalog

315 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfReader.

    Returns:
        The resolved /Info dictionary, or None when no info object is set.

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace the content of "/Info", or drop it when ``value`` is None.

    A non-None value is copied into the writer's own info dictionary
    (created on demand); the passed object itself is not stored.
    """
    if value is not None:
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        target = cast(DictionaryObject, self._info_obj.get_object())
        target.clear()
        target.update(cast(DictionaryObject, value.get_object()))
        return

    # Removal: blank the slot of the current info object, if there is one.
    try:
        slot = self._info_obj.indirect_reference.idnum - 1  # type: ignore
        self._objects[slot] = None
    except (KeyError, AttributeError):
        pass
    self._info_obj = None

345 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data read from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Store XMP data under "/Metadata" of the root object.

    Passing None removes an existing "/Metadata" entry.
    """
    if value is not None:
        self.root_object[NameObject("/Metadata")] = value
    elif "/Metadata" in self.root_object:
        del self.root_object["/Metadata"]

    # Kept from the original: the property is read again after the update.
    return self.root_object.xmp_metadata  # type: ignore

361 

@property
def with_as_usage(self) -> bool:
    """Deprecated read access to the ``_with_as_usage`` flag."""
    deprecate_no_replacement("with_as_usage", "6.0")
    flag = self._with_as_usage
    return flag

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated write access to the ``_with_as_usage`` flag."""
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value

371 

def __enter__(self) -> "PdfWriter":
    """
    Enter the ``with`` block and remember that the writer is used this way.

    The writer is re-initialized with default arguments; the cloned state
    is carried over and the originally given file object becomes the
    output target used on exit.
    """
    was_cloned: bool = self._cloned
    output_target = self.temp_fileobj
    self.__init__()  # type: ignore
    self._with_as_usage = True
    self._cloned = was_cloned
    self.fileobj = output_target  # type: ignore
    return self

381 

def __exit__(
    self,
    exc_type: Optional[Type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Write the document to ``fileobj`` when leaving the ``with`` block.

    Nothing is written if no output target is set or if the writer was
    created by cloning.
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)

391 

@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    The value should look like ``'%PDF-1.5'``. It is recommended to pick
    the lowest version that supports all features used within the PDF
    file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding it first when given as str."""
    self._header = (
        new_header.encode() if isinstance(new_header, str) else new_header
    )

410 

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    An object that already carries a reference into this writer is not
    registered again; its existing reference is returned.

    Args:
        obj: The object to register.

    Returns:
        The indirect reference under which the object is stored.

    """
    already_ours = (
        getattr(obj, "indirect_reference", None) is not None
        and obj.indirect_reference.pdf == self  # type: ignore
    )
    if already_ours:
        return obj.indirect_reference  # type: ignore

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        contents = obj.get(PG.CONTENTS, None)
        if isinstance(contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference

425 

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an object stored in this writer.

    Args:
        indirect_reference: Either the 1-based object number or an
            indirect reference belonging to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the reference belongs to another document.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    elif indirect_reference.pdf != self:
        raise ValueError("PDF must be self")
    else:
        idnum = indirect_reference.idnum
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj

438 

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot of an already existing indirect object.

    The generation number of the object currently in the slot is kept.
    An object that belongs to another document is cloned into this
    writer first.

    Args:
        indirect_reference: 1-based object number or a reference
            belonging to this writer.
        obj: The replacement object.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: If the reference belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore

    is_foreign = (
        getattr(obj, "indirect_reference", None) is not None
        and obj.indirect_reference.pdf != self  # type: ignore
    )
    if is_foreign:
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

459 

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and hook the clone into the page tree.

    Args:
        page: The page to add; must be a PageObject of type /Page.
        index: Position in the flattened page list.
        excluded_keys: Keys not to copy while cloning; the parent entry
            and "/StructParents" are always excluded.

    Returns:
        The cloned page that is now part of this document.

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If walking up the parent chain exceeds 1000 steps.

    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"

    source_page = page
    skipped_keys = [*excluded_keys, PA.PARENT, "/StructParents"]

    # Acrobat does not accept two indirect references pointing on the same
    # page; to allow several copies of one page, a new page dictionary is
    # created each time, while the objects below it (content included)
    # are not duplicated.
    try:  # delete an already existing page
        source_ref = source_page.indirect_reference
        del self._id_translated[id(source_ref.pdf)][source_ref.idnum]  # type: ignore
    except Exception:
        pass
    page = cast(
        "PageObject", source_page.clone(self, False, skipped_keys).get_object()
    )

    # Raise our header version if the source document needs a newer one.
    if source_page.pdf is not None:
        other = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx >= 0:
        kids.insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)

    # Increment the page count of the node and of every ancestor.
    steps = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        steps += 1
        if steps > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page

506 

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag of the document's AcroForm.

    The flag tells the PDF viewer whether it should generate the
    appearance of form fields itself or use the embedded appearance.
    An AcroForm dictionary is created when the catalog has none.
    Failures are logged as a warning and not raised.

    Args:
        state: The value to store for the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # Make sure there is an AcroForm tree to write into.
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

539 

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new viewer preferences object and attach it to the catalog.

    Returns:
        The newly created ViewerPreferences object.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences

546 

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys passed on unchanged to the internal
            page-adding routine.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)

571 

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or past the end
            appends the page.
        excluded_keys: Keys passed on unchanged to the internal
            page-adding routine.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
        if index < 0:
            raise ValueError("Invalid index value")
    if index >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, index, excluded_keys)

599 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page an indirect reference points to.

    Args:
        indirect_reference: Object number or indirect reference; an int
            is treated as an object number of this writer with
            generation 0.

    Returns:
        The page number, or None if the reference is null/None or does
        not resolve to a page.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

623 

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))

648 

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and ``index`` refers to an existing page,
    the size of that page is used. Otherwise the missing size is left to
    ``PageObject.create_blank_page``, which falls back to the size of
    the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the page object returned by
        :meth:`insert_page`.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Only look at the page currently at `index` when it exists. The former
    # condition `width is None or (height is None and index < n)` indexed
    # self.pages even for an out-of-range index (e.g. on an empty writer),
    # failing on the lookup instead of with the documented error.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # Return what insert_page hands back (the page that is part of the
    # document), consistent with add_blank_page.
    return self.insert_page(page, index)

682 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the open destination as provided by the parent class."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the catalog.

    Args:
        dest: None removes the entry; a str is stored as a text string;
            a Destination stores its destination array; a PageObject
            stores a page-fit destination named "Opening" pointing at
            that page. Any other type leaves the catalog untouched.

    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[NameObject("/OpenAction")] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        # A page without a reference is represented by a null object.
        page_ref = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        opening = Destination("Opening", page_ref, PAGE_FIT)
        self._root_object[NameObject("/OpenAction")] = opening.dest_array

708 

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    Each call adds one more named script to the catalog's
    /Names → /JavaScript name list.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    # Names / JavaScript preferred to be able to add multiple scripts
    catalog = self._root_object
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, js_tree["/Names"])

    # The script needs a name in the PDF file, but it can be anything,
    # so a random UUID is used.
    script_name = create_string_object(str(uuid.uuid4()))
    js_list.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    js_list.append(self._add_object(action))

743 

def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file. A str is encoded as latin-1.

    """
    # Three pieces are required:
    # * the stream holding the file's data,
    # * the /Filespec dictionary pointing at that stream,
    # * the file's name, registered in the Catalog.

    # 1) The embedded file stream, e.g.
    #    8 0 obj
    #    << /Length 12 /Type /EmbeddedFile >>
    #    stream
    #    Hello world!
    #    endstream
    #    endobj
    payload = data.encode("latin-1") if isinstance(data, str) else data
    file_entry = DecodedStreamObject()
    file_entry.set_data(payload)
    file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})

    # 2) The Filespec dictionary, e.g.
    #    7 0 obj
    #    << /Type /Filespec /F (hello.txt) /EF << /F 8 0 R >> >>
    #    endobj
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): self._add_object(file_entry)})

    filespec = DictionaryObject()
    filespec.update(
        {
            NameObject(PA.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
                filename
            ),  # Perhaps also try TextStringObject
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # 3) The Catalog entry referencing the Filespec, e.g.
    #    1 0 obj
    #    << /Type /Catalog /Outlines 2 0 R /Pages 3 0 R
    #       /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> >>
    #    endobj
    if CA.NAMES not in self._root_object:
        self._root_object[NameObject(CA.NAMES)] = self._add_object(
            DictionaryObject()
        )
    names = cast(DictionaryObject, self._root_object[CA.NAMES])
    if "/EmbeddedFiles" in names:
        embedded_files = cast(DictionaryObject, names["/EmbeddedFiles"])
    else:
        embedded_files = DictionaryObject({NameObject(CA.NAMES): ArrayObject()})
        names[NameObject("/EmbeddedFiles")] = self._add_object(embedded_files)

    # The name list holds alternating (name, filespec) entries.
    name_list = cast(ArrayObject, embedded_files[CA.NAMES])
    name_list.extend([create_string_object(filename), filespec])

835 

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of a reader to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: a PdfReader object whose pages are added to this
            writer one by one.
        after_page_append:
            Optional callback invoked after each page has been added.
            Its single argument is the page as it was added to this
            writer.

    """
    page_total = len(reader.pages)
    for position in range(page_total):
        appended_page = self.add_page(reader.pages[position])
        # Notify the caller, handing over the writer-side page.
        if callable(after_page_append):
            after_page_append(appended_page)

867 

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) to the content of a page, in place.

    * No /Contents yet: a new stream holding the data becomes /Contents.
    * /Contents is an array: a new stream with the data is appended to it.
    * /Contents is a single stream: it is replaced by a new stream made
      of the old data, a newline and the new data.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    if NameObject("/Contents") not in page:
        # Empty page: the new data is the whole content.
        only_stream = StreamObject()
        only_stream.set_data(new_content_data)
        page[NameObject("/Contents")] = self._add_object(only_stream)
        return

    # Resolve the existing page content, see "PDF Explained" by
    # John Whitington:
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    existing_content = page[NameObject("/Contents")].get_object()

    if isinstance(existing_content, ArrayObject):
        # Extend the array by one stream carrying the new data.
        appended_stream = StreamObject()
        appended_stream.set_data(new_content_data)
        existing_content.append(self._add_object(appended_stream))
        page[NameObject("/Contents")] = self._add_object(existing_content)
    if isinstance(existing_content, StreamObject):
        # Build a single stream out of old and new data.
        combined_stream = StreamObject()
        combined_stream.set_data(
            existing_content.get_data() + b"\n" + new_content_data
        )
        page[NameObject("/Contents")] = self._add_object(combined_stream)

907 

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Draw an appearance stream on a page by registering it as an XObject.

    The stream is stored in the page resources as ``/Fm_<object_name>``
    and drawing commands translated by the given offsets are added to
    the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    # Make the font available on the page, keyed by its base font name.
    if font_res is not None:
        font_name = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res

    # Always register the stream with this writer so that the reference
    # used below is one managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    # Save state, move to the offset, paint the XObject, restore state.
    placement = Transformation().translate(x_offset, y_offset)
    drawing_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, drawing_commands)

955 

def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Rebuild the normal appearance stream (``/AP`` ``/N``) of a widget.

    Args:
        page: Page holding the widget; only used when ``flatten`` is set.
        field: Field dictionary from which flags, type and ``/V`` are read.
        annotation: Widget annotation whose appearance is regenerated.
        font_name: Font resource name overriding the one in the default
            appearance string; empty keeps the one found there.
        font_size: Font size override; a negative value keeps the size
            found in the default appearance string.
        flatten: When True, the generated appearance is additionally drawn
            into the page content.
    """
    # The appearance box is the widget rectangle moved to the origin.
    _rct = cast(RectangleObject, annotation[AA.Rect])
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    acro_form = cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM])

    # Default appearance string: inherited from the annotation, else AcroForm.
    da = annotation.get_inherited(AA.DA, acro_form.get(AA.DA, None))
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [token for token in font_properties if token != ""]
    # The two tokens before the "Tf" operator are the font name and its size.
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    if font_size >= 0:
        font_height = font_size
    else:
        font_height = float(font_properties[font_properties.index("Tf") - 1])
    if font_height == 0:
        # A size of 0 means "auto": pick a height from the field kind / box.
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Font resources: first the (inherited) /DR of the annotation ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited("/DR", acro_form.get("/DR", DictionaryObject())),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ... or the AcroForm dictionary
        dr = cast(Dict[Any, Any], acro_form.get("/DR", {}))
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)

    font_full_rev: Dict[str, bytes]
    if is_null_or_none(font_res):
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}
    else:
        font_res = cast(DictionaryObject, font_res.get_object())
        font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        font_map.pop(-1, None)  # remove width stored in -1 key
        if isinstance(font_encoding, str):
            font_full_rev = {
                char: code.encode(font_encoding) for code, char in font_map.items()
            }
        else:
            font_encoding_rev = {
                char: bytes((code,)) for code, char in font_encoding.items()
            }
            font_full_rev = dict(font_encoding_rev)
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)

    # Field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and (field_flags & FA.FfBits.Combo) == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Appearance form XObject
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over every entry of the previous /N except the structural ones.
        previous_n = cast(DictionaryObject, annotation[AA.AP]).get("/N", {})
        for k, v in previous_n.items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # NOTE(review): the lookup above tests is_null_or_none(font_res) while
    # the checks below only test `is not None` - confirm a NullObject font
    # entry cannot reach this point.
    if font_res is not None:
        font_entry = getattr(font_res, "indirect_reference", font_res)
        dct[NameObject("/Resources")] = DictionaryObject(
            {NameObject("/Font"): DictionaryObject({NameObject(font_name): font_entry})}
        )

    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists: overwrite the object stored under the same id
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)

1109 

# "No flags" value, used as the default of ``flags`` below.
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Write form field values into the widgets of one or several pages.

    Field texts and values are copied from ``fields`` to the page.
    If a widget links to a parent object, the value is stored on the parent.

    Args:
        page: `PageObject` - a page of **this writer** whose annotations
                and field data are updated.
              `List[PageObject]` - the pages to process.
              `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: The writer has no /AcroForm or the /AcroForm has no /Fields.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)

    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        # Recurse page by page; auto_regenerate=None keeps the flag set above.
        for single_page in page:
            if PG.ANNOTS in single_page:  # just to prevent warnings
                self.update_page_form_field_values(
                    single_page, fields, flags, None, flatten=flatten
                )
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is its own field dictionary.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            if (
                self._get_qualified_field_name(parent_annotation) != field
                and parent_annotation.get("/T", None) != field
            ):
                continue
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    parent_annotation[NameObject(FA.V)] = ArrayObject(
                        TextStringObject(item) for item in value
                    )
                elif isinstance(value, tuple):
                    # NOTE(review): this branch stores /V on the widget while
                    # the other two store it on the parent - confirm intended.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)

            field_type = parent_annotation.get(FA.FT)
            if field_type == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                state = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if state not in normal_ap:
                    state = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(state)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = state
                annotation[NameObject(FA.V)] = state
                if flatten and appearance_stream_obj is not None:
                    # The appearance stream is copied as a whole; no font
                    # resources are passed along.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif field_type == "/Tx" or field_type == "/Ch":
                # textbox / choice
                if isinstance(value, tuple):
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif annotation.get(FA.FT) == "/Sig":  # deprecated # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1233 

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Find orphan field widgets in the page annotations and register them
    in the AcroForm ``/Fields`` array.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created when
    they do not exist yet.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached.extend(self.reattach_fields(single_page))
        return reattached

    try:
        af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        af = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = af
    try:
        fields = cast(ArrayObject, af[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        af[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached

    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        if not (widget.get("/Subtype", "") == "/Widget" and "/FT" in widget):
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # A direct annotation is first registered with the writer so
            # that it can be referenced from /Fields.
            annotations[position] = self._add_object(widget)
        fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1283 

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Clone the document root of ``reader``, with all its sub-elements
    (pages, threads, outlines,...), into this writer. Use ``append`` for a
    partial insertion.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    declared_size = cast(int, reader.trailer["/Size"])
    if not self.incremental:
        self._objects.clear()
    else:
        # Incremental mode keeps the reader's numbering: one slot per object.
        self._objects = [None] * (declared_size - 1)
        for slot in range(len(self._objects)):
            source = reader.get_object(slot + 1)
            if source is not None:
                self._objects[slot] = source.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if obj is None else obj.hash_bin() for obj in self._objects
        ]
    self._flatten()
    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        self._replace_object(
            cast(IndirectObject, flat_page.indirect_reference).idnum, flat_page
        )
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Outside incremental mode every page becomes a direct kid of the page tree root.
        page_tree = cast(DictionaryObject, self._pages.get_object())
        page_tree[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )

1322 

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: the '/Root', '/Info' and '/ID'
    sections of the pdf are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once for every page of the cloned document;
            its single parameter is a reference to that page.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info as part of the baseline.
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = self._info.hash_bin()
        else:
            self._info_obj = self._add_object(
                DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            )
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        # Nothing clonable as /ID on the reader: leave the writer's value alone.
        pass

    if callable(after_page_append):
        page_tree = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, page_tree["/Kids"]):
            after_page_append(kid.get_object())

1370 

def _compute_document_identifier(self) -> ByteStringObject:
    """Return a checksum of the serialized PDF structure as a byte string object."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1376 

def generate_file_identifiers(self) -> None:
    """
    Generate the file identifier pair for the PDF that will be written.

    Only uniqueness matters here; reproducibility is not required.
    When a file is first written, both identifiers shall be set to the same
    value. Matching identifiers on both positions very likely denote the
    correct and unchanged file; a match on the first identifier only denotes
    a different version of the correct file.
    see §14.4 "File Identifiers".
    """
    if not self._ID:
        # First write: both identifiers are the same freshly computed value.
        first_id = self._compute_document_identifier()
        second_id = first_id
    else:
        # Keep the existing first identifier, refresh the second one.
        first_id = self._ID[0]
        second_id = self._compute_document_identifier()
    self._ID = ArrayObject((first_id, second_id))

1396 

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        try:
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError as exc:
            # Chain explicitly so the traceback shows a translated error
            # instead of "another exception occurred during handling".
            raise ValueError(f"Algorithm '{algorithm}' NOT supported") from exc
    else:
        alg = EncryptAlgorithm.RC4_128
        if not use_128bit:
            alg = EncryptAlgorithm.RC4_40
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    entry = self._encryption.write_entry(user_password, owner_password)
    if self._encrypt_entry:
        # `encrypt` was called before: reuse the object slot of the old entry.
        assert self._encrypt_entry.indirect_reference is not None
        entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry

1455 

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document into an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when objects were added or modified;
    otherwise the body, the cross-reference table and the trailer are
    written.

    Args:
        stream: The stream to write to; a warning is logged when it exposes
            a ``mode`` attribute that does not contain ``"b"``.
    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    # deprecated to be removed in pypdf 6.0.0 :
    # if not self._root:
    #   self._root = self._add_object(self._root_object)
    # self._sweep_indirect_references(self._root)

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    if self.list_objects_in_increment():
        self._write_increment(stream)  # writes objs, xref stream and startxref

1479 

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method, and
        the stream written to. A file opened by this method is returned
        closed; a caller-supplied stream is flushed and left open.

    Raises:
        ValueError: ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        # This method owns the handle: the context manager closes it even
        # when serialization fails, so a failed write cannot leak it.
        with FileIO(stream, "wb") as owned_stream:
            self.write_stream(owned_stream)
        return True, owned_stream

    self.write_stream(stream)
    stream.flush()
    return False, stream

1511 

def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.
    List the new or modified objects that will be written in the increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline_count = len(self._original_hash)
    changed = []
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        # New when beyond the recorded baseline, modified when the hash differs.
        if index >= baseline_count or obj.hash_bin() != self._original_hash[index]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1535 

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an increment to ``stream``: every new or modified object,
    followed by a cross-reference stream, ``startxref`` and the EOF marker.

    Args:
        stream: The stream to append to; must support ``tell`` and ``write``.
    """
    offsets = {}
    index_ranges = []
    run_start = -1
    run_end = -2
    baseline_count = len(self._original_hash)
    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            continue
        if idnum <= baseline_count and obj.hash_bin() == self._original_hash[idnum - 1]:
            continue  # neither new nor modified
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        offsets[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # encryption is not operational
        # if self._encryption and obj != self._encrypt_entry:
        #     obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: consecutive ids are grouped into [start, count] runs
        if idnum != run_end:
            if run_start > 0:
                index_ranges.append([run_start, run_end - run_start])
            run_start = idnum
        run_end = idnum + 1
    assert run_start > 0, "for pytest only"
    index_ranges.append([run_start, run_end - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(number) for run in index_ranges for number in run]
        ),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is only listed when it is new or differs from the baseline.
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    xr.set_data(
        b"".join(struct.pack(b">BIB", 1, offset, 0) for offset in offsets.values())
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1606 

def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
    """
    Write the PDF header and every object of the writer to ``stream``.

    Args:
        stream: The stream to write to; must support ``tell`` and ``write``.

    Returns:
        A tuple ``(object_positions, free_objects)``: the stream offset of
        each object slot (``-1`` for an empty slot), and the ids of the
        empty slots followed by a final ``0``.
    """
    positions = []
    free_ids = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            positions.append(-1)
            free_ids.append(idnum)
            continue
        positions.append(stream.tell())
        stream.write(f"{idnum} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and obj != self._encrypt_entry:
            obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")
    free_ids.append(0)  # add 0 to loop in accordance with specification
    return positions, free_ids

1626 

def _write_xref_table(
    self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
    """
    Write the classic cross-reference table to ``stream``.

    Args:
        stream: The stream to write to; must support ``tell`` and ``write``.
        object_positions: Offset of every object slot; a value that is not
            positive marks a free slot.
        free_objects: Ids forming the chain of free entries; the first one
            goes into the entry of object 0.

    Returns:
        The stream position at which the table starts.
    """
    table_start = stream.tell()
    stream.write(b"xref\n")
    stream.write(f"0 {len(self._objects) + 1}\n".encode())
    # Entry of object 0: head of the free list, generation 65535.
    stream.write(f"{free_objects[0]:0>10} 65535 f \n".encode())
    next_free = 1
    for position in object_positions:
        if position > 0:
            entry = f"{position:0>10} 00000 n \n"
        else:
            # A free entry points to the next free id of the chain.
            entry = f"{free_objects[next_free]:0>10} 00001 f \n"
            next_free += 1
        stream.write(entry.encode())
    return table_start

1642 

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer, ``startxref`` and the EOF marker to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: The stream to write to.
        xref_location: Position of the cross-reference table, written after
            ``startxref``.
    """
    stream.write(b"trailer\n")
    entries = {
        NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(TK.ROOT): self.root_object.indirect_reference,
    }
    trailer = DictionaryObject(entries)
    # The optional entries are only written when the writer holds them.
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1666 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Some PDF files carry (XMP) metadata streams instead of document
    information dictionaries; those streams are not accessed by this
    property, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Replace, not merge: drop the current entries before adding the new ones.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1694 

def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is stored as a string object; the document information
    dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    converted = {}
    for name, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        converted[NameObject(name)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1714 

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk a dictionary/array, flag every referenced object as used and
        # redirect references to merged duplicates onto the surviving object.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # Honor the flag: without it, unreferenced objects are left untouched.
    if not remove_orphans:
        return

    # The objects referenced from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # The writer may hold no /Info at all (metadata set to None).
    info = self._info
    if not is_null_or_none(info):
        orphans[info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        # No /ID, or an /ID without an indirect reference: nothing to protect.
        pass
    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1789 

def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated: no sweep is performed any more.

    The only action is a call to ``deprecate`` with the removal notice;
    ``root`` is not inspected.

    Args:
        root: Unused.

    """
    removal_notice = (
        "_sweep_indirect_references has been removed, "
        "please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)

1824 

def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated: no resolution is performed any more.

    ``data`` is not inspected. The removal notice is passed to
    ``deprecate`` and a new ``IndirectObject(0, 0, self)`` is returned.

    Args:
        data: Unused.

    Returns:
        ``IndirectObject(0, 0, self)``.

    """
    removal_notice = (
        "_resolve_indirect_object has been removed, "
        "please report to dev team if this warning is observed"
    )
    deprecate(removal_notice)
    placeholder = IndirectObject(0, 0, self)
    return placeholder

1857 

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Build the indirect reference under which ``obj`` is stored in this writer.

    Args:
        obj: An object looked up in ``self._objects``.

    Returns:
        ``IndirectObject(position + 1, 0, self)`` where ``position`` is the
        index of ``obj`` in ``self._objects``.

    """
    # Object numbers are 1-based while ``_objects`` is indexed from 0.
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to an equal object.
    assert reference.get_object() == obj
    return reference

1863 

def get_outline_root(self) -> TreeObject:
    """
    Return the outline tree referenced by the catalog, creating it if needed.

    When the catalog has no ``CO.OUTLINES`` entry, an empty ``TreeObject``
    is registered with ``_add_object`` and linked from the catalog. When
    the existing entry is not a ``TreeObject``, it is wrapped in one and
    stored under the same object number via ``_replace_object``.

    Returns:
        The outline root as a ``TreeObject``.

    """
    if CO.OUTLINES not in self._root_object:
        new_root = TreeObject()
        new_root.update({})
        self._root_object[NameObject(CO.OUTLINES)] = self._add_object(new_root)
        return new_root

    # Entries in the catalog dictionary
    existing = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(existing, TreeObject):
        converted = TreeObject(existing)
        self._replace_object(existing.indirect_reference.idnum, converted)
        existing = converted
    # Sanity check: the tree must be retrievable through its object number.
    root_ref = IndirectObject(self._objects.index(existing) + 1, 0, self)
    assert root_ref.get_object() == existing
    return existing

1882 

def get_threads_root(self) -> ArrayObject:
    """
    Return the list of threads, creating an empty one if it is missing.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    catalog = self._root_object
    if CO.THREADS not in catalog:
        # First access: attach a new empty array to the catalog.
        fresh = ArrayObject()
        catalog[NameObject(CO.THREADS)] = fresh
        return fresh
    # Entries in the catalog dictionary
    return cast(ArrayObject, catalog[CO.THREADS])

1901 

@property
def threads(self) -> ArrayObject:
    """
    Read-only property giving the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list

1913 

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The outline item to insert, or a page. A page is
            first wrapped in a ``Destination`` titled ``page #N`` (N being
            its ``page_number``) built with ``Fit.fit()``.
        parent: Outline node receiving the new child. Defaults to the
            outline root.
        before: Item whose ``indirect_reference`` is passed to
            ``insert_child`` as insertion point; ``None`` is forwarded
            unchanged.
        is_open: Stored on the item under ``/%is_open%``. When false, a
            callback returning 0 is passed to ``insert_child`` instead of
            the item's ``inc_parent_counter_outline``.

    Returns:
        The indirect reference of the inserted item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open so they are honoured for pages too
        # (they used to be dropped on this path).
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1951 

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a mapping.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then handed to :meth:`add_outline_item_destination`.

    Args:
        outline_item: Entries of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The value returned by :meth:`add_outline_item_destination`.

    """
    item_node = TreeObject()
    item_node.update(outline_item)
    # NOTE: a former special case that copied an "/A" entry into a separate
    # indirect action dictionary is disabled; "/A" is copied like any other
    # entry by the update above.
    return self.add_outline_item_destination(item_node, parent, before, is_open)

1974 

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to. A page
            object or an indirect reference is accepted as well; with
            ``None`` the item is created without an action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination` as the
            insertion point.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    # Start from None so that an unsupported ``page_number`` type reaches the
    # warning below instead of failing with an unbound local.
    page_ref: Union[None, NullObject, IndirectObject, NumberObject] = None
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Page does not exist (yet): keep the plain number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

2052 

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

2057 

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register ``destination`` under ``title`` in the named-destination list.

    The list returned by ``get_named_dest_root`` is read as a flat sequence
    of (name, destination) pairs. The new pair is inserted in front of the
    first pair whose name compares greater than ``title``, or appended when
    there is none.

    Args:
        title: Name of the destination.
        destination: The destination to store next to the name.

    """
    names = self.get_named_dest_root()
    # Names sit at the even positions, their destinations right after them.
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

2071 

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    The ``dest_array`` of ``page_destination`` is added to the writer and
    stored under the object's ``/Title`` entry via
    :meth:`add_named_destination_array`.

    Args:
        page_destination: Object providing ``dest_array`` and a ``/Title``
            entry.

    Returns:
        The indirect reference of the added destination array.

    """
    array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, array_ref)
    return array_ref

2082 

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a ``/GoTo`` action for a page and register it as named destination.

    Args:
        title: Name of the destination; converted to a ``TextStringObject``
            when it is not one already.
        page_number: Index into the ``PA.KIDS`` entry of the object that
            ``self._pages`` refers to.

    Returns:
        The indirect reference of the added action dictionary.

    """
    target_page = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    view = ArrayObject(
        [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    action = DictionaryObject()
    action.update(
        {
            NameObject(GoToActionArguments.D): view,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(action)

    name = title if isinstance(title, TextStringObject) else TextStringObject(str(title))
    self.add_named_destination_array(name, action_ref)
    return action_ref

2105 

def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page is passed to :meth:`remove_objects_from_page` with
    ``ObjectDeletionFlag.ALL_ANNOTATIONS``.
    """
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page, ObjectDeletionFlag.ALL_ANNOTATIONS
        )

2110 

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    if isinstance(subtypes, str):
        # A bare string would be tested with ``in`` further down, i.e. by
        # substring containment; wrap it so that whole names are compared.
        subtypes = (subtypes,)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

2126 

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations of the given subtypes from one page.

    Args:
        page: The page (or a reference to it) whose ``PG.ANNOTS`` array is
            filtered in place.
        subtypes: Subtype names to delete; ``None`` deletes every
            annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    position = 0
    while position < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[position]
        annotation = cast(DictionaryObject, entry.get_object())
        keep = (
            subtypes is not None
            and cast(str, annotation["/Subtype"]) not in subtypes
        )
        if keep:
            position += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        # The following entries shift down, so the position is not advanced.
        del page[PG.ANNOTS][position]  # type:ignore

2144 

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters; they used to be dropped on this path.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-related flags are handled on the annotation array only.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators that are dropped by ``clean`` below.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        # Delete the selected operations from ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None  # operand of the most recent Tf operator
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # With text filters, only text set in a listed font goes away.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        # Walk the XObjects of ``elt``; returns the image and form names found.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2317 

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types)

    """
    # Any boolean (presumably from legacy callers - confirm) selects all types.
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType selection into ObjectDeletionFlag members of
    # the same name.
    flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            flags = flags | ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, flags)

2341 

def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """

    def collect_fonts(
        node: Any,
        found: Optional[Dict[str, Any]] = None,
        resource_key: Optional[str] = None
    ) -> Dict[str, Any]:
        # Recursively walk ``node`` and map each normalized font name to the
        # resource keys under which a font of that name was met.
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                short_name = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    short_name,
                    {
                        "normalized_font_name": short_name,
                        "resource_ids": [],
                    },
                )
                if resource_key not in entry["resource_ids"]:
                    entry["resource_ids"].append(resource_key)
            for child_key in node:
                found = collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                found = collect_fonts(child, found)
        return found

    wanted = font_names if font_names else []

    for page in self.pages:
        matching_ids = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted:
            fonts_on_page = collect_fonts(page.get("/Resources"))
            for name in wanted:
                if name in fonts_on_page:
                    matching_ids.extend(fonts_on_page[name]["resource_ids"])

        # An empty filter dict means "remove all text".
        filters = {"font_ids": matching_ids} if wanted else {}
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=filters)

2398 

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``. Other non-string values are passed
            to ``RectangleObject``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. Its first three
            entries are used, plus a dash pattern when it has exactly four
            entries. When omitted, the array ``[2, 2, 2]`` is used.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_dict = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is None:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]
    else:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            border_arr.append(ArrayObject([NumberObject(n) for n in border[3]]))

    if isinstance(rect, str):
        # NOTE(review): a string is handed to NumberObject unchanged; confirm
        # that this yields the intended /Rect value.
        rect = NumberObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    uri_action = DictionaryObject()
    uri_action.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    link_annot = DictionaryObject()
    link_annot.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): uri_action,
        }
    )
    link_ref = self._add_object(link_annot)

    if PG.ANNOTS not in page_dict:
        page_dict[NameObject(PG.ANNOTS)] = ArrayObject([link_ref])
    else:
        page_dict[PG.ANNOTS].append(link_ref)

2465 

# Layout names that ``_set_page_layout`` accepts without logging a warning.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)

2475 

def _get_page_layout(self) -> Optional[LayoutType]:
    """
    Read the ``/PageLayout`` entry of the catalog.

    Returns:
        The stored layout, or ``None`` when the entry is missing.

    """
    try:
        stored = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, stored)

2481 

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not a ``NameObject`` is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is
    stored nevertheless. A ``NameObject`` is stored without any check.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # The separator has to be ", "; the former "'', ''.join(...)"
            # built a tuple and produced a garbled message.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2516 

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (public wrapper around ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    requested_layout = layout
    self._set_page_layout(requested_layout)

2544 

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading returns the result of ``_get_page_layout``; assigning passes
    the value to ``_set_page_layout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment counterpart of the getter above.
    requested_layout = layout
    self._set_page_layout(requested_layout)

2573 

# Mode names that the ``page_mode`` setter accepts without logging a warning.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)

2582 

def _get_page_mode(self) -> Optional[PagemodeType]:
    """
    Read the ``/PageMode`` entry of the catalog.

    Returns:
        The stored mode, or ``None`` when the entry is missing.

    """
    try:
        stored = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, stored)

2588 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading returns the result of ``_get_page_mode``. On assignment, a
    value that is not a ``NameObject`` is checked against ``_valid_modes``
    (an unknown value only triggers a warning) and converted to a
    ``NameObject`` before it is stored under ``/PageMode``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    resolved: NameObject
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        resolved = NameObject(mode)
    else:
        resolved = mode
    self._root_object.update({NameObject("/PageMode"): resolved})

2623 

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a
            ``PageObject``.

    """
    if isinstance(page_number, int):
        target_page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        target_page = page_number
    else:
        raise TypeError("page: invalid type")

    new_annot = cast(DictionaryObject, _pdf_objectify(annotation))
    new_annot[NameObject("/P")] = target_page.indirect_reference

    if target_page.annotations is None:
        target_page[NameObject("/Annots")] = ArrayObject()
    assert target_page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = new_annot.get("/Subtype") == "/Link" and "/Dest" in new_annot
    if is_internal_link:
        spec = cast(Dict[Any, Any], new_annot[NameObject("/Dest")])
        link_dest = Destination(
            NameObject("/LinkName"),
            spec["target_page_index"],
            Fit(
                fit_type=spec["fit"], fit_args=dict(spec)["fit_args"]
            ),  # "fit_args" is read through a plain dict copy; reason undocumented
        )
        new_annot[NameObject("/Dest")] = link_dest.dest_array

    target_page.annotations.append(self._add_object(new_annot))

    # A popup is also registered on the annotation it belongs to.
    if new_annot.get("/Subtype") == "/Popup" and NameObject("/Parent") in new_annot:
        parent_annot = cast(DictionaryObject, new_annot["/Parent"].get_object())
        parent_annot[NameObject("/Popup")] = new_annot.indirect_reference

    return new_annot

2677 

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` entry holding a ``NameObject`` is
    converted; otherwise the ``/D`` entry of its ``/A`` action, if any, is
    converted the same way.

    Args:
        page: The page, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2704 

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``) whose file is read in binary
            mode, a ``PdfReader`` whose stream is read from position 0 and
            then moved back to where it was, or any object offering
            ``seek`` and ``read``, which is rewound and read.

    Returns:
        The ``BytesIO`` copy and, for a ``PdfReader`` with a truthy
        ``_encryption`` attribute, that attribute; ``None`` otherwise.

    Raises:
        NotImplementedError: If ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption = fileobj._encryption if fileobj._encryption else None
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        snapshot = BytesIO(fileobj.stream.read())
        # reset the stream to its original location
        fileobj.stream.seek(saved_position)
        return snapshot, encryption

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2740 

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    A page selection (tuple, list or PageRange) given in the
    ``outline_item`` slot is accepted: it is then used as ``pages`` and no
    outline item is created. In that form, a boolean found in ``pages`` is
    used as ``import_outline``, and a non-boolean found in
    ``import_outline`` is used as ``excluded_fields``.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # Shifted call: the page selection sits in the outline_item slot,
        # so the following arguments are moved one position to the right.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        title = None

    self.merge(None, fileobj, title, pages, import_outline, excluded_fields)

2808 

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Besides the pages themselves, the named destinations, the outline,
    the annotations, the ``/AcroForm`` fields and the article threads that
    relate to the merged pages are carried over.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. With ``None`` the pages
            are added through ``add_page`` instead of ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        # Build a PdfReader on an in-memory copy of the source.
        stream, _encryption = self._create_stream(fileobj)
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()

    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, tuple):
        if len(pages) <= 3:
            pages = list(range(*pages))
        # a longer tuple is kept and iterated element by element below
    elif not isinstance(pages, list):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Clone the pages; srcpages maps the object number of each source page
    # to the page created in this writer.
    srcpages = {}
    for page in pages:
        pg = page if isinstance(page, PageObject) else reader.pages[page]
        assert pg.indirect_reference is not None
        # numbers in the exclude list identifies that the exclusion is
        # only applicable to 1st level of cloning
        first_level_exclusions = [*list(excluded_fields), 1, "/B", 1, "/Annots"]
        if position is None:
            new_page = self.add_page(pg, first_level_exclusions)  # type: ignore
        else:
            new_page = self.insert_page(pg, position, first_level_exclusions)  # type: ignore
            position += 1
        srcpages[pg.indirect_reference.idnum] = new_page
        new_page.original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    def _process_named_dests(dest: Any) -> None:
        """Copy one named destination if it targets a merged page."""
        dest_array = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            List[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get(
                    "/Dests", DictionaryObject()
                ),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            return
        target = dest["/Page"]
        if target is None or isinstance(target, NullObject):
            return
        if isinstance(target, int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                source_page = reader.pages[target]
            except IndexError:
                return
            assert source_page.indirect_reference is not None
            try:
                dest_array[NumberObject(0)] = NumberObject(
                    srcpages[source_page.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], dest_array)
            except KeyError:
                pass
            return
        if target.indirect_reference.idnum in srcpages:
            dest_array[NumberObject(0)] = srcpages[
                target.indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], dest_array)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Parent under which the imported outline is inserted: either a new
    # item pointing at the first merged page, or the outline root.
    outline_item_typ: TreeObject
    if outline_item is None:
        outline_item_typ = self.get_outline_root()
    else:
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            cloned_annots = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(cloned_annots) > 0:
                pag[NameObject("/Annots")] = cloned_annots
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # Clone the form dictionary without its fields; the list of
            # fields is rebuilt below from the objects actually copied.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2998 

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry and registered
    in ``self.threads``. The source articles are then followed through their
    ``/N`` links starting at ``/F``; for each article whose ``/P`` page has
    a counterpart in ``pages`` a new article is created, chained to the
    previous one through ``/N`` and ``/V``, and appended to the ``/B`` array
    of the cloned page. When the walk comes back to the first source
    article, the new chain is closed into a ring.

    Args:
        thread: Thread entry from the reader's array of threads.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The reader the thread comes from.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)

    source_head = cast("DictionaryObject", thread["/F"])
    source_article: Optional[DictionaryObject] = source_head
    cloned_article: Optional[DictionaryObject] = None
    while source_article is not None:
        cloned_page_ref = self._get_cloned_page(
            cast("PageObject", source_article["/P"]), pages, reader
        )
        if cloned_page_ref is not None:
            if cloned_article is None:
                # first retained article: it becomes the head of the thread
                cloned_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                cloned_head = cloned_article
                nthread[NameObject("/F")] = cloned_article.indirect_reference
            else:
                # link the new article after the previously created one
                following = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): cloned_article.indirect_reference}
                        )
                    ).get_object(),
                )
                cloned_article[NameObject("/N")] = following.indirect_reference
                cloned_article = following
            cloned_article[NameObject("/P")] = cloned_page_ref
            cloned_article[NameObject("/T")] = nthread.indirect_reference
            cloned_article[NameObject("/R")] = source_article["/R"]
            cloned_page = cast("PageObject", cloned_page_ref.get_object())
            if "/B" not in cloned_page:
                cloned_page[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", cloned_page["/B"]).append(
                cloned_article.indirect_reference
            )
        source_article = cast("DictionaryObject", source_article["/N"])
        if source_article == source_head:
            # back at the start: close the ring and stop
            cloned_article[NameObject("/N")] = cloned_head.indirect_reference  # type: ignore
            cloned_head[NameObject("/V")] = cloned_article.indirect_reference  # type: ignore
            source_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

3063 

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every article listed in the ``/B`` entry of the original page of each
    cloned page is inspected. Its thread (``/T``) is cloned through
    ``_add_articles_thread`` when the thread has not been translated for
    ``reader`` yet and its ``/I`` ``/Title`` matches the filter.

    Args:
        fltr: A compiled pattern or a string to compile into one, searched
            in the thread title. Any other value is replaced by the empty
            pattern.
        pages: Cloned pages; each value must carry ``original_page``.
        reader: The reader the pages come from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for article in source_page.get("/B", ()):
            thread = article.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # looked up on every pass: cloning a thread updates the table
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if fltr.search(title):
                self._add_articles_thread(thread, pages, reader)

3096 

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the cloned page matching a source page.

    Args:
        page: The source page, given as a ``/Page`` dictionary or as an
            indirect reference.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: Unused by this method.

    Returns:
        The indirect reference of the cloned page, or None when ``page`` is
        a NullObject, is of another kind, or has no entry in ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    source_ref = None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    try:
        # any failure (no usable reference, unknown page, ...) means "no clone"
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except Exception:
        return None

3113 

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations that remain valid in this writer.

    Annotations without any destination are cloned as they are. An
    annotation pointing to a named destination is kept only when that name
    is known to ``get_named_dest_root()``. An annotation pointing to an
    explicit destination is kept only when its target page has a clone in
    ``pages``; the destination is then rewritten to reference the clone.
    The destination is read from ``/Dest`` or, for ``/Link`` annotations
    with a ``/GoTo`` action and no ``/Dest``, from ``/A`` ``/D``.

    Args:
        annots: The annotations of the source page (list or reference to it).
        page: Unused by this method.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The reader the annotations come from.

    Returns:
        An ArrayObject with the references of the retained annotations;
        it is empty when ``annots`` is None or not a list.

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist

    for annotation_ref in annots:
        ano = cast("DictionaryObject", annotation_ref.get_object())
        uses_goto_action = (
            ano["/Subtype"] == "/Link"
            and "/A" in ano
            and cast("DictionaryObject", ano["/A"])["/S"] == "/GoTo"
            and "/Dest" not in ano
        )

        if uses_goto_action:
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if d is None or isinstance(d, NullObject):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
                continue
            d = cast("ArrayObject", d)
            p = self._get_cloned_page(d[0], pages, reader)
            if p is not None:
                anc = ano.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", anc["/A"])[
                    NameObject("/D")
                ] = ArrayObject([p, *d[1:]])
                outlist.append(self._add_object(anc))
            continue

        if "/Dest" not in ano:
            # nothing to translate: plain copy
            outlist.append(self._add_object(ano.clone(self)))
            continue

        d = ano["/Dest"]
        if isinstance(d, str):
            # it is a named dest
            if str(d) in self.get_named_dest_root():
                outlist.append(ano.clone(self).indirect_reference)
            continue
        d = cast("ArrayObject", d)
        p = self._get_cloned_page(d[0], pages, reader)
        if p is not None:
            anc = ano.clone(self, ignore_fields=("/Dest",))
            anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
            outlist.append(self._add_object(anc))
    return outlist

3170 

def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node of type ``/Outlines``, or without ``/Title``, is treated as a
    container: the result is built from its ``/First`` child. Otherwise the
    node and its ``/Next`` siblings are converted with
    ``reader._build_outline_item``; their ``/Page`` is replaced by the
    reference of the cloned page (or a NullObject when there is none) and
    their filtered descendants are stored in ``_filtered_children``. An item
    is kept when it targets a cloned page or has at least one kept child.

    Args:
        node: The outline node to start from (may be None or a null object).
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The reader the outline comes from.

    Returns:
        A list of destination objects.

    """
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        first_child = node.get("/First", None)
        if first_child is None:
            return []
        return [] + self._get_filtered_outline(
            first_child.get_object(), pages, reader
        )

    kept: List[Any] = []
    cloned_ref: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if cloned_ref is None:
            cloned_ref = NullObject()
        item[NameObject("/Page")] = cloned_ref
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_target = not isinstance(item["/Page"], NullObject)
        if has_target or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept

3222 

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create the outline item of this writer corresponding to ``dest``.

    The new item is registered with ``_add_object`` and receives the title
    of ``dest``. When ``dest`` targets a page, it also receives either a
    clone of the ``/A`` action of the source node or, failing that, the
    destination array. ``/F`` and ``/C`` are copied from the source node
    when there is one, defaulting to 0 and to three 0.0 components.

    Args:
        dest: The destination to convert.

    Returns:
        The new outline item.

    """
    cloned = TreeObject()
    self._add_object(cloned)
    cloned[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            cloned[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            cloned[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        cloned[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        default_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        cloned[NameObject("/C")] = ArrayObject(source_node.get("/C", default_color))
    return cloned

3241 

def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, and their descendants, under ``parent``.

    An entry of type ``/Outlines`` or without ``/Title`` creates no item:
    its children are attached directly to ``parent``. Every other entry is
    cloned with ``_clone_outline`` and inserted as a child of ``parent``.

    Args:
        outlines: Items to insert; each one carries ``_filtered_children``.
        parent: The outline node receiving the items.
        before: Passed to ``insert_child`` for the items of this level;
            descendants are always inserted with ``None``.

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = dest.get("/Type", "") == "/Outlines" or "/Title" not in dest
        if is_container:
            attach_to = parent
        else:
            attach_to = self._clone_outline(dest)
            cast(TreeObject, parent.get_object()).insert_child(attach_to, before, self)
        self._insert_filtered_outline(dest._filtered_children, attach_to, None)

3257 

def close(self) -> None:
    """Do nothing; the method exists only for API harmonization."""
    return None

3261 

def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Locate an outline item and return its position as a list of indexes.

    The search starts at ``root`` (the outline root when omitted), walks
    the ``/Next`` chain and descends into ``/First`` children. A node
    matches when its ``indirect_reference`` or its ``/Title`` equals
    ``outline_item``.

    Args:
        outline_item: The value compared with each node.
        root: The node to start from.

    Returns:
        The sibling index of the match, preceded by the index of each
        titled ancestor visited on the way, or None when nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        ):
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                # untitled nodes (such as the root) add no level to the path
                return ([index] if "/Title" in node else []) + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None

3290 

def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    Deprecated alias; the arguments are not used.

    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    # Only reports the deprecation; no search is performed here.
    deprecation_with_replacement("find_bookmark", "find_outline_item", "5.0.0")

3301 

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither None, a PdfReader nor an
            IndirectObject.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # a reader without a table is silently accepted
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # f-string so that the offending value really appears in the message
        raise Exception(f"invalid parameter {reader}")

3329 

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    The arguments are validated here and then handed over unchanged to
    ``_set_page_label``.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
            in the range.
            Subsequent pages are numbered sequentially from this value,
            which must be greater than or equal to 1.
            The parameter default, 0, and None are also accepted by the
            validation.

    Raises:
        ValueError: Neither style nor prefix is given, the page indexes do
            not describe a valid range of existing pages, or ``start`` is
            below 1 (other than 0).

    """
    # The checks are ordered: the first failing one decides the message.
    if style is None and prefix is None:
        problem: Optional[str] = "At least one of style and prefix must be given"
    elif page_index_from < 0:
        problem = "page_index_from must be greater or equal than 0"
    elif page_index_to < page_index_from:
        problem = "page_index_to must be greater or equal than page_index_from"
    elif page_index_to >= len(self.pages):
        problem = "page_index_to exceeds number of pages"
    elif start is not None and start != 0 and start < 1:
        problem = "If given, start must be greater or equal than one"
    else:
        problem = None

    if problem is not None:
        raise ValueError(problem)

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3380 

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is performed here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
            in the range.
            Subsequent pages are numbered sequentially from this value.
            With 0 (the default) or None, no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None is allowed by the signature and by set_page_label(); it must not
    # be handed to NumberObject, so it is treated like the default 0.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # No label tree yet: create one whose first range uses the default label.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Pages following the range fall back to the default label unless a
    # label already starts right after the range.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3445 

def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> Dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    The document is written with ``self.write`` into a BytesIO object,
    which is returned under the ``application/pdf`` mime-type.

    Args:
        include: When given, only the mime-types it contains are returned.
        exclude: When given, the mime-types it contains are removed.

    Returns:
        A dictionary that maps a mime-type to its representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    buffer = BytesIO()
    self.write(buffer)
    bundle: Dict[str, Any] = {"application/pdf": buffer}

    if include is not None:
        # Filter representations based on include list
        for mime in list(bundle):
            if mime not in include:
                del bundle[mime]

    if exclude is not None:
        # Remove representations based on exclude list
        for mime in list(bundle):
            if mime in exclude:
                del bundle[mime]

    return bundle

3476 

3477 

def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PdfObject, recursively.

    * a PdfObject is returned unchanged;
    * a ``dict`` becomes a DictionaryObject (keys as NameObject);
    * a ``str`` becomes a NameObject when it starts with ``/``, otherwise a
      TextStringObject;
    * a ``float`` or ``int`` becomes a FloatObject;
    * a ``list`` becomes an ArrayObject.

    Raises:
        NotImplementedError: The value is of any other type.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name in obj:
            converted[NameObject(name)] = _pdf_objectify(obj[name])
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject([_pdf_objectify(element) for element in obj])
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3497 

3498 

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item dictionary.

    Args:
        action_ref: Stored as ``/A`` when not None.
        title: Stored as ``/Title``.
        color: Stored as ``/C`` when truthy; a string is first converted
            with ``hex_to_rgb``.
        italic: Adds ``OutlineFontFlag.italic`` to ``/F``.
        bold: Adds ``OutlineFontFlag.bold`` to ``/F``.

    Returns:
        The new TreeObject; ``/F`` is only present when ``italic`` or
        ``bold`` is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        enabled_flags = []
        if italic:
            enabled_flags.append(OutlineFontFlag.italic)
        if bold:
            enabled_flags.append(OutlineFontFlag.bold)
        # sum() starts from 0 and adds the flags in the order italic, bold
        item.update({NameObject("/F"): NumberObject(sum(enabled_flags))})
    return item

3528 

3529 

def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream displaying the text of a form field.

    The text is split into lines on ``\\r`` and ``\\n``. Each line is
    positioned with ``Td`` (the first one at ``2 y_offset``, the following
    ones ``font_height * 1.4`` lower) and shown with ``Tj``. A line is
    written as a hexadecimal string when at least one of its characters is
    encoded on two bytes or more, otherwise as a literal string in which
    ``\\``, ``(`` and ``)`` are escaped.

    Args:
        txt: The text to display.
        sel: The selected lines; a box is drawn behind each of them.
        da: The default appearance string, emitted after ``BT`` and after
            each selection box.
        font_full_rev: Mapping from a character to its encoded bytes;
            characters missing from it are encoded as UTF-16-BE.
        rct: The field rectangle; only ``width`` and ``height`` are read.
        font_height: The font size, used for the line spacing.
        y_offset: The vertical position of the first line.

    Returns:
        The content stream.

    """
    chunks: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            chunks.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            chunks.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            chunks.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        if any(len(c) >= 2 for c in enc_line):
            chunks.append(b"<" + b"".join(enc_line).hex().encode() + b"> Tj\n")
        else:
            # Backslash and parentheses delimit/escape a literal string:
            # written raw they would end the string early or corrupt it.
            # The backslash is escaped first so the added ones are kept.
            literal = (
                b"".join(enc_line)
                .replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            chunks.append(b"(" + literal + b") Tj\n")
    chunks.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(chunks)