Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import sys

36import uuid

37from collections.abc import Iterable, Mapping

38from io import BytesIO, FileIO, IOBase

39from itertools import compress

40from pathlib import Path

41from re import Pattern

42from types import TracebackType

43from typing import (

44 IO,

45 Any,

46 Callable,

47 Optional,

48 Union,

49 cast,

50)

52if sys.version_info >= (3, 11):

53 from typing import Self

54else:

55 from typing_extensions import Self

57from ._doc_common import DocumentInformation, PdfDocCommon

58from ._encryption import EncryptAlgorithm, Encryption

59from ._page import PageObject, Transformation

60from ._page_labels import nums_clear_range, nums_insert, nums_next

61from ._reader import PdfReader

62from ._utils import (

63 StrByteType,

64 StreamType,

65 _get_max_pdf_version_header,

66 deprecate_with_replacement,

67 deprecation_no_replacement,

68 logger_warning,

69)

70from .constants import AnnotationDictionaryAttributes as AA

71from .constants import CatalogAttributes as CA

72from .constants import (

73 CatalogDictionary,

74 GoToActionArguments,

75 ImageType,

76 InteractiveFormDictEntries,

77 OutlineFontFlag,

78 PageLabelStyle,

79 PagesAttributes,

80 TypFitArguments,

81 UserAccessPermissions,

82)

83from .constants import Core as CO

84from .constants import FieldDictionaryAttributes as FA

85from .constants import PageAttributes as PG

86from .constants import TrailerKeys as TK

87from .errors import LimitReachedError, PdfReadError, PyPdfError

88from .generic import (

89 PAGE_FIT,

90 ArrayObject,

91 BooleanObject,

92 ByteStringObject,

93 ContentStream,

94 Destination,

95 DictionaryObject,

96 EmbeddedFile,

97 Fit,

98 FloatObject,

99 IndirectObject,

100 NameObject,

101 NullObject,

102 NumberObject,

103 PdfObject,

104 RectangleObject,

105 ReferenceLink,

106 StreamObject,

107 TextStringObject,

108 TreeObject,

109 ViewerPreferences,

110 create_string_object,

111 extract_links,

112 hex_to_rgb,

113 is_null_or_none,

114)

115from .generic._appearance_stream import TextStreamAppearance

116from .pagerange import PageRange, PageRangeSpec

117from .types import (

118 AnnotationSubtype,

119 BorderArrayType,

120 LayoutType,

121 OutlineItemType,

122 OutlineType,

123 PagemodeType,

124)

125from .xmp import XmpInformation

126

# Permission flag value computed once, at import time, by UserAccessPermissions.all().
# NOTE(review): presumably "every user access permission granted" - confirm in constants.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

128

129

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of PDF content.

    Each concrete member occupies its own bit, so members can be combined
    with ``|``. ``IMAGES`` is the union of the three image-related flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

141

142

143def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:

144 hash = hashlib.md5(usedforsecurity=False)

145 for block in iter(lambda: stream.read(blocksize), b""):

146 hash.update(block)

147 return hash.hexdigest()

148

149

150class PdfWriter(PdfDocCommon):

151 """

152 Write a PDF file out, given pages produced by another class or through

153 cloning a PDF file during initialization.

154

155 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

156

157 Args:

158 clone_from: identical to fileobj (for compatibility)

159

160 incremental: If true, loads the document and set the PdfWriter in incremental mode.

161

162 When writing incrementally, the original document is written first and new/modified

163 content is appended. To be used for signed document/forms to keep signature valid.

164

165 full: If true, loads all the objects (always full if incremental = True).

166 This parameter may allow loading large PDFs.

167

168 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.

169 If false, pypdf will try to be forgiving and do something reasonable, but it will log

170 a warning message. It is a best-effort approach.

171

172 """

173

    def __init__(
        self,
        fileobj: Union[None, PdfReader, StrByteType, Path] = "",
        clone_from: Union[None, PdfReader, StrByteType, Path] = None,
        incremental: bool = False,
        full: bool = False,
        strict: bool = False,
        *,
        incremental_clone_object_count_limit: Optional[int] = 500_000,
        incremental_clone_object_id_limit: Optional[int] = 1_000_000,
    ) -> None:
        """
        Initialise the writer, optionally cloning an existing document.

        ``fileobj``, ``clone_from``, ``incremental``, ``full`` and ``strict``
        are described in the class docstring.

        Args:
            incremental_clone_object_count_limit: Stored as a security
                parameter; any non-int value (e.g. ``None``) is replaced by
                ``sys.maxsize``. Where the limit is enforced is not visible
                in this method.
            incremental_clone_object_id_limit: Same handling as the object
                count limit.

        Raises:
            PyPdfError: If the incremental loading path is taken and
                ``fileobj`` is not a path, ``BytesIO`` or ``PdfReader``.

        """
        self.strict = strict
        """
        If true, pypdf will raise an exception if a PDF does not follow the specification.
        If false, pypdf will try to be forgiving and do something reasonable, but it will log
        a warning message. It is a best-effort approach.
        """

        # ``full`` alone also selects the incremental loading branch below; the
        # flag is switched back off at the end of this method in that case.
        self.incremental = incremental or full
        """
        Returns if the PdfWriter object has been started in incremental mode.
        """

        self._objects: list[Optional[PdfObject]] = []
        """
        The indirect objects in the PDF.
        For the incremental case, it will be filled with None
        in clone_reader_document_root.
        """

        self._original_hash: list[int] = []
        """
        List of hashes after import; used to identify changes.
        """

        self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
        """
        Maps hash values of indirect objects to the list of IndirectObjects.
        This is used for compression.
        """

        self._id_translated: dict[int, dict[int, int]] = {}
        """List of already translated IDs.
        dict[id(pdf)][(idnum, generation)]
        """

        # Only declared here; it is assigned in the non-incremental branch below.
        self._info_obj: Optional[PdfObject]
        """The PDF files's document information dictionary,
        defined by Info in the PDF file's trailer dictionary."""

        self._ID: Union[ArrayObject, None] = None
        """The PDF file identifier,
        defined by the ID in the PDF file's trailer dictionary."""

        self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
        "Tracks links in pages added to the writer for resolving later."
        self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
        "Tracks pages added to the writer and what page they turned into."

        # Security parameters.
        # A non-int limit (such as None) means "no limit" and becomes sys.maxsize.
        self._incremental_clone_object_count_limit = (
            incremental_clone_object_count_limit
            if isinstance(incremental_clone_object_count_limit, int)
            else sys.maxsize
        )
        self._incremental_clone_object_id_limit = (
            incremental_clone_object_id_limit if isinstance(incremental_clone_object_id_limit, int) else sys.maxsize
        )

        if self.incremental:
            # Normalise the input step by step: path -> BytesIO -> PdfReader.
            if isinstance(fileobj, (str, Path)):
                with open(fileobj, "rb") as f:
                    fileobj = BytesIO(f.read(-1))
            if isinstance(fileobj, BytesIO):
                fileobj = PdfReader(fileobj)
            if not isinstance(fileobj, PdfReader):
                raise PyPdfError("Invalid type for incremental mode")
            self._reader = fileobj  # prev content is in _reader.stream
            self._header = fileobj.pdf_header.encode()
            self._readonly = True  # TODO: to be analysed
        else:
            # Fresh document: default header plus an /Info dictionary naming the producer.
            self._header = b"%PDF-1.3"
            self._info_obj = self._add_object(
                DictionaryObject(
                    {NameObject("/Producer"): create_string_object("pypdf")}
                )
            )

        def _get_clone_from(
            fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
            clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
            # Decide which source, if any, the writer is cloned from.
            # An empty-string fileobj, or an explicit clone_from next to a
            # path/stream fileobj, means clone_from is returned unchanged.
            if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
                fileobj == "" or clone_from is not None
            ):
                return clone_from
            cloning = True
            # A path that does not exist or points to an empty file is not cloned.
            if isinstance(fileobj, (str, Path)) and (
                not Path(str(fileobj)).exists()
                or Path(str(fileobj)).stat().st_size == 0
            ):
                cloning = False
            # An empty stream is not cloned; the stream position is restored afterwards.
            if isinstance(fileobj, (IOBase, BytesIO)):
                t = fileobj.tell()
                if fileobj.seek(0, 2) == 0:
                    cloning = False
                fileobj.seek(t, 0)
            if cloning:
                clone_from = fileobj
            return clone_from

        clone_from = _get_clone_from(fileobj, clone_from)
        # To prevent overwriting
        self.temp_fileobj = fileobj
        self.fileobj = ""
        self._with_as_usage = False
        self._cloned = False
        # The root of our page tree node
        pages = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
                NameObject(PagesAttributes.COUNT): NumberObject(0),
                NameObject(PagesAttributes.KIDS): ArrayObject(),
            }
        )
        self.flattened_pages = []
        self._encryption: Optional[Encryption] = None
        self._encrypt_entry: Optional[DictionaryObject] = None

        if clone_from is not None:
            if not isinstance(clone_from, PdfReader):
                clone_from = PdfReader(clone_from)
            self.clone_document_from_reader(clone_from)
            self._cloned = True
        else:
            # No source document: register the empty page tree and a catalog pointing to it.
            self._pages = self._add_object(pages)
            self._root_object = DictionaryObject(
                {
                    NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                    NameObject(CO.PAGES): self._pages,
                }
            )
            self._add_object(self._root_object)
        # ``full`` without ``incremental`` only borrowed the incremental loading path.
        if full and not incremental:
            self.incremental = False
        # Store both /ID entries as byte strings, using their original bytes.
        if isinstance(self._ID, list):
            if isinstance(self._ID[0], TextStringObject):
                self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
            if isinstance(self._ID[1], TextStringObject):
                self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

324

325 # for commonality

326 @property

327 def is_encrypted(self) -> bool:

328 """

329 Read-only boolean property showing whether this PDF file is encrypted.

330

331 Note that this property, if true, will remain true even after the

332 :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.

333 """

334 return False

335

336 @property

337 def root_object(self) -> DictionaryObject:

338 """

339 Provide direct access to PDF Structure.

340

341 Note:

342 Recommended only for read access.

343

344 """

345 return self._root_object

346

347 @property

348 def _info(self) -> Optional[DictionaryObject]:

349 """

350 Provide access to "/Info". Standardized with PdfReader.

351

352 Returns:

353 /Info Dictionary; None if the entry does not exist

354

355 """

356 return (

357 None

358 if self._info_obj is None

359 else cast(DictionaryObject, self._info_obj.get_object())

360 )

361

362 @_info.setter

363 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:

364 if value is None:

365 try:

366 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore

367 except (KeyError, AttributeError):

368 pass

369 self._info_obj = None

370 else:

371 if self._info_obj is None:

372 self._info_obj = self._add_object(DictionaryObject())

373 obj = cast(DictionaryObject, self._info_obj.get_object())

374 obj.clear()

375 obj.update(cast(DictionaryObject, value.get_object()))

376

377 @property

378 def xmp_metadata(self) -> Optional[XmpInformation]:

379 """XMP (Extensible Metadata Platform) data."""

380 return cast(XmpInformation, self.root_object.xmp_metadata)

381

382 @xmp_metadata.setter

383 def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:

384 """XMP (Extensible Metadata Platform) data."""

385 if value is None:

386 if "/Metadata" in self.root_object:

387 del self.root_object["/Metadata"]

388 return

389

390 metadata = self.root_object.get("/Metadata", None)

391 if not isinstance(metadata, IndirectObject):

392 if metadata is not None:

393 del self.root_object["/Metadata"]

394 metadata_stream = StreamObject()

395 stream_reference = self._add_object(metadata_stream)

396 self.root_object[NameObject("/Metadata")] = stream_reference

397 else:

398 metadata_stream = cast(StreamObject, metadata.get_object())

399

400 if isinstance(value, XmpInformation):

401 bytes_data = value.stream.get_data()

402 else:

403 bytes_data = value

404 metadata_stream.set_data(bytes_data)

405

406 @property

407 def with_as_usage(self) -> bool:

408 deprecation_no_replacement("with_as_usage", "5.0")

409 return self._with_as_usage

410

411 @with_as_usage.setter

412 def with_as_usage(self, value: bool) -> None:

413 deprecation_no_replacement("with_as_usage", "5.0")

414 self._with_as_usage = value

415

416 def __enter__(self) -> Self:

417 """Store how writer is initialized by 'with'."""

418 c: bool = self._cloned

419 t = self.temp_fileobj

420 self.__init__() # type: ignore

421 self._cloned = c

422 self._with_as_usage = True

423 self.fileobj = t # type: ignore

424 return self

425

426 def __exit__(

427 self,

428 exc_type: Optional[type[BaseException]],

429 exc: Optional[BaseException],

430 traceback: Optional[TracebackType],

431 ) -> None:

432 """Write data to the fileobj."""

433 if self.fileobj and not self._cloned:

434 self.write(self.fileobj)

435

436 @property

437 def pdf_header(self) -> str:

438 """

439 Read/Write property of the PDF header that is written.

440

441 This should be something like ``'%PDF-1.5'``. It is recommended to set

442 the lowest version that supports all features which are used within the

443 PDF file.

444

445 Note: `pdf_header` returns a string but accepts bytes or str for writing

446 """

447 return self._header.decode()

448

449 @pdf_header.setter

450 def pdf_header(self, new_header: Union[str, bytes]) -> None:

451 if isinstance(new_header, str):

452 new_header = new_header.encode()

453 self._header = new_header

454

455 def _add_object(self, obj: PdfObject) -> IndirectObject:

456 if (

457 getattr(obj, "indirect_reference", None) is not None

458 and obj.indirect_reference.pdf == self # type: ignore

459 ):

460 return obj.indirect_reference # type: ignore

461 # check for /Contents in Pages (/Contents in annotations are strings)

462 if isinstance(obj, DictionaryObject) and isinstance(

463 obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)

464 ):

465 obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

466 self._objects.append(obj)

467 obj.indirect_reference = IndirectObject(len(self._objects), 0, self)

468 return obj.indirect_reference

469

470 def get_object(

471 self,

472 indirect_reference: Union[int, IndirectObject],

473 ) -> PdfObject:

474 if isinstance(indirect_reference, int):

475 obj = self._objects[indirect_reference - 1]

476 elif indirect_reference.pdf != self:

477 raise ValueError("PDF must be self")

478 else:

479 obj = self._objects[indirect_reference.idnum - 1]

480 if obj is None:

481 raise PdfReadError(f"Object {indirect_reference!r} not found!")

482 return obj

483

484 def _replace_object(

485 self,

486 indirect_reference: Union[int, IndirectObject],

487 obj: PdfObject,

488 ) -> PdfObject:

489 if isinstance(indirect_reference, IndirectObject):

490 if indirect_reference.pdf != self:

491 raise ValueError("PDF must be self")

492 indirect_reference = indirect_reference.idnum

493 gen = self._objects[indirect_reference - 1].indirect_reference.generation # type: ignore

494 if (

495 getattr(obj, "indirect_reference", None) is not None

496 and obj.indirect_reference.pdf != self # type: ignore

497 ):

498 obj = obj.clone(self)

499 self._objects[indirect_reference - 1] = obj

500 obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

501

502 assert isinstance(obj, PdfObject), "mypy"

503 return obj

504

    def _add_page(
        self,
        page: PageObject,
        index: int,
        excluded_keys: Iterable[str] = (),
    ) -> PageObject:
        """
        Clone ``page`` into this writer and place it at position ``index``.

        Args:
            page: The page to add; must be a PageObject whose type entry is
                the page type.
            index: Position in the flattened page list.
            excluded_keys: Keys not copied when cloning the page. The parent
                entry and "/StructParents" are always excluded as well.

        Returns:
            The cloned page that now belongs to this writer.

        Raises:
            ValueError: If ``page`` is not a valid page object.
            PyPdfError: If more than 1000 ancestor nodes are walked while
                updating the page counts.

        """
        if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
            raise ValueError("Invalid page object")
        assert self.flattened_pages is not None, "for mypy"
        page_org = page
        excluded_keys = list(excluded_keys)
        excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
        # Acrobat does not accept two indirect references pointing on the same
        # page; therefore in order to add multiple copies of the same
        # page, we need to create a new dictionary for the page, however the
        # objects below (including content) are not duplicated:
        try:  # delete an already existing page
            del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore
                page_org.indirect_reference.idnum  # type: ignore
            ]
        except Exception:
            # Best effort: the page may have no reference or no translation entry yet.
            pass

        page = cast(
            "PageObject", page_org.clone(self, False, excluded_keys).get_object()
        )
        if page_org.pdf is not None:
            # Update the writer's header from both headers via _get_max_pdf_version_header.
            other = page_org.pdf.pdf_header
            self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

        node, idx = self._get_page_in_node(index)
        page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

        # A negative idx from _get_page_in_node means "append to this node".
        if idx >= 0:
            cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
            self.flattened_pages.insert(index, page)
        else:
            cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
            self.flattened_pages.append(page)
        # Increment the page count of the node and of every ancestor up to the root;
        # the counter guards against a cyclic parent chain.
        recurse = 0
        while not is_null_or_none(node):
            node = cast(DictionaryObject, node.get_object())
            node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
            node = node.get(PagesAttributes.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
            recurse += 1
            if recurse > 1000:
                raise PyPdfError("Too many recursive calls!")

        if page_org.pdf is not None:
            # the page may contain links to other pages, and those other
            # pages may or may not already be added. we store the
            # information we need, so that we can resolve the references
            # later.
            self._unresolved_links.extend(extract_links(page, page_org))
            self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference

        return page

562

563 def set_need_appearances_writer(self, state: bool = True) -> None:

564 """

565 Sets the "NeedAppearances" flag in the PDF writer.

566

567 The "NeedAppearances" flag indicates whether the appearance dictionary

568 for form fields should be automatically generated by the PDF viewer or

569 if the embedded appearance should be used.

570

571 Args:

572 state: The actual value of the NeedAppearances flag.

573

574 Returns:

575 None

576

577 """

578 # See §12.7.2 and §7.7.2 for more information:

579 # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf

580 try:

581 # get the AcroForm tree

582 if CatalogDictionary.ACRO_FORM not in self._root_object:

583 self._root_object[

584 NameObject(CatalogDictionary.ACRO_FORM)

585 ] = self._add_object(DictionaryObject())

586

587 need_appearances = NameObject(InteractiveFormDictEntries.NeedAppearances)

588 cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])[

589 need_appearances

590 ] = BooleanObject(state)

591 except Exception as exc: # pragma: no cover

592 logger_warning(

593 f"set_need_appearances_writer({state}) catch : {exc}", __name__

594 )

595

596 def create_viewer_preferences(self) -> ViewerPreferences:

597 o = ViewerPreferences()

598 self._root_object[

599 NameObject(CatalogDictionary.VIEWER_PREFERENCES)

600 ] = self._add_object(o)

601 return o

602

603 def add_page(

604 self,

605 page: PageObject,

606 excluded_keys: Iterable[str] = (),

607 ) -> PageObject:

608 """

609 Add a page to this PDF file.

610

611 Recommended for advanced usage including the adequate excluded_keys.

612

613 The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`

614 instance.

615

616 Args:

617 page: The page to add to the document. Should be

618 an instance of :class:`PageObject<pypdf._page.PageObject>`

619 excluded_keys:

620

621 Returns:

622 The added PageObject.

623

624 """

625 assert self.flattened_pages is not None, "mypy"

626 return self._add_page(page, len(self.flattened_pages), excluded_keys)

627

628 def insert_page(

629 self,

630 page: PageObject,

631 index: int = 0,

632 excluded_keys: Iterable[str] = (),

633 ) -> PageObject:

634 """

635 Insert a page in this PDF file. The page is usually acquired from a

636 :class:`PdfReader<pypdf.PdfReader>` instance.

637

638 Args:

639 page: The page to add to the document.

640 index: Position at which the page will be inserted.

641 excluded_keys:

642

643 Returns:

644 The added PageObject.

645

646 """

647 assert self.flattened_pages is not None, "mypy"

648 if index < 0:

649 index += len(self.flattened_pages)

650 if index < 0:

651 raise ValueError("Invalid index value")

652 if index >= len(self.flattened_pages):

653 return self.add_page(page, excluded_keys)

654 return self._add_page(page, index, excluded_keys)

655

656 def _get_page_number_by_indirect(

657 self, indirect_reference: Union[None, int, NullObject, IndirectObject]

658 ) -> Optional[int]:

659 """

660 Generate _page_id2num.

661

662 Args:

663 indirect_reference:

664

665 Returns:

666 The page number or None

667

668 """

669 # To provide same function as in PdfReader

670 if is_null_or_none(indirect_reference):

671 return None

672 assert indirect_reference is not None, "mypy"

673 if isinstance(indirect_reference, int):

674 indirect_reference = IndirectObject(indirect_reference, 0, self)

675 obj = indirect_reference.get_object()

676 if isinstance(obj, PageObject):

677 return obj.page_number

678 return None

679

680 def add_blank_page(

681 self, width: Optional[float] = None, height: Optional[float] = None

682 ) -> PageObject:

683 """

684 Append a blank page to this PDF file and return it.

685

686 If no page size is specified, use the size of the last page.

687

688 Args:

689 width: The width of the new page expressed in default user

690 space units.

691 height: The height of the new page expressed in default

692 user space units.

693

694 Returns:

695 The newly appended page.

696

697 Raises:

698 PageSizeNotDefinedError: if width and height are not defined

699 and previous page does not exist.

700

701 """

702 page = PageObject.create_blank_page(self, width, height)

703 return self.add_page(page)

704

705 def insert_blank_page(

706 self,

707 width: Optional[Union[float, decimal.Decimal]] = None,

708 height: Optional[Union[float, decimal.Decimal]] = None,

709 index: int = 0,

710 ) -> PageObject:

711 """

712 Insert a blank page to this PDF file and return it.

713

714 If no page size is specified for a dimension, use the size of the last page.

715

716 Args:

717 width: The width of the new page in default user space units.

718 height: The height of the new page in default user space units.

719 index: Position to add the page.

720

721 Returns:

722 The newly inserted page.

723

724 Raises:

725 PageSizeNotDefinedError: if width and height are not defined

726 and previous page does not exist.

727 IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]

728 """

729 num_pages = self.get_num_pages()

730 if abs(index) <= num_pages:

731 # Use the chosen index, but do not exceed the available pages

732 fixed_index = min(index, num_pages - 1)

733 mediabox = self.pages[fixed_index].mediabox

734 if width is None or width <= 0:

735 width = mediabox.width

736 if height is None or height <= 0:

737 height = mediabox.height

738 else:

739 raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

740

741 page = PageObject.create_blank_page(self, width, height)

742 self.insert_page(page, index)

743 return page

744

745 @property

746 def open_destination(

747 self,

748 ) -> Union[None, Destination, TextStringObject, ByteStringObject]:

749 return super().open_destination

750

751 @open_destination.setter

752 def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:

753 if dest is None:

754 try:

755 del self._root_object["/OpenAction"]

756 except KeyError:

757 pass

758 elif isinstance(dest, str):

759 self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)

760 elif isinstance(dest, Destination):

761 self._root_object[NameObject("/OpenAction")] = dest.dest_array

762 elif isinstance(dest, PageObject):

763 self._root_object[NameObject("/OpenAction")] = Destination(

764 "Opening",

765 dest.indirect_reference

766 if dest.indirect_reference is not None

767 else NullObject(),

768 PAGE_FIT,

769 ).dest_array

770

771 def add_js(self, javascript: str) -> None:

772 """

773 Add JavaScript which will launch upon opening this PDF.

774

775 Args:

776 javascript: Your JavaScript.

777

778 Example:

779 This will launch the print window when the PDF is opened.

780

781 >>> from pypdf import PdfWriter

782 >>> output = PdfWriter()

783 >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

784

785 """

786 # Names / JavaScript preferred to be able to add multiple scripts

787 if "/Names" not in self._root_object:

788 self._root_object[NameObject(CA.NAMES)] = DictionaryObject()

789 names = cast(DictionaryObject, self._root_object[CA.NAMES])

790 if "/JavaScript" not in names:

791 names[NameObject("/JavaScript")] = DictionaryObject(

792 {NameObject("/Names"): ArrayObject()}

793 )

794 js_list = cast(

795 ArrayObject, cast(DictionaryObject, names["/JavaScript"])["/Names"]

796 )

797 # We need a name for parameterized JavaScript in the PDF file,

798 # but it can be anything.

799 js_list.append(create_string_object(str(uuid.uuid4())))

800

801 js = DictionaryObject(

802 {

803 NameObject(PagesAttributes.TYPE): NameObject("/Action"),

804 NameObject("/S"): NameObject("/JavaScript"),

805 NameObject("/JS"): TextStringObject(f"{javascript}"),

806 }

807 )

808 js_list.append(self._add_object(js))

809

810 def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":

811 """

812 Embed a file inside the PDF.

813

814 Reference:

815 https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf

816 Section 7.11.3

817

818 Args:

819 filename: The filename to display.

820 data: The data in the file.

821

822 Returns:

823 EmbeddedFile instance for the newly created embedded file.

824

825 """

826 return EmbeddedFile._create_new(self, filename, data)

827

828 def append_pages_from_reader(

829 self,

830 reader: PdfReader,

831 after_page_append: Optional[Callable[[PageObject], None]] = None,

832 ) -> None:

833 """

834 Copy pages from reader to writer. Includes an optional callback

835 parameter which is invoked after pages are appended to the writer.

836

837 ``append`` should be preferred.

838

839 Args:

840 reader: a PdfReader object from which to copy page

841 annotations to this writer object. The writer's annots

842 will then be updated.

843 after_page_append:

844 Callback function that is invoked after each page is appended to

845 the writer. Signature includes a reference to the appended page

846 (delegates to append_pages_from_reader). The single parameter of

847 the callback is a reference to the page just appended to the

848 document.

849

850 """

851 reader_num_pages = len(reader.pages)

852 # Copy pages from reader to writer

853 for reader_page_number in range(reader_num_pages):

854 reader_page = reader.pages[reader_page_number]

855 writer_page = self.add_page(reader_page)

856 # Trigger callback, pass writer page as parameter

857 if callable(after_page_append):

858 after_page_append(writer_page)

859

860 def _merge_content_stream_to_page(

861 self,

862 page: PageObject,

863 new_content_data: bytes,

864 ) -> None:

865 """

866 Combines existing content stream(s) with new content (as bytes).

867

868 Args:

869 page: The page to which the new content data will be added.

870 new_content_data: A binary-encoded new content stream, for

871 instance the commands to draw an XObject.

872 """

873 # First resolve the existing page content. This always is an IndirectObject:

874 # PDF Explained by John Whitington

875 # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html

876 if NameObject("/Contents") in page:

877 existing_content_ref = page[NameObject("/Contents")]

878 existing_content = existing_content_ref.get_object()

879

880 if isinstance(existing_content, ArrayObject):

881 # Create a new StreamObject for the new_content_data

882 new_stream_obj = StreamObject()

883 new_stream_obj.set_data(new_content_data)

884 existing_content.append(self._add_object(new_stream_obj))

885 page[NameObject("/Contents")] = self._add_object(existing_content)

886 if isinstance(existing_content, StreamObject):

887 # Merge new content to existing StreamObject

888 merged_data = existing_content.get_data() + b"\n" + new_content_data

889 new_stream = StreamObject()

890 new_stream.set_data(merged_data)

891 page[NameObject("/Contents")] = self._add_object(new_stream)

892 else:

893 # If no existing content, then we have an empty page.

894 # Create a new StreamObject in a new /Contents entry.

895 new_stream = StreamObject()

896 new_stream.set_data(new_content_data)

897 page[NameObject("/Contents")] = self._add_object(new_stream)

898

    def _add_apstream_object(
        self,
        page: PageObject,
        appearance_stream_obj: StreamObject,
        object_name: str,
        x_offset: float,
        y_offset: float,
    ) -> None:
        """
        Adds an appearance stream to the page content in the form of
        an XObject.

        The stream's font resources are merged into the page's font
        resources, the stream is registered under the name
        ``/Fm_<object_name>`` (sanitized) in the page's XObject resources, and
        drawing commands that paint it at the given offset are added to the
        page content.

        Args:
            page: The page to which to add the appearance stream.
            appearance_stream_obj: The appearance stream.
            object_name: The name of the appearance stream.
            x_offset: The horizontal offset for the appearance stream.
            y_offset: The vertical offset for the appearance stream.
        """
        # Prepare XObject resource dictionary on the page. This currently
        # only deals with font resources, but can easily be adapted to also
        # include other resources.
        pg_res = cast(DictionaryObject, page[PG.RESOURCES])
        if "/Resources" in appearance_stream_obj:
            ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"])
            ap_stream_font_dict = cast(DictionaryObject, ap_stream_res.get("/Font", DictionaryObject()))
            # Give the page a font dictionary if it has none yet.
            if "/Font" not in pg_res:
                font_dict_ref = self._add_object(DictionaryObject())
                pg_res[NameObject("/Font")] = font_dict_ref
            pg_font_res = cast(DictionaryObject, pg_res["/Font"].get_object())
            # Merge fonts from the appearance stream into the page's font resources
            # (fonts whose name the page already has are left untouched)
            for font_name, font_res in ap_stream_font_dict.items():
                if font_name not in pg_font_res:
                    font_res_ref = self._add_object(font_res)
                    pg_font_res[font_name] = font_res_ref
        # Always add the resolved stream object to the writer to get a new IndirectObject.
        # This ensures we have a valid IndirectObject managed by *this* writer.
        xobject_ref = self._add_object(appearance_stream_obj)
        xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()
        if "/XObject" not in pg_res:
            pg_res[NameObject("/XObject")] = DictionaryObject()
        pg_xo_res = cast(DictionaryObject, pg_res["/XObject"])
        # An existing XObject with the same name is kept; only a warning is logged.
        if xobject_name not in pg_xo_res:
            pg_xo_res[xobject_name] = xobject_ref
        else:
            logger_warning(
                f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
                __name__
            )
        # Content commands: save graphics state (q), apply the translation,
        # paint the XObject (Do), restore graphics state (Q).
        xobject_cm = Transformation().translate(x_offset, y_offset)
        xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
        self._merge_content_stream_to_page(page, xobject_drawing_commands)

951

# ``FA.FfBits`` value 0; used as the default of the ``flags`` parameter of
# ``update_page_form_field_values`` (a falsy value, so no /Ff entry is written).
FFBITS_NUL = FA.FfBits(0)

953

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document root has no /AcroForm dictionary, or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    # Only an explicit True/False touches the flag; None leaves it as it is.
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        # Process each page on its own; auto_regenerate is passed as None
        # because the flag has already been handled above.
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    # NOTE(review): this is initialised once, before both loops, so a value
    # assigned for one annotation/field is still set when a later one is
    # processed - confirm this carry-over is intended when flattening.
    appearance_stream_obj: Optional[StreamObject] = None

    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        # Only widget annotations are considered.
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # The field data sits either on the widget itself (it has /FT and /T)
        # or on its parent; an empty dictionary stands in for a missing parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            # Match either on the qualified field name or on the bare /T entry.
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # For choice fields, drop the /I entry before a new value is set.
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Tuple form: only the first element is the text value; it
                    # is stored on the widget annotation itself.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # A state missing from the normal appearances falls back to /Off.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    # Tuple form also carries the font id and the font size.
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in (ap:= cast(DictionaryObject, annotation[AA.AP])):
                    cast(DictionaryObject, annotation[NameObject(AA.AP)])[
                        NameObject("/N")
                    ] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Reuse the object number of the existing /N stream: the new
                    # stream replaces it in place in the writer's object list.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
            # Flattening draws the appearance stream at the first two
            # coordinates of the annotation rectangle.
            if flatten and appearance_stream_obj is not None:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])

1094

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Find widget fields of a page that are missing from the form's field
    list and register them there.

    The /AcroForm dictionary and its /Fields array are created when absent.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached: list[DictionaryObject] = []
    if page is None:
        for current_page in self.pages:
            reattached.extend(self.reattach_fields(current_page))
        return reattached

    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        form_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        form_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = form_fields

    if "/Annots" not in page:
        return reattached

    page_annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(page_annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        # Only widgets that carry their own field type are candidates.
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_attached = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in form_fields
        )
        if already_attached:
            continue
        # A direct annotation is first registered with the writer and the
        # page entry replaced by the resulting reference.
        if not was_indirect:
            page_annotations[position] = self._add_object(widget)
        form_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1144

def _collect_incremental_clone_object_ids(self, reader: PdfReader) -> list[int]:
    """
    Gather the object IDs known to the reader's cross-reference data.

    Falsy IDs (such as 0) are ignored. The number of IDs and the largest
    ID are checked against the writer's incremental clone limits.

    Args:
        reader: PdfReader whose ``xref`` and ``xref_objStm`` are inspected.

    Returns:
        The collected object IDs in ascending order.

    Raises:
        LimitReachedError: If the ID count or the highest ID exceeds the
            configured limit.

    """
    collected: set[int] = {
        object_id
        for generation_entries in reader.xref.values()
        for object_id in generation_entries
        if object_id
    }
    collected.update(object_id for object_id in reader.xref_objStm if object_id)

    total = len(collected)
    if total > self._incremental_clone_object_count_limit:
        raise LimitReachedError(
            f"Incremental clone object count {total} exceeds "
            f"maximum allowed count {self._incremental_clone_object_count_limit}."
        )

    highest = max(collected, default=0)
    if highest > self._incremental_clone_object_id_limit:
        raise LimitReachedError(
            f"Incremental clone object ID {highest} exceeds "
            f"maximum allowed ID {self._incremental_clone_object_id_limit}."
        )

    return sorted(collected)

1166

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        LimitReachedError: In incremental mode, if the reader exceeds the
            incremental clone limits.
        PdfReadError: In strict mode, if more objects exist than the
            trailer's /Size announces; also if flattening the page tree
            fails with an ``IndexError``.

    """
    self._info_obj = None
    if self.incremental:
        # Incremental mode keeps the reader's object numbering: slot
        # ``id - 1`` receives a replica of reader object ``id``.
        object_ids = self._collect_incremental_clone_object_ids(reader)
        self._objects = [None] * (object_ids[-1] if object_ids else 0)
        for object_id in object_ids:
            reader_object = reader.get_object(object_id)
            if reader_object is not None:
                self._objects[object_id - 1] = reader_object.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    if len(self._objects) > cast(int, reader.trailer["/Size"]):
        if self.strict:
            raise PdfReadError(
                f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}"
            )
        logger_warning(
            f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}",
            __name__
        )

    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]

    try:
        self._flatten()
    except IndexError as exc:
        # Chain explicitly so the traceback shows the IndexError as the
        # cause instead of an unrelated error raised during handling.
        raise PdfReadError("Got index error while flattening.") from exc

    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Outside incremental mode the page tree is rewritten as one flat
        # /Kids array directly below the pages root.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])

1220

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: its '/Root', '/Info' and '/ID'.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Optional callback invoked once per entry of the cloned
            pages root's '/Kids' array; its single argument is the
            resolved page object.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    # Without source info, _info_obj stays None as set by
    # clone_reader_document_root().
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned info dictionary as the baseline
            # for this object.
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = self._info.hash_bin()
        else:
            self._info_obj = self._add_object(
                DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            )

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())

1268

def _compute_document_identifier(self) -> ByteStringObject:
    """
    Derive an identifier from the serialized PDF structure.

    The structure is written to an in-memory buffer and a rolling checksum
    of that buffer, UTF-8 encoded, is returned as a byte string object.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1274

def generate_file_identifiers(self) -> None:
    """
    Set the two file identifiers of the PDF that will be written.

    Uniqueness is the only goal; reproducibility is not required.
    A document without identifiers gets the same freshly computed value
    for both. Otherwise the first identifier is kept and only the second
    one is recomputed, so that a matching first identifier with a
    differing second one marks a different version of the same file.
    See §14.4 "File Identifiers".
    """
    if not self._ID:
        initial = self._compute_document_identifier()
        self._ID = ArrayObject((initial, initial))
        return

    permanent = self._ID[0]
    self._ID = ArrayObject((permanent, self._compute_document_identifier()))

1294

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name an attribute of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        try:
            # "AES-256-R5" -> attribute name "AES_256_R5"
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError as exc:
            # Chain explicitly so the lookup failure is reported as the
            # cause of the ValueError.
            raise ValueError(f"Algorithm '{algorithm}' NOT supported") from exc
    else:
        alg = EncryptAlgorithm.RC4_128
        if not use_128bit:
            alg = EncryptAlgorithm.RC4_40
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    # in case call `encrypt` again
    entry = self._encryption.write_entry(user_password, owner_password)
    if self._encrypt_entry:
        # replace old encrypt_entry: the new entry takes over the object
        # number of the previous one
        assert self._encrypt_entry.indirect_reference is not None
        entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry

1353

def _resolve_links(self) -> None:
    """
    Re-target links recorded earlier so they point at the merged-in pages.

    Links whose old page cannot be found, or whose old page has no
    counterpart among the merged-in pages, are left untouched.
    """
    for new_link, old_link in self._unresolved_links:
        source_page = old_link.find_referenced_page()
        if source_page:
            target_page = self._merged_in_pages.get(source_page)
            if target_page is not None:
                new_link.patch_reference(self, target_page)

1366

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when objects were added or modified.
    Otherwise the full structure, xref table and trailer are written.

    Args:
        stream: Destination stream; a warning is logged if it exposes a
            ``mode`` attribute without ``"b"``.

    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_start = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_start)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1387

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether this method opened the file itself, and
        the stream that was written to. A stream opened by this method is
        already closed when it is returned.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        # The file is opened here, so it must also be closed here; the
        # context manager releases the handle even if writing fails.
        with FileIO(stream, "wb") as file_stream:
            self.write_stream(file_stream)
        return True, file_stream

    # A caller-provided stream stays open; only flush what was written.
    self.write_stream(stream)
    stream.flush()
    return False, stream

1419

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    List the objects an increment would contain: those added after the
    original hashes were taken and those whose hash changed since.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline = self._original_hash
    baseline_size = len(baseline)
    changed = []
    for index, candidate in enumerate(self._objects):
        if candidate is None:
            continue
        # Unchanged: present in the baseline with an identical hash.
        if index < baseline_size and candidate.hash_bin() == baseline[index]:
            continue
        changed.append(cast(IndirectObject, candidate).indirect_reference)
    return changed

1443

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an incremental update to the stream.

    Writes every object that is new or whose hash differs from the recorded
    original hash, followed by a cross-reference stream object covering
    those objects, ``startxref`` and the end-of-file marker.

    Args:
        stream: Stream to append to; ``tell()`` is used to record offsets.

    """
    # object number -> byte offset of the object in the stream
    object_positions = {}
    # [first object number, count] pairs of consecutive object numbers;
    # flattened into the /Index array below
    object_blocks = []
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        # Objects beyond the recorded hashes are new; the others are
        # written only if their hash changed.
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            if idnum != current_stop:
                # Not adjacent to the running block: close that block (if
                # one was started) and begin a new one at this object.
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    # close the last running block
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # The xref stream takes the object number after the last known object.
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        # field widths in bytes, matching the ">BIB" packing below
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is referenced only if the info object is new or was modified.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    # Link back to the reader's cross-reference position.
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One entry per written object: first field 1, the byte offset, last field 0.
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1514

def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:
    """
    Write the header and every object of the document to the stream.

    Args:
        stream: Destination stream; ``tell()`` provides the object offsets.

    Returns:
        A tuple of two lists: the byte offset of each object slot (-1 for
        an empty slot), and the numbers of the empty slots followed by a
        closing 0.

    """
    positions: list[int] = []
    free_ids: list[int] = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for index, entry in enumerate(self._objects):
        number = index + 1
        if entry is None:
            positions.append(-1)
            free_ids.append(number)
            continue
        positions.append(stream.tell())
        stream.write(f"{number} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and entry != self._encrypt_entry:
            entry = self._encryption.encrypt_object(entry, number, 0)
        entry.write_to_stream(stream)
        stream.write(b"\nendobj\n")

    free_ids.append(0)  # add 0 to loop in accordance with specification
    return positions, free_ids

1534

def _write_xref_table(
    self, stream: StreamType, object_positions: list[int], free_objects: list[int]
) -> int:
    """
    Write the cross-reference table to the stream.

    Args:
        stream: Destination stream.
        object_positions: Byte offset per object slot; a non-positive
            offset marks a free slot.
        free_objects: Numbers of the free objects, used to chain the free
            entries to each other.

    Returns:
        The stream position at which the table starts.

    """
    table_start = stream.tell()
    stream.write(b"xref\n")
    stream.write(f"0 {len(self._objects) + 1}\n".encode())
    # Entry for object 0: head of the free list.
    stream.write(f"{free_objects[0]:0>10} 65535 f \n".encode())

    next_free = 1
    for position in object_positions:
        if position <= 0:
            stream.write(f"{free_objects[next_free]:0>10} 00001 f \n".encode())
            next_free += 1
        else:
            stream.write(f"{position:0>10} 00000 n \n".encode())
    return table_start

1550

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the trailer dictionary, ``startxref`` and the end-of-file marker.

    The trailer always carries the size and the root reference; the info
    reference, the file identifiers and the encryption dictionary
    reference are added when present.

    Args:
        stream: Destination stream.
        xref_location: Position of the cross-reference table, written
            after ``startxref``.

    """
    stream.write(b"trailer\n")
    trailer_dict = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    if self._info is not None:
        trailer_dict[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer_dict[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer_dict[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer_dict.write_to_stream(stream)

    footer = f"\nstartxref\n{xref_location}\n%%EOF\n"
    stream.write(footer.encode())  # eof

1574

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Reading delegates to the parent class implementation.

    When setting, the assigned value is a dict with the entries to be set;
    assigning None removes the /Info entry from the PDF.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

1589

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # None drops the info dictionary altogether.
    if value is None:
        self._info = None
        return

    # Replace rather than merge: empty the existing dictionary first.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1602

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Merge custom entries into the document information dictionary.

    Every value is converted with ``str()`` and stored as a string object;
    the information dictionary is created if it does not exist yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    source = infos
    if isinstance(source, PdfObject):
        source = cast(DictionaryObject, source.get_object())

    converted = {}
    for key, value in list(source.items()):
        resolved = value.get_object() if isinstance(value, PdfObject) else value
        converted[NameObject(key)] = create_string_object(str(resolved))

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1622

# Sentinel used as the default of the deprecated ``remove_identicals`` and
# ``remove_orphans`` parameters of ``compress_identical_objects``, so that an
# omitted argument can be told apart from any explicitly passed value.
_UNSET = object()

1624

def compress_identical_objects(
    self,
    remove_identicals: Any = _UNSET,
    remove_orphans: Any = _UNSET,
    *,
    remove_duplicates: bool = True,
    remove_unreferenced: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Deprecated.
        remove_orphans: Deprecated.
        remove_duplicates: Remove duplicate objects.
        remove_unreferenced: Remove unreferenced objects.

    """
    # The sentinel is compared by identity: it only answers whether the
    # deprecated argument was passed at all.
    if remove_identicals is not self._UNSET:
        deprecate_with_replacement("remove_identicals", "remove_duplicates", "7.0.0")
        assert isinstance(remove_identicals, bool)
        remove_duplicates = remove_identicals
    if remove_orphans is not self._UNSET:
        deprecate_with_replacement("remove_orphans", "remove_unreferenced", "7.0.0")
        assert isinstance(remove_orphans, bool)
        remove_unreferenced = remove_orphans

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        """
        Walk a dictionary/array recursively, record every referenced object
        in ``unreferenced`` and redirect references to removed duplicates.
        """
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                unreferenced[v.idnum - 1] = False
                if v in crossref:
                    survivor = crossref[v]
                    obj[k] = survivor
                    # The surviving copy is referenced from here on. Without
                    # this mark, a survivor that had no direct reference of
                    # its own would be dropped as unreferenced below, leaving
                    # the rewritten reference dangling.
                    unreferenced[survivor.idnum - 1] = False
            else:
                # Filtering on DictionaryObject and ArrayObject is done by
                # the recursive call itself.
                replace_in_obj(v, crossref)

    # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
    self._idnum_hash = {}
    unreferenced = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_duplicates and h in self._idnum_hash:
            # Later object with a known hash: remember it as a duplicate of
            # the first one and free its slot.
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if remove_unreferenced:
        # The root, the info dictionary and the identifiers are entry
        # points and therefore always kept.
        unreferenced[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

        if not is_null_or_none(self._info):
            unreferenced[self._info.indirect_reference.idnum - 1] = False  # type: ignore

        try:
            unreferenced[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
        except AttributeError:
            # no identifiers, or they have no indirect reference
            pass

        for i in compress(range(len(self._objects)), unreferenced):
            self._objects[i] = None

1714

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference (generation 0) to an object of this writer.

    The object number is the 1-based position of ``obj`` in the writer's
    object list.
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1720

def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document, creating it if necessary.

    An existing outline entry that is not a ``TreeObject`` is converted to
    one and replaces the original object in the writer.
    """
    catalog = self._root_object
    if CO.OUTLINES not in catalog:
        fresh_root = TreeObject()
        fresh_root.update({})
        catalog[NameObject(CO.OUTLINES)] = self._add_object(fresh_root)
        return fresh_root

    # Entries in the catalog dictionary
    outline_root = cast(TreeObject, catalog[CO.OUTLINES])
    if not isinstance(outline_root, TreeObject):
        converted = TreeObject(outline_root)
        self._replace_object(outline_root.indirect_reference.idnum, converted)
        outline_root = converted
    # Sanity check: the outline must be one of the writer's objects.
    position = self._objects.index(outline_root) + 1
    assert IndirectObject(position, 0, self).get_object() == outline_root
    return outline_root

1739

def get_threads_root(self) -> ArrayObject:
    """
    Return the document's list of threads, creating an empty one if missing.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    catalog = self._root_object
    if CO.THREADS in catalog:
        # Entries in the catalog dictionary
        return cast(ArrayObject, catalog[CO.THREADS])

    created = ArrayObject()
    catalog[NameObject(CO.THREADS)] = created
    return created

1758

@property
def threads(self) -> ArrayObject:
    """
    The list of threads, exposed as a read-only property.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary holding an ``/F`` key and, optionally,
    thread information under the ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list

1770

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The outline item to insert. A page is wrapped
            into a destination titled ``page #<page number>`` with the
            default fit before insertion.
        parent: Outline item below which the new item is inserted; the
            outline root is used when None.
        before: Sibling in front of which the new item is inserted, if any.
        is_open: Stored on the item as ``/%is_open%``; for a closed item
            the parent counters are not incremented.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward all placement arguments: they apply to the wrapped
        # destination exactly as they would to any other outline item.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        # A closed item must not change the counters of its parents.
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1808

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``,
    which is then inserted through :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping with the entries of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference returned by
        :meth:`add_outline_item_destination`.

    """
    item_tree = TreeObject()
    item_tree.update(outline_item)
    # NOTE: a former special case that re-created an ``/A`` entry as a
    # separate indirect action object was disabled (marked unreachable);
    # the entry is taken over by the update() above like any other key.
    return self.add_outline_item_destination(
        item_tree,
        parent,
        before,
        is_open,
    )

1831

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            May also be a page object or an indirect reference. With
            ``None`` the item is created without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items. The outline root is used when ``None``.
        before: Forwarded to :meth:`add_outline_item_destination` as the
            insertion point among the children of ``parent``.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`, which
            records it on the item.

    Returns:
        The added outline item as an indirect object.

    """
    # Start from None so that a page_number of an unsupported type takes
    # the "can not find reference" path below instead of raising
    # UnboundLocalError.
    page_ref: Union[None, NullObject, IndirectObject, NumberObject] = None
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1909

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always; use :meth:`add_outline_item` instead.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1914

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register ``destination`` under ``title`` in the named destinations array.

    The array returned by ``get_named_dest_root`` is walked in steps of
    two (name, destination). The new pair is inserted in front of the
    first name that compares greater than ``title``; if there is none,
    the pair is appended at the end.

    Args:
        title: Name of the destination.
        destination: The destination (or a reference to it) to register.

    """
    names = self.get_named_dest_root()
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Insert the value first, then the name at the same index, so
            # that the pair ends up ordered as (name, destination).
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1928

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    The object's ``dest_array`` is added to the writer and registered
    under the object's ``/Title`` entry.

    Args:
        page_destination: Object providing ``dest_array`` and ``/Title``.

    Returns:
        The indirect reference of the added destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_array_ref)
    return dest_array_ref

1939

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination pointing to the page at ``page_number``.

    A ``/GoTo`` action dictionary is built whose ``/D`` array uses the
    ``FIT_H`` fit type with the value 826, added to the writer and
    registered under ``title``.

    Args:
        title: Name of the destination; converted to a
            ``TextStringObject`` when it is not one already.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The indirect reference of the added action dictionary.

    """
    page_ref = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    goto_entries = {
        NameObject(GoToActionArguments.D): ArrayObject(
            [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
        ),
        NameObject(GoToActionArguments.S): NameObject("/GoTo"),
    }
    goto_action = DictionaryObject()
    goto_action.update(goto_entries)

    action_ref = self._add_object(goto_action)
    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))

    self.add_named_destination_array(title, action_ref)
    return action_ref

1962

def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page is processed with ``ObjectDeletionFlag.ALL_ANNOTATIONS``.
    """
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for page in self.pages:
        self.remove_objects_from_page(page, everything)

1967

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    if isinstance(subtypes, str):
        # A lone subtype: wrap it so the membership test further down
        # compares whole names instead of doing substring matching.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # Materialize once; a one-shot iterable would otherwise be
        # exhausted after the first annotation.
        subtypes = tuple(subtypes)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

1983

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations of the given subtypes from a single page.

    Args:
        page: The page (or a reference to it) to clean up.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.
            A single string is treated as one subtype.

    """
    if isinstance(subtypes, str):
        # Avoid substring matching against a bare string.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # ``in`` is evaluated once per annotation; a one-shot iterable
        # would be consumed by the first test.
        subtypes = tuple(subtypes)

    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return
    annots = cast(ArrayObject, page[PG.ANNOTS])
    i = 0
    while i < len(annots):
        an = annots[i]
        obj = cast(DictionaryObject, an.get_object())
        if subtypes is None or cast(str, obj["/Subtype"]) in subtypes:
            if isinstance(an, IndirectObject):
                self._objects[an.idnum - 1] = NullObject()  # to reduce PDF size
            # Do not advance: the next entry moved into slot i.
            del annots[i]
        else:
            i += 1

2001

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation flags (links, attachments, 3D objects, all annotations)
    are checked first, in that order; the first one that is set is
    handled and the method returns. Otherwise the content stream of the
    page and of its form XObjects is filtered.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # text_filters must travel along, otherwise a TEXT flag inside
            # the list would delete all text instead of the filtered fonts.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped for the requested flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = [
            b"w", b"J", b"j", b"M", b"d", b"i",
            b"W", b"W*",
            b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
            b"m", b"l", b"c", b"v", b"y", b"h", b"re",
            b"sh"
        ]
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = self._remove_objects_from_page__clean_forms(
            elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters,
        )

        self._remove_objects_from_page__clean(
            content=content, images=images, forms=forms,
            jump_operators=jump_operators, to_delete=to_delete,
            text_filters=text_filters
        )
        page.replace_contents(content)
    # Kept as-is for backward compatibility although the annotation is None.
    return [], []  # type: ignore[return-value]

2066

def _remove_objects_from_page__clean(
    self,
    content: ContentStream,
    images: list[str],
    forms: list[str],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> None:
    """
    Delete the targeted operations from ``content`` in place.

    An operation is a candidate when it is an inline image (and inline
    images are to be deleted), when its operator is listed in
    ``jump_operators``, or when it is a ``Do`` of a name listed in
    ``images`` (and XObject images are to be deleted). When text is being
    deleted and ``text_filters`` is non-empty, a candidate is only removed
    while the font selected by the last ``Tf`` operator is one of
    ``text_filters["font_ids"]``.

    Args:
        content: Content stream to modify.
        images: Resource names of the images to delete.
        forms: Resource names of the forms; not used by this method.
        jump_operators: Operators to drop.
        to_delete: Flags describing what has to be deleted.
        text_filters: Optional text restrictions, see above.

    """
    current_font = None
    fonts_to_strip = []
    if text_filters and to_delete & ObjectDeletionFlag.TEXT:
        fonts_to_strip = text_filters.get("font_ids", [])

    pos = 0
    while pos < len(content.operations):
        operands, operator = content.operations[pos]
        if operator == b"Tf":
            current_font = operands[0]
        is_candidate = (
            (
                operator == b"INLINE IMAGE"
                and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
            )
            or (operator in jump_operators)
            or (
                operator == b"Do"
                and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                and (operands[0] in images)
            )
        )
        if is_candidate and (
            not to_delete & ObjectDeletionFlag.TEXT
            or not text_filters
            or current_font in fonts_to_strip
        ):
            # Stay on the same index: the following operation moved here.
            del content.operations[pos]
            continue
        pos += 1
    content.get_data()  # this ensures ._data is rebuilt from the .operations

2109

def _remove_objects_from_page__clean_forms(
    self,
    elt: DictionaryObject,
    stack: list[DictionaryObject],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> tuple[list[str], list[str]]:
    """
    Clean the XObjects referenced by ``elt`` and, for forms, their content.

    Image XObjects are replaced by a null object when XObject images are
    to be deleted; form XObjects are turned into content streams and
    cleaned recursively. When ``elt`` itself is a stream, its content is
    cleaned as well.

    Args:
        elt: Page or form whose ``/Resources`` ``/XObject`` entries are inspected.
        stack: Elements already being processed, used to stop on cycles.
        jump_operators: Operators to drop from content streams.
        to_delete: Flags describing what has to be deleted.
        text_filters: Optional text restrictions passed on to the content cleaner.

    Returns:
        The resource names of the images and of the forms found in ``elt``.

    """
    # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
    already_seen = elt in stack
    if not already_seen and hasattr(elt, "indirect_reference"):
        already_seen = any(
            elt.indirect_reference == getattr(seen, "indirect_reference", -1)
            for seen in stack
        )
    if already_seen:
        # to prevent infinite looping
        return [], []  # pragma: no cover

    try:
        resources = cast(DictionaryObject, elt["/Resources"])
        xobjects = cast(dict[Any, Any], resources["/XObject"])
    except KeyError:
        xobjects = {}

    images = []
    forms = []
    for name, ref in xobjects.items():
        xobj = ref.get_object()
        try:
            content: Any = None
            subtype = xobj["/Subtype"]
            if to_delete & ObjectDeletionFlag.XOBJECT_IMAGES and subtype == "/Image":
                content = NullObject()  # to delete the image keeping the entry
                images.append(name)
            if subtype == "/Form":
                forms.append(name)
                if isinstance(xobj, ContentStream):
                    content = xobj
                else:
                    content = ContentStream(xobj, self)
                    kept_entries = {
                        key: value
                        for key, value in xobj.items()
                        if key not in ["/Length", "/Filter", "/DecodeParms"]
                    }
                    content.update(kept_entries)
                    try:
                        content.indirect_reference = xobj.indirect_reference
                    except AttributeError:  # pragma: no cover
                        pass
                stack.append(elt)

                # clean subforms
                self._remove_objects_from_page__clean_forms(
                    elt=content,
                    stack=stack,
                    jump_operators=jump_operators,
                    to_delete=to_delete,
                    text_filters=text_filters,
                )
            if content is not None:
                if isinstance(ref, IndirectObject):
                    self._objects[ref.idnum - 1] = content
                else:
                    # should only occur in a PDF not respecting PDF spec
                    # where streams must be indirected.
                    xobjects[name] = self._add_object(content)  # pragma: no cover
        except (TypeError, KeyError):
            # Entries that cannot be inspected are left untouched.
            pass

    for image_name in images:
        del xobjects[image_name]  # for clean-up

    if isinstance(elt, StreamObject):  # for /Form
        if not isinstance(elt, ContentStream):  # pragma: no cover
            as_content = ContentStream(elt, self)
            as_content.update(elt.items())
            elt = as_content
        # clean the content
        self._remove_objects_from_page__clean(
            content=elt,
            images=images,
            forms=forms,
            jump_operators=jump_operators,
            to_delete=to_delete,
            text_filters=text_filters,
        )
    return images, forms

2192

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    The requested ``ImageType`` flags are translated into the
    ``ObjectDeletionFlag`` members of the same name and applied to every page.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types)

    """
    if isinstance(to_delete, bool):
        # A boolean argument is treated as "all image types".
        to_delete = ImageType.ALL

    deletion_flags = ObjectDeletionFlag.NONE
    for member in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member]:
            deletion_flags |= ObjectDeletionFlag[member]

    for page in self.pages:
        self.remove_objects_from_page(page, deletion_flags)

2216

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """

    # Recursively loop through page objects to gather font info
    def collect_fonts(
        node: Any,
        found: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in node:
                found = collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                found = collect_fonts(child, found)
        return found

    wanted_fonts = font_names or []

    for page in self.pages:
        resource_ids_to_remove = []
        text_filters = {}

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted_fonts:
            page_fonts = collect_fonts(page.get("/Resources"))
            # Add relevant resource names for removal
            for wanted in wanted_fonts:
                if wanted in page_fonts:
                    resource_ids_to_remove.extend(page_fonts[wanted]["resource_ids"])
            text_filters["font_ids"] = resource_ids_to_remove

        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2273

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. Only the first
            three entries and an optional fourth dash-pattern array are
            used. If omitted, the border array ``[2 2 2]`` is written.

    Raises:
        ValueError: ``rect`` is a string that does not hold four numbers.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into its four coordinates; a plain
        # NumberObject(rect) cannot represent a rectangle.
        coordinates = rect.strip().strip("[]").split()
        if len(coordinates) != 4:
            raise ValueError(
                f"rect must contain four numbers, got {rect!r}"
            )
        rect = RectangleObject(tuple(float(value) for value in coordinates))
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    # Attach the link annotation to the page, creating /Annots if needed.
    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2340

2341 _valid_layouts = (

2342 "/NoLayout",

2343 "/SinglePage",

2344 "/OneColumn",

2345 "/TwoColumnLeft",

2346 "/TwoColumnRight",

2347 "/TwoPageLeft",

2348 "/TwoPageRight",

2349 )

2350

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the catalog, or ``None`` if it is missing."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)

2356

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; if
    it is not one of the known layouts, a warning is logged but the value
    is stored anyway.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # The separator must be ", "; the former expression
            # {'', ''.join(...)} rendered a tuple in the message.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2391

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout.

    This is a public wrapper around ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)

2419

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading it returns the ``/PageLayout`` entry of the catalog, or
    ``None`` when no layout is stored.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

2444

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same helper as set_page_layout().
    apply_layout = self._set_page_layout
    apply_layout(layout)

2448

2449 _valid_modes = (

2450 "/UseNone",

2451 "/UseOutlines",

2452 "/UseThumbs",

2453 "/FullScreen",

2454 "/UseOC",

2455 "/UseAttachments",

2456 )

2457

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the catalog, or ``None`` if it is missing."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)

2463

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading it returns the ``/PageMode`` entry of the catalog, or
    ``None`` when no mode is stored.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

2486

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # A value that is not yet a NameObject is converted; unknown names
    # only trigger a warning and are stored nevertheless.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2498

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an int nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    new_annotation = cast(DictionaryObject, _pdf_objectify(annotation))
    new_annotation[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = (
        new_annotation.get("/Subtype") == "/Link" and "/Dest" in new_annotation
    )
    if is_internal_link:
        raw_dest = cast(dict[Any, Any], new_annotation[NameObject("/Dest")])
        link_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )  # I have no clue why this dict-hack is necessary
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        new_annotation[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(new_annotation))

    # A popup is also registered on its parent annotation.
    is_popup = (
        new_annotation.get("/Subtype") == "/Popup"
        and NameObject("/Parent") in new_annotation
    )
    if is_popup:
        popup_parent = cast(DictionaryObject, new_annotation["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = new_annotation.indirect_reference

    return new_annotation

2552

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` given as a name is converted;
    otherwise the ``/D`` entry of its ``/A`` action is converted, if it
    is a name.

    Args:
        page: The page (or a reference to it) to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2579

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or an
            object providing ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and, for a ``PdfReader`` with encryption,
        its encryption object (``None`` otherwise).

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    # A string or Path is assumed to be a file location.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    # For a reader, copy its whole stream and put the position back.
    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption if fileobj._encryption else None
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        reader_copy = BytesIO(fileobj.stream.read())

        # reset the stream to its original location
        fileobj.stream.seek(saved_position)
        return reader_copy, encryption_obj

    # Any file-like object is read from its beginning.
    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2615

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file. If a tuple, list or ``PageRange`` is given
            here, it is used as ``pages`` and the following arguments
            are shifted accordingly.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    outline_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection sits in the outline_item slot: every following
        # argument is shifted by one position.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2683

2684 def merge(

2685 self,

2686 position: Optional[int],

2687 fileobj: Union[Path, StrByteType, PdfReader],

2688 outline_item: Optional[str] = None,

2689 pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,

2690 import_outline: bool = True,

2691 excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),

2692 ) -> None:

2693 """

2694 Merge the pages from the given file into the output file at the

2695 specified page number.

2696

2697 Args:

2698 position: The *page number* to insert this file. File will

2699 be inserted after the given number.

2700 fileobj: A File Object or an object that supports the standard

2701 read and seek methods similar to a File Object. Could also be a

2702 string representing a path to a PDF file.

2703 outline_item: Optionally, you may specify a string to build an outline

2704 (aka 'bookmark') to identify the

2705 beginning of the included file.

2706 pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`

2707 or a ``(start, stop[, step])`` tuple

2708 or a list of pages to be processed

2709 to merge only the specified range of pages from the source

2710 document into the output document.

2711 import_outline: You may prevent the source document's

2712 outline (collection of outline items, previously referred to as

2713 'bookmarks') from being imported by specifying this as ``False``.

2714 excluded_fields: provide the list of fields/keys to be ignored

2715 if ``/Annots`` is part of the list, the annotation will be ignored

2716 if ``/B`` is part of the list, the articles will be ignored

2717

2718 Raises:

2719 TypeError: The pages attribute is not configured properly

2720

2721 """

2722 if isinstance(fileobj, PdfDocCommon):

2723 reader = fileobj

2724 else:

2725 stream, _encryption_obj = self._create_stream(fileobj)

2726 # Create a new PdfReader instance using the stream

2727 # (either file or BytesIO or StringIO) created above

2728 reader = PdfReader(stream, strict=False) # type: ignore[arg-type]

2729

2730 if excluded_fields is None:

2731 excluded_fields = ()

2732 # Find the range of pages to merge.

2733 if pages is None:

2734 pages = list(range(len(reader.pages)))

2735 elif isinstance(pages, PageRange):

2736 pages = list(range(*pages.indices(len(reader.pages))))

2737 elif isinstance(pages, list):

2738 pass # keep unchanged

2739 elif isinstance(pages, tuple) and len(pages) <= 3:

2740 pages = list(range(*pages))

2741 elif not isinstance(pages, tuple):

2742 raise TypeError(

2743 '"pages" must be a tuple of (start, stop[, step]) or a list'

2744 )

2745

2746 srcpages = {}

2747 for page in pages:

2748 if isinstance(page, PageObject):

2749 pg = page

2750 else:

2751 pg = reader.pages[page]

2752 assert pg.indirect_reference is not None

2753 if position is None:

2754 # numbers in the exclude list identifies that the exclusion is

2755 # only applicable to 1st level of cloning

2756 srcpages[pg.indirect_reference.idnum] = self.add_page(

2757 pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2758 )

2759 else:

2760 srcpages[pg.indirect_reference.idnum] = self.insert_page(

2761 pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2762 )

2763 position += 1

2764 srcpages[pg.indirect_reference.idnum].original_page = pg

2765

2766 reader._named_destinations = (

2767 reader.named_destinations

2768 ) # need for the outline processing below

2769

2770 arr: Any

2771

2772 for dest in reader._named_destinations.values():

2773 self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages)

2774

2775 outline_item_typ: TreeObject

2776 if outline_item is not None:

2777 outline_item_typ = cast(

2778 "TreeObject",

2779 self.add_outline_item(

2780 TextStringObject(outline_item),

2781 next(iter(srcpages.values())).indirect_reference,

2782 fit=PAGE_FIT,

2783 ).get_object(),

2784 )

2785 else:

2786 outline_item_typ = self.get_outline_root()

2787

2788 _ro = reader.root_object

2789 if import_outline and CO.OUTLINES in _ro:

2790 outline = self._get_filtered_outline(

2791 _ro.get(CO.OUTLINES, None), srcpages, reader

2792 )

2793 self._insert_filtered_outline(

2794 outline, outline_item_typ, None

2795 ) # TODO: use before parameter

2796

2797 if "/Annots" not in excluded_fields:

2798 for pag in srcpages.values():

2799 lst = self._insert_filtered_annotations(

2800 pag.original_page.get("/Annots", []), pag, srcpages, reader

2801 )

2802 if len(lst) > 0:

2803 pag[NameObject("/Annots")] = lst

2804 self.clean_page(pag)

2805

2806 if "/AcroForm" in _ro and not is_null_or_none(_ro["/AcroForm"]):

2807 if "/AcroForm" not in self._root_object:

2808 self._root_object[NameObject("/AcroForm")] = self._add_object(

2809 cast(

2810 DictionaryObject,

2811 reader.root_object["/AcroForm"],

2812 ).clone(self, False, ("/Fields",))

2813 )

2814 arr = ArrayObject()

2815 else:

2816 arr = cast(

2817 ArrayObject,

2818 cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],

2819 )

2820 trslat = self._id_translated[id(reader)]

2821 try:

2822 for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore

2823 try:

2824 ind = IndirectObject(trslat[f.idnum], 0, self)

2825 if ind not in arr:

2826 arr.append(ind)

2827 except KeyError:

2828 # for trslat[] which mean the field has not be copied

2829 # through the page

2830 pass

2831 except KeyError: # for /Acroform or /Fields are not existing

2832 arr = self._add_object(ArrayObject())

2833 cast(DictionaryObject, self._root_object["/AcroForm"])[

2834 NameObject("/Fields")

2835 ] = arr

2836

2837 if "/B" not in excluded_fields:

2838 self.add_filtered_articles("", srcpages, reader)

2839

def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Register one named destination of ``reader`` in this writer.

    Nothing is added when the title is already listed under this writer's
    ``/Names`` -> ``/Dests`` -> ``/Names`` entry, when the destination has no
    page, or when its page is not one of the pages recorded in ``srcpages``.

    Args:
        dest: Named destination read from ``reader``. Entry 0 of its
            ``dest_array`` is overwritten to target the merged page.
        reader: Document the destination was read from.
        srcpages: Merged pages, keyed by the ``idnum`` of the source page.

    """
    target_array: Any = dest.dest_array

    if "/Names" in self._root_object:
        title = dest["/Title"]
        names_entry = cast(DictionaryObject, self._root_object["/Names"])
        dests_entry = cast(
            DictionaryObject,
            names_entry.get("/Dests", DictionaryObject()),
        )
        known_titles = cast(list[Any], dests_entry.get("/Names", DictionaryObject()))
        if title in known_titles:
            # already exists: should not duplicate it
            return

    target_page = dest["/Page"]
    if target_page is None or isinstance(target_page, NullObject):
        return

    if isinstance(target_page, int):
        # the page reference is a page number normally not a PDF Reference
        # page numbers as int are normally accepted only in external goto
        try:
            source_page = reader.pages[target_page]
        except IndexError:
            return
        assert source_page.indirect_reference is not None
        try:
            target_array[NumberObject(0)] = NumberObject(
                srcpages[source_page.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], target_array)
        except KeyError:
            # the referenced page is not part of the merged pages
            pass
        return

    source_id = target_page.indirect_reference.idnum
    if source_id in srcpages:
        target_array[NumberObject(0)] = srcpages[source_id].indirect_reference
        self.add_named_destination_array(dest["/Title"], target_array)

2873

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The source articles are walked from ``thread["/F"]`` along ``/N`` until
    the first article is reached again. For every article whose page
    (``/P``) has a clone in ``pages``, a new article dictionary is created,
    linked to the previously created one through ``/N`` / ``/V`` and
    appended to the ``/B`` array of the cloned page.

    Args:
        thread: Thread dictionary of the source document.
        pages: Cloned pages, keyed by the ``idnum`` of the source page.
        reader: Source document, forwarded to ``_get_cloned_page``.

    Returns:
        The added thread as an indirect reference

    """
    # use of clone to keep link between reader and writer; /F is rebuilt below
    cloned_thread = thread.clone(self, force_duplicate=True, ignore_fields=("/F",))
    self.threads.append(cloned_thread.indirect_reference)

    source_first = cast("DictionaryObject", thread["/F"])
    source_current: Optional[DictionaryObject] = source_first
    latest: Optional[DictionaryObject] = None
    while source_current is not None:
        cloned_page_ref = self._get_cloned_page(
            cast("PageObject", source_current["/P"]), pages, reader
        )
        if cloned_page_ref is not None:
            if latest is None:
                # first kept article: it becomes the /F entry of the new thread
                latest = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                head = latest
                cloned_thread[NameObject("/F")] = latest.indirect_reference
            else:
                # later article: point back (/V) to the previous one and
                # make the previous one point forward (/N) to it
                successor = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject({NameObject("/V"): latest.indirect_reference})
                    ).get_object(),
                )
                latest[NameObject("/N")] = successor.indirect_reference
                latest = successor
            latest[NameObject("/P")] = cloned_page_ref
            latest[NameObject("/T")] = cloned_thread.indirect_reference
            latest[NameObject("/R")] = source_current["/R"]
            cloned_page = cast("PageObject", cloned_page_ref.get_object())
            if "/B" not in cloned_page:
                cloned_page[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", cloned_page["/B"]).append(latest.indirect_reference)
        source_current = cast("DictionaryObject", source_current["/N"])
        if source_current == source_first:
            # back at the start: close the ring of new articles.
            # NOTE(review): if no article was kept, ``head`` is unbound here
            # and this raises - confirm callers always supply a matching page.
            latest[NameObject("/N")] = head.indirect_reference  # type: ignore
            head[NameObject("/V")] = latest.indirect_reference  # type: ignore
            source_current = None
    assert cloned_thread.indirect_reference is not None
    return cloned_thread.indirect_reference

2938

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (or its source string) matched against thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For every page in ``pages`` the ``/B`` entries of its ``original_page``
    are inspected. The thread (``/T``) of each entry is cloned through
    ``_add_articles_thread`` when it is not yet present in the translation
    table kept for ``reader`` and its ``/I`` -> ``/Title`` matches ``fltr``.

    Args:
        fltr: Compiled pattern or pattern string searched in the thread
            title. Any other value is replaced by an empty pattern.
        pages: Cloned pages, keyed by the ``idnum`` of the source page.
        reader: Source document the pages were cloned from.

    """
    if isinstance(fltr, str):
        title_pattern = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        title_pattern = fltr
    else:
        title_pattern = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for article_ref in source_page.get("/B", ()):
            article = article_ref.get_object()
            if is_null_or_none(article):
                continue
            thread = article.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # this thread has already been copied into the writer
                continue
            thread_title = (thread.get("/I", {})).get("/Title", "")
            if title_pattern.search(thread_title):
                self._add_articles_thread(thread, pages, reader)

2974

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Look up the clone of a source page.

    Args:
        page: Source page, given either as a ``/Page`` dictionary or as an
            indirect reference. ``None``, ``NullObject`` and any other
            object yield ``None``.
        pages: Cloned pages, keyed by the ``idnum`` of the source page.
        reader: Source document; not used by the lookup itself.

    Returns:
        The indirect reference of the cloned page, or ``None`` if ``page``
        cannot be resolved to an entry of ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # None or an object that is not a page: nothing to look up
        return None
    if source_ref is None:
        # a page dictionary that was never registered as an indirect object
        return None
    cloned = pages.get(source_ref.idnum)
    if cloned is None:
        return None
    return cloned.indirect_reference

2991

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations that remain meaningful in this writer.

    * An annotation with a ``/Dest`` entry is kept when the destination is a
      name known to ``get_named_dest_root()`` or an array whose first entry
      is a page having a clone in ``pages``.
    * A ``/Link`` annotation with a ``/GoTo`` action and no ``/Dest`` is
      kept under the same conditions, applied to the ``/D`` entry of ``/A``.
    * Every other annotation is cloned unconditionally.

    Array destinations are rewritten to point to the cloned page.

    Args:
        annots: Source annotations, or an indirect reference to them.
        page: Cloned page receiving the annotations; not used here.
        pages: Cloned pages, keyed by the ``idnum`` of the source page.
        reader: Source document, forwarded to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` holding the kept annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for annot_ref in annots:
        annot = cast("DictionaryObject", annot_ref.get_object())
        # evaluated first so that lookups happen in the same order as before
        not_goto_link = (
            annot["/Subtype"] != "/Link"  # type: ignore[comparison-overlap]
            or "/A" not in annot
            or cast("DictionaryObject", annot["/A"])["/S"] != "/GoTo"  # type: ignore[comparison-overlap]
        )
        if "/Dest" in annot:
            target = annot["/Dest"]
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
            else:
                target = cast("ArrayObject", target)
                cloned_page = self._get_cloned_page(target[0], pages, reader)
                if cloned_page is not None:
                    annot_copy = annot.clone(self, ignore_fields=("/Dest",))
                    annot_copy[NameObject("/Dest")] = ArrayObject([cloned_page, *target[1:]])
                    kept.append(self._add_object(annot_copy))
        elif not_goto_link:
            kept.append(self._add_object(annot.clone(self)))
        else:
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if is_null_or_none(target):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
            else:
                target = cast("ArrayObject", target)
                cloned_page = self._get_cloned_page(target[0], pages, reader)
                if cloned_page is not None:
                    annot_copy = annot.clone(self, ignore_fields=("/D",))
                    action = cast("DictionaryObject", annot_copy["/A"])
                    action[NameObject("/D")] = ArrayObject([cloned_page, *target[1:]])
                    kept.append(self._add_object(annot_copy))
    return kept

3048

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is an ``/Outlines`` dictionary, or that has no ``/Title``,
    is only used to reach its ``/First`` child. Otherwise ``node`` and its
    ``/Next`` siblings are converted with ``reader._build_outline_item``;
    an item is kept when its page has a clone in ``pages`` or when at
    least one of its children is kept.

    Args:
        node: Outline node of the source document (may be ``None``).
        pages: Cloned pages, keyed by the ``idnum`` of the source page.
        reader: Source document owning the outline.

    Returns:
        A list of destination objects. Each one carries its kept children
        in ``_filtered_children``.

    """
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # container node: continue with its first child, if any
        first_child = node.get("/First", None)
        if first_child is None:
            return []
        return list(self._get_filtered_outline(first_child.get_object(), pages, reader))

    kept_items = []
    cloned_ref: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_ref = self._get_cloned_page(cast("PageObject", item["/Page"]), pages, reader)
        if cloned_ref is None:
            cloned_ref = NullObject()
        item[NameObject("/Page")] = cloned_ref
        if "/First" in node:
            children = self._get_filtered_outline(node["/First"], pages, reader)
        else:
            children = []
        item._filtered_children = children
        if not isinstance(item["/Page"], NullObject) or len(children) > 0:
            kept_items.append(item)
        node = node.get("/Next", None)
    return kept_items

3100

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in this writer from ``dest``.

    The title is always copied. When ``dest`` has a page, either the
    ``/A`` action of the source node is cloned or, if there is none, the
    destination array is stored as ``/Dest``. ``/F`` and ``/C`` are taken
    from the source node, defaulting to ``0`` and to three ``0.0`` values.

    Args:
        dest: Outline item to copy.

    Returns:
        The new outline item, already registered with ``_add_object``.

    """
    cloned_item = TreeObject()
    self._add_object(cloned_item)
    cloned_item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    has_page = not isinstance(dest["/Page"], NullObject)
    if has_page:
        if dest.node is not None and "/A" in dest.node:
            cloned_item[NameObject("/A")] = dest.node["/A"].clone(self)
        else:
            cloned_item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if dest.node is not None:
        flags = dest.node.get("/F", 0)
        cloned_item[NameObject("/F")] = NumberObject(flags)
        default_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        cloned_item[NameObject("/C")] = ArrayObject(dest.node.get("/C", default_color))
    return cloned_item

3119

def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, and recursively their children, under ``parent``.

    Args:
        outlines: Items to insert; their children are read from
            ``_filtered_children``.
        parent: Outline node receiving the items.
        before: Passed to ``insert_child`` for every item of ``outlines``.
            Children are always inserted with ``None``.

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = dest.get("/Type", "") == "/Outlines" or "/Title" not in dest
        if is_container:
            # not a real item: its children are attached directly to ``parent``
            child_parent = parent
        else:
            child_parent = self._clone_outline(dest)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(child_parent, before, self)
        self._insert_filtered_outline(dest._filtered_children, child_parent, None)

3135

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3139

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Search the outline tree for ``outline_item``.

    A node matches when its ``indirect_reference`` or its ``/Title`` equals
    ``outline_item``. Siblings are followed through ``/Next`` and children
    through ``/First``.

    Args:
        outline_item: Value compared with the reference and title of each node.
        root: Node to start from; the writer's outline root when ``None``.

    Returns:
        The sibling indexes leading to the matching node, one per level
        (levels whose node has no ``/Title`` are omitted), or ``None`` when
        nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    position = 0
    while node is not None:
        is_match = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if is_match:
            return [position]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                prefix = [position] if "/Title" in node else []
                return prefix + sub_path
        if "/Next" not in node:
            return None
        position += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover

3169

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither ``None``, a ``PdfReader`` nor an
            ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        source = reader
    elif isinstance(reader, IndirectObject):
        source = reader.pdf
    else:
        raise Exception(f"invalid parameter {reader}")
    # a reader without a table is silently ignored
    self._id_translated.pop(id(source), None)

3197

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0;
            must be lower than the number of pages
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               ``0`` (the default) and ``None`` are accepted as they are;
               any other value must be greater than or equal to 1.

    Raises:
        ValueError: One of the arguments violates the constraints above.

    """
    no_label_content = style is None and prefix is None
    if no_label_content:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")
    start_is_explicit = start is not None and start != 0
    if start_is_explicit and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3248

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               With ``0`` (the default) or ``None`` no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is allowed by the signature and means "not given", like 0
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    labels_key = NameObject(CatalogDictionary.PAGE_LABELS)
    if labels_key not in self._root_object:
        # no labels yet: start with a decimal label covering the document
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[labels_key] = page_labels

    page_labels = cast(TreeObject, self._root_object[labels_key])
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # restore the default label right after the range unless a label
    # already starts there or the range reaches the last page
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[labels_key] = page_labels

3313

3314 def _repr_mimebundle_(

3315 self,

3316 include: Union[None, Iterable[str]] = None,

3317 exclude: Union[None, Iterable[str]] = None,

3318 ) -> dict[str, Any]:

3319 """

3320 Integration into Jupyter Notebooks.

3321

3322 This method returns a dictionary that maps a mime-type to its

3323 representation.

3324

3325 .. seealso::

3326

3327 https://ipython.readthedocs.io/en/stable/config/integrating.html

3328 """

3329 pdf_data = BytesIO()

3330 self.write(pdf_data)

3331 data = {

3332 "application/pdf": pdf_data,

3333 }

3334

3335 if include is not None:

3336 # Filter representations based on include list

3337 data = {k: v for k, v in data.items() if k in include}

3338

3339 if exclude is not None:

3340 # Remove representations based on exclude list

3341 data = {k: v for k, v in data.items() if k not in exclude}

3342

3343 return data

3344

3345

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching ``PdfObject``.

    * a ``PdfObject`` is returned unchanged
    * ``dict`` -> ``DictionaryObject`` (keys become ``NameObject``, values
      are converted recursively)
    * ``str`` -> ``NameObject`` when it starts with ``/``, otherwise
      ``TextStringObject``
    * ``float`` / ``int`` -> ``FloatObject``
    * ``list`` -> ``ArrayObject`` (items are converted recursively)

    Raises:
        NotImplementedError: ``obj`` has none of the types above.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(_pdf_objectify(item) for item in obj)
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3365

3366

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: Reference stored as ``/A``; omitted when ``None``.
        title: Text stored as ``/Title``.
        color: Color stored as ``/C``, either as three components or as a
            string converted with ``hex_to_rgb``; omitted when falsy.
        italic: Add ``OutlineFontFlag.italic`` to ``/F``.
        bold: Add ``OutlineFontFlag.bold`` to ``/F``.

    Returns:
        The new outline item. ``/F`` is only present when ``italic`` or
        ``bold`` is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item[NameObject("/Title")] = create_string_object(title)
    if color:
        components = hex_to_rgb(color) if isinstance(color, str) else color
        item[NameObject("/C")] = ArrayObject([FloatObject(c) for c in components])
    if italic or bold:
        requested = ((italic, OutlineFontFlag.italic), (bold, OutlineFontFlag.bold))
        format_flag = sum(flag for enabled, flag in requested if enabled)
        item[NameObject("/F")] = NumberObject(format_flag)
    return item

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%

1450 statements