Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import sys

36import uuid

37from collections.abc import Iterable, Mapping

38from io import BytesIO, FileIO, IOBase

39from itertools import compress

40from pathlib import Path

41from re import Pattern

42from types import TracebackType

43from typing import (

44 IO,

45 Any,

46 Callable,

47 Optional,

48 Union,

49 cast,

50)

52if sys.version_info >= (3, 11):

53 from typing import Self

54else:

55 from typing_extensions import Self

57from ._doc_common import DocumentInformation, PdfDocCommon

58from ._encryption import EncryptAlgorithm, Encryption

59from ._page import PageObject, Transformation

60from ._page_labels import nums_clear_range, nums_insert, nums_next

61from ._reader import PdfReader

62from ._utils import (

63 StrByteType,

64 StreamType,

65 _get_max_pdf_version_header,

66 deprecate_with_replacement,

67 deprecation_no_replacement,

68 logger_warning,

69)

70from .constants import AnnotationDictionaryAttributes as AA

71from .constants import CatalogAttributes as CA

72from .constants import (

73 CatalogDictionary,

74 GoToActionArguments,

75 ImageType,

76 InteractiveFormDictEntries,

77 OutlineFontFlag,

78 PageLabelStyle,

79 PagesAttributes,

80 TypFitArguments,

81 UserAccessPermissions,

82)

83from .constants import Core as CO

84from .constants import FieldDictionaryAttributes as FA

85from .constants import PageAttributes as PG

86from .constants import TrailerKeys as TK

87from .errors import LimitReachedError, PdfReadError, PyPdfError

88from .generic import (

89 PAGE_FIT,

90 ArrayObject,

91 BooleanObject,

92 ByteStringObject,

93 ContentStream,

94 Destination,

95 DictionaryObject,

96 EmbeddedFile,

97 Fit,

98 FloatObject,

99 IndirectObject,

100 NameObject,

101 NullObject,

102 NumberObject,

103 PdfObject,

104 RectangleObject,

105 ReferenceLink,

106 StreamObject,

107 TextStringObject,

108 TreeObject,

109 ViewerPreferences,

110 create_string_object,

111 extract_links,

112 hex_to_rgb,

113 is_null_or_none,

114)

115from .generic._appearance_stream import TextStreamAppearance

116from .pagerange import PageRange, PageRangeSpec

117from .types import (

118 AnnotationSubtype,

119 BorderArrayType,

120 LayoutType,

121 OutlineItemType,

122 OutlineType,

123 PagemodeType,

124)

125from .xmp import XmpInformation

126

# Module-level constant holding the result of UserAccessPermissions.all().
# NOTE(review): presumably the combination of every permission flag; confirm
# against the UserAccessPermissions definition in .constants.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

128

129

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags identifying categories of objects.

    Every basic member occupies its own bit, so members can be combined with
    ``|``. ``IMAGES`` is the union of the three image-related flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite value covering every image flavour declared above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

141

142

143def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:

144 hash = hashlib.md5(usedforsecurity=False)

145 for block in iter(lambda: stream.read(blocksize), b""):

146 hash.update(block)

147 return hash.hexdigest()

148

149

150class PdfWriter(PdfDocCommon):

151 """

152 Write a PDF file out, given pages produced by another class or through

153 cloning a PDF file during initialization.

154

155 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

156

157 Args:

158 clone_from: identical to fileobj (for compatibility)

159

160 incremental: If true, loads the document and set the PdfWriter in incremental mode.

161

162 When writing incrementally, the original document is written first and new/modified

163 content is appended. To be used for signed document/forms to keep signature valid.

164

165 full: If true, loads all the objects (always full if incremental = True).

166 This parameter may allow loading large PDFs.

167

168 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.

169 If false, pypdf will try to be forgiving and do something reasonable, but it will log

170 a warning message. It is a best-effort approach.

171

172 """

173

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
    *,
    incremental_clone_object_count_limit: Optional[int] = 500_000,
    incremental_clone_object_id_limit: Optional[int] = 1_000_000,
) -> None:
    """
    Set up the writer state and optionally clone an existing document.

    Args:
        fileobj: Source to clone from, or the output target when the writer
            is used as a context manager (it is kept in ``temp_fileobj``).
        clone_from: Explicit source to clone from; takes precedence over
            ``fileobj`` when both are given.
        incremental: Start the writer in incremental mode.
        full: Also selects the reader-based loading branch; ``incremental``
            is reset to ``False`` at the end when only ``full`` was set.
        strict: Stored as ``self.strict``.
        incremental_clone_object_count_limit: Stored on the instance; any
            non-int value (e.g. ``None``) is replaced by ``sys.maxsize``.
        incremental_clone_object_id_limit: Stored on the instance; any
            non-int value (e.g. ``None``) is replaced by ``sys.maxsize``.

    Raises:
        PyPdfError: In the incremental branch, if ``fileobj`` is not a path,
            a ``BytesIO`` or a ``PdfReader``.

    """
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    # ``full`` also takes the incremental loading branch below; the flag is
    # corrected at the end of __init__ when only ``full`` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Only declared here; it is assigned in the non-incremental branch below.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    defined by Info in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    # Security parameters.
    # A non-int limit (such as None) is stored as sys.maxsize.
    self._incremental_clone_object_count_limit = (
        incremental_clone_object_count_limit
        if isinstance(incremental_clone_object_count_limit, int)
        else sys.maxsize
    )
    self._incremental_clone_object_id_limit = (
        incremental_clone_object_id_limit if isinstance(incremental_clone_object_id_limit, int) else sys.maxsize
    )

    if self.incremental:
        # Normalise the input step by step: path -> BytesIO -> PdfReader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        # Fresh /Info dictionary that only names the producer.
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which source, if any, the writer should be cloned from.
        # An empty-string fileobj or an explicit clone_from short-circuits
        # and returns clone_from unchanged.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        if isinstance(fileobj, (str, Path)):
            # A missing or zero-byte file is not cloned.
            fileobj_path = Path(fileobj)
            if not fileobj_path.exists() or fileobj_path.stat().st_size == 0:
                cloning = False
        elif isinstance(fileobj, (IOBase, BytesIO)):
            # Measure the stream by seeking to its end, then restore the
            # previous position; an empty stream is not cloned.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        # Anything that is not already a reader gets wrapped in one first.
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # No source document: register the empty page tree and a catalog
        # that points to it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # ``full`` alone only selected the loading branch above; the writer
    # itself is not left in incremental mode.
    if full and not incremental:
        self.incremental = False
    # Replace text-string entries of the file identifier by byte strings
    # built from their original bytes.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

323

# for commonality
@property
def is_encrypted(self) -> bool:
    """
    Encryption state of the document; always ``False`` for this class.

    The property exists so that this class offers the same attribute as
    :class:`PdfReader<pypdf.PdfReader>`.
    """
    return False

334

@property
def root_object(self) -> DictionaryObject:
    """
    Return the dictionary stored in ``self._root_object``.

    Note:
        Intended for reading; the returned object is the live dictionary,
        not a copy.

    """
    return self._root_object

345

346 @property

347 def _info(self) -> Optional[DictionaryObject]:

348 """

349 Provide access to "/Info". Standardized with PdfReader.

350

351 Returns:

352 /Info Dictionary; None if the entry does not exist

353

354 """

355 return (

356 None

357 if self._info_obj is None

358 else cast(DictionaryObject, self._info_obj.get_object())

359 )

360

361 @_info.setter

362 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:

363 if value is None:

364 try:

365 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore[union-attr]

366 except (KeyError, AttributeError):

367 pass

368 self._info_obj = None

369 else:

370 if self._info_obj is None:

371 self._info_obj = self._add_object(DictionaryObject())

372 obj = cast(DictionaryObject, self._info_obj.get_object())

373 obj.clear()

374 obj.update(cast(DictionaryObject, value.get_object()))

375

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    return cast(XmpInformation, self.root_object.xmp_metadata)


@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """
    Store, replace or remove the "/Metadata" entry of the root object.

    ``None`` removes the entry. Otherwise the bytes (or the stream data of
    an ``XmpInformation``) are written into the metadata stream, which is
    created and registered first if the root has no indirect "/Metadata".
    """
    catalog = self.root_object
    if value is None:
        if "/Metadata" in catalog:
            del catalog["/Metadata"]
        return

    current = catalog.get("/Metadata", None)
    if isinstance(current, IndirectObject):
        stream = cast(StreamObject, current.get_object())
    else:
        # A direct (non-indirect) entry is dropped and replaced by a new
        # stream registered with this writer.
        if current is not None:
            del catalog["/Metadata"]
        stream = StreamObject()
        stream_ref = self._add_object(stream)
        catalog[NameObject("/Metadata")] = stream_ref

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    stream.set_data(payload)

404

@property
def with_as_usage(self) -> bool:
    """Deprecated attribute; reading it only reports the deprecation."""
    deprecation_no_replacement("with_as_usage", "5.0")


@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated attribute; the assigned value is ignored."""
    deprecation_no_replacement("with_as_usage", "5.0")

412

413 def __enter__(self) -> Self:

414 """Store how writer is initialized by 'with'."""

415 c: bool = self._cloned

416 t = self.temp_fileobj

417 self.__init__() # type: ignore[misc]

418 self._cloned = c

419 self._with_as_usage = True

420 self.fileobj = t # type: ignore[assignment]

421 return self

422

423 def __exit__(

424 self,

425 exc_type: Optional[type[BaseException]],

426 exc: Optional[BaseException],

427 traceback: Optional[TracebackType],

428 ) -> None:

429 """Write data to the fileobj."""

430 if self.fileobj and not self._cloned:

431 self.write(self.fileobj)

432

@property
def pdf_header(self) -> str:
    """
    The PDF header that gets written, e.g. ``'%PDF-1.5'``.

    Reading returns a string; assignment accepts either ``str`` or
    ``bytes``. It is recommended to set the lowest version that supports
    all features which are used within the PDF file.
    """
    return self._header.decode()


@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    # The header is kept as bytes internally.
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

451

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` with this writer and return its indirect reference.

    An object that already carries a reference belonging to this writer is
    not added again; its existing reference is returned.
    """
    existing_ref = getattr(obj, "indirect_reference", None)
    if existing_ref is not None and existing_ref.pdf == self:  # type: ignore[union-attr]
        return existing_ref  # type: ignore[return-value]

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        contents = obj.get(PG.CONTENTS, None)
        if isinstance(contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference

466

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an object of this writer by number or by indirect reference.

    Args:
        indirect_reference: A 1-based object number, or a reference whose
            ``pdf`` is this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: The reference belongs to another document.
        PdfReadError: The slot is empty (holds None).

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    found = self._objects[idnum - 1]
    if found is None:
        raise PdfReadError(f"Object {indirect_reference!r} not found!")
    return found

480

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot addressed by ``indirect_reference``.

    The generation number of the object previously stored in that slot is
    kept. An object that carries a reference to another document is cloned
    into this writer before being stored.

    Returns:
        The object that ended up in the slot (the clone, if one was made).

    Raises:
        ValueError: The reference belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    else:
        idnum = indirect_reference
    slot = idnum - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore[union-attr]

    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf != self:  # type: ignore[union-attr]
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(idnum, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

501

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link it into the page tree.

    Args:
        page: The page to add; it must be a ``PageObject`` whose type entry
            equals ``CO.PAGE``.
        index: Position passed to ``_get_page_in_node`` and used for
            ``flattened_pages``.
        excluded_keys: Keys passed on to ``clone`` as exclusions; the parent
            key and "/StructParents" are always appended.

    Returns:
        The cloned page that now belongs to this writer.

    Raises:
        ValueError: ``page`` is not a valid page object.
        PyPdfError: More than 1000 ancestors were visited while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    page_org = page
    # Work on a private list so the caller's iterable is left untouched.
    excluded_keys = list(excluded_keys)
    excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore[union-attr]
            page_org.indirect_reference.idnum  # type: ignore[union-attr]
        ]
    except Exception:
        # Best effort: nothing to forget if the page was never translated
        # or carries no indirect reference.
        pass

    page = cast(
        "PageObject", page_org.clone(self, False, excluded_keys).get_object()
    )
    if page_org.pdf is not None:
        # Update this writer's header from _get_max_pdf_version_header,
        # given the current header and the source document's header.
        other = page_org.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

    node, idx = self._get_page_in_node(index)
    page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

    # A negative idx means "append to this node"; otherwise insert at idx.
    if idx >= 0:
        cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
        self.flattened_pages.append(page)
    # Walk from the receiving node up through its parents and increment
    # each page count; the step counter guards against endless parent chains.
    current: Optional[PdfObject] = node
    recurse = 0
    while not is_null_or_none(current):
        assert current is not None  # for mypy; guarded by is_null_or_none
        node_dict = cast(DictionaryObject, current.get_object())
        node_dict[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node_dict[PagesAttributes.COUNT]) + 1)
        current = node_dict.get(PagesAttributes.PARENT, None)
        recurse += 1
        if recurse > 1000:
            raise PyPdfError("Too many recursive calls!")

    if page_org.pdf is not None:
        # the page may contain links to other pages, and those other
        # pages may or may not already be added. we store the
        # information we need, so that we can resolve the references
        # later.
        self._unresolved_links.extend(extract_links(page, page_org))
        self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference

    return page

561

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Write the "NeedAppearances" entry of the AcroForm dictionary.

    The AcroForm dictionary is created in the root object when it is not
    present yet. Any exception raised on the way is logged as a warning
    instead of being propagated.

    Args:
        state: The value to store for the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            "set_need_appearances_writer(%(state)s) catch : %(exc)s",
            source=__name__,
            state=state,
            exc=exc,
        )

597

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ``ViewerPreferences`` object and attach it to the root.

    Returns:
        The newly created object; the root object stores its reference.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences

604

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.
    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Handed through unchanged to ``_add_page``.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_index = len(self.flattened_pages)
    return self._add_page(page, end_index, excluded_keys)

629

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative values
            count from the end; positions at or beyond the end append.
        excluded_keys: Handed through unchanged.

    Returns:
        The added PageObject.

    Raises:
        ValueError: A negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    position = index + page_count if index < 0 else index
    if position < 0:
        raise ValueError("Invalid index value")
    if position >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, position, excluded_keys)

657

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page behind ``indirect_reference``.

    Args:
        indirect_reference: An object number, an indirect reference, or a
            null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a page.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    # A bare object number is turned into a reference of this writer.
    if isinstance(indirect_reference, int):
        reference = IndirectObject(indirect_reference, 0, self)
    else:
        reference = indirect_reference
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

681

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))

706

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    A dimension that is ``None`` or not positive is taken from the media box
    of the page at ``index`` (or of the last page when ``index`` equals the
    page count). When the document has no page to borrow from, such a
    dimension is passed on as ``None`` so that page creation decides.

    Args:
        width: The width of the new page in default user space units.
        height: The height of the new page in default user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.
        IndexError: Index is outside of [-self.get_num_pages(), self.get_num_pages()]
    """
    num_pages = self.get_num_pages()
    if abs(index) > num_pages:
        raise IndexError(f"Index should be in range [-{num_pages}, {num_pages}]")

    width_missing = width is None or width <= 0
    height_missing = height is None or height <= 0
    if width_missing or height_missing:
        if num_pages > 0:
            # Use the chosen index, but do not exceed the available pages
            fixed_index = min(index, num_pages - 1)
            mediabox = self.pages[fixed_index].mediabox
            if width_missing:
                width = mediabox.width
            if height_missing:
                height = mediabox.height
        else:
            # Empty document: there is no page to index. Previously
            # min(index, num_pages - 1) became -1 and the lookup failed even
            # when both dimensions were supplied.
            # NOTE(review): relies on create_blank_page raising
            # PageSizeNotDefinedError for None dimensions, as the docstring
            # above states - confirm against PageObject.create_blank_page.
            if width_missing:
                width = None
            if height_missing:
                height = None

    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page

746

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the value of the parent class property of the same name."""
    return super().open_destination


@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    ``None`` removes the entry (a missing entry is not an error). A string,
    a ``Destination`` or a ``PageObject`` is stored in its corresponding
    form; any other type leaves the root object untouched.
    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        open_action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        open_action = dest.dest_array
    elif isinstance(dest, PageObject):
        # A page without an indirect reference is represented by a null.
        page_ref = dest.indirect_reference
        if page_ref is None:
            page_ref = NullObject()
        open_action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        return
    self._root_object[NameObject("/OpenAction")] = open_action

772

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is registered under a freshly generated name in the
    "/JavaScript" entry of the root's "/Names" dictionary; missing
    dictionaries are created on the way.

    Args:
        javascript: Your JavaScript.

    Example:
        This will launch the print window when the PDF is opened.

        >>> from pypdf import PdfWriter
        >>> output = PdfWriter()
        >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    script_tree = cast(DictionaryObject, names["/JavaScript"])
    script_entries = cast(ArrayObject, script_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_entries.append(create_string_object(str(uuid.uuid4())))

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_entries.append(self._add_object(action))

811

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    The work is delegated to ``EmbeddedFile._create_new``.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    embedded = EmbeddedFile._create_new(self, filename, data)
    return embedded

829

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Add every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been added. Its
            single argument is the page as returned by ``add_page``, i.e.
            the page that now belongs to the writer. Non-callable values
            are ignored.

    """
    # Copy pages from reader to writer
    for source_page in reader.pages:
        appended_page = self.add_page(source_page)
        # Trigger callback, pass writer page as parameter
        if callable(after_page_append):
            after_page_append(appended_page)

861

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add ``new_content_data`` to the content of ``page``.

    Three situations are handled: no "/Contents" entry (a new stream is
    created), an array of streams (a new stream is appended to it), and a
    single stream (replaced by a new stream holding the old data, a newline
    and the new data).

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    if NameObject("/Contents") not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        fresh_stream = StreamObject()
        fresh_stream.set_data(new_content_data)
        page[NameObject("/Contents")] = self._add_object(fresh_stream)
        return

    # Resolve the existing page content before inspecting its type.
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current_content = page[NameObject("/Contents")].get_object()

    if isinstance(current_content, ArrayObject):
        # Create a new StreamObject for the new_content_data
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        current_content.append(self._add_object(extra_stream))
        page[NameObject("/Contents")] = self._add_object(current_content)
    elif isinstance(current_content, StreamObject):
        # Merge new content to existing StreamObject
        combined_stream = StreamObject()
        combined_stream.set_data(current_content.get_data() + b"\n" + new_content_data)
        page[NameObject("/Contents")] = self._add_object(combined_stream)

900

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
) -> None:
    """
    Draw an appearance stream on the page as an XObject.

    The stream is registered with this writer, entered into the page's
    "/XObject" resources under a sanitized ``/Fm_<object_name>`` name
    (a warning is logged when that name is already taken) and invoked from
    the page content at the given offset.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])
    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    stream_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        logger_warning(
            "XObject %(xobject_name)r already added to page resources. This might be an issue.",
            source=__name__,
            xobject_name=xobject_name,
        )
    else:
        page_xobjects[xobject_name] = stream_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

939

FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `list[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document root has no /AcroForm entry or the
            /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate was already applied above, so pass None.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", source=__name__)
        return

    for annotation in page[PG.ANNOTS]:  # type: ignore[attr-defined]
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Start every matched field without an appearance stream. Otherwise
            # a field type that produces none (e.g. /Sig) would be flattened
            # with the stream left over from a previously processed widget.
            appearance_stream_obj: Optional[StreamObject] = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        self, page, flatten, acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        self, page, flatten, acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in (ap := cast(DictionaryObject, annotation[AA.AP])):
                    ap[NameObject("/N")] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Reuse the object number of the existing normal appearance
                    # so that references to it now resolve to the new stream.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore[index]
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", source=__name__)

            if appearance_stream_obj and flatten:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])

1083

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Look for orphan form fields among the page annotations and attach
    them to the /AcroForm /Fields array.

    The /AcroForm dictionary and its /Fields array are created on the
    document root when missing.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        known_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        known_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = known_fields

    if "/Annots" not in page:
        return reattached
    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        is_field_widget = widget.get("/Subtype", "") == "/Widget" and "/FT" in widget
        if not is_field_widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in known_fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # A direct annotation is registered with the writer so that the
            # reference appended below is available.
            annotations[position] = self._add_object(widget)
        known_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1133

1134 def _collect_incremental_clone_object_ids(self, reader: PdfReader) -> list[int]:

1135 object_ids: set[int] = set()

1136 for xref_entry in reader.xref.values():

1137 object_ids.update(filter(None, xref_entry))

1138 object_ids.update(filter(None, reader.xref_objStm))

1139

1140 object_count = len(object_ids)

1141 if object_count > self._incremental_clone_object_count_limit:

1142 raise LimitReachedError(

1143 f"Incremental clone object count {object_count} exceeds "

1144 f"maximum allowed count {self._incremental_clone_object_count_limit}."

1145 )

1146

1147 max_object_id = max(object_ids, default=0)

1148 if max_object_id > self._incremental_clone_object_id_limit:

1149 raise LimitReachedError(

1150 f"Incremental clone object ID {max_object_id} exceeds "

1151 f"maximum allowed ID {self._incremental_clone_object_id_limit}."

1152 )

1153

1154 return sorted(object_ids)

1155

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: In strict mode when more objects exist than the
            trailer /Size declares, or when flattening the page tree
            fails with an index error.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        source_ids = self._collect_incremental_clone_object_ids(reader)
        # IDs are sorted, so the last one gives the required slot count.
        slot_count = source_ids[-1] if source_ids else 0
        self._objects = [None] * slot_count
        for source_id in source_ids:
            source_object = reader.get_object(source_id)
            if source_object is None:
                continue
            self._objects[source_id - 1] = source_object.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    trailer_size = cast(int, reader.trailer["/Size"])
    if len(self._objects) > trailer_size:
        if self.strict:
            raise PdfReadError(
                f"Object count {len(self._objects)} exceeds defined trailer size {trailer_size}"
            )
        logger_warning(
            "Object count %(object_count)d exceeds defined trailer size %(trailer_size)d",
            source=__name__,
            object_count=len(self._objects),
            trailer_size=trailer_size,
        )

    # must be done here before rewriting
    if self.incremental:
        baseline_hashes = []
        for stored in self._objects:
            baseline_hashes.append(stored.hash_bin() if stored is not None else 0)
        self._original_hash = baseline_hashes

    try:
        self._flatten()
    except IndexError:
        raise PdfReadError("Got index error while flattening.")

    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_ref = cast(IndirectObject, flat_page.indirect_reference)
        self._replace_object(page_ref.idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Every page now hangs directly below the root /Pages node.
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )

1212

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF file reader: its '/Root', '/Info' and
    '/ID' sections are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function that is invoked for each page listed in the
            /Kids array of the cloned page tree. The single parameter of
            the callback is a reference to that page.

    """
    self.clone_reader_document_root(reader)
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the cloned /Info hash as the baseline so that it does
            # not count as modified.
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            self._info_obj = self._add_object(info_copy)
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        # Nothing cloneable as an ID: keep the current one.
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())

1260

def _compute_document_identifier(self) -> ByteStringObject:
    """
    Derive an identifier from the serialized PDF structure.

    The structure is written to an in-memory buffer and its rolling
    checksum, UTF-8 encoded, is returned as a byte string object.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1266

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same value.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the first
    identifier matches, a different version of the correct file has been found.
    see §14.4 "File Identifiers".
    """
    if not self._ID:
        # No identifier yet: both entries get the same value.
        first_id = self._compute_document_identifier()
        second_id = first_id
    else:
        # Keep the existing first entry and refresh only the second one.
        first_id = self._ID[0]
        second_id = self._compute_document_identifier()
    self._ID = ArrayObject((first_id, second_id))

1286

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        NotImplementedError: If the writer is in incremental mode.
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    if self.incremental:
        raise NotImplementedError("Encrypting incremental PDF files is currently not supported.")

    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256-R5" style names map to attribute names with underscores.
        attribute_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, attribute_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, owner_password, strict=self.strict)
    # `encrypt` may be called again: then the previous entry's slot is reused.
    if not self._encrypt_entry:
        self._add_object(new_entry)
    else:
        assert self._encrypt_entry.indirect_reference is not None
        new_entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    self._encrypt_entry = new_entry

1348

1349 def _resolve_links(self) -> None:

1350 """Patch up links that were added to the document earlier, to

1351 make sure they still point to the same pages.

1352 """

1353 for (new_link, old_link) in self._unresolved_links:

1354 old_page = old_link.find_referenced_page()

1355 if not old_page:

1356 continue

1357 new_page = self._merged_in_pages.get(old_page)

1358 if new_page is None:

1359 continue

1360 new_link.patch_reference(self, new_page)

1361

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when there are new or modified objects.
    Otherwise the full structure, cross-reference table and trailer are
    written.

    Args:
        stream: Writable stream receiving the PDF bytes. A warning is logged
            when it exposes a ``mode`` attribute without ``"b"``.

    """
    is_text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if is_text_mode:
        logger_warning(
            "File <%(stream_name)s> to write to is not in binary mode. "
            "It may not be written to correctly.",
            source=__name__,
            stream_name=stream.name,
        )
    self._resolve_links()

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(original.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1383

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether this method opened the file itself
        (in which case it has been closed again), and the stream used.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must not leak its handle when writing fails.
        if my_file:
            stream.close()

    if not my_file:
        # Caller-owned streams stay open; only pending data is flushed.
        stream.flush()

    return my_file, stream

1415

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    Provides the list of new or modified objects that will be written
    in the increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline = self._original_hash
    baseline_count = len(baseline)
    changed = []
    for slot, stored in enumerate(self._objects):
        if stored is None:
            continue
        # Slots beyond the recorded hashes are new; the others are
        # modified when their hash differs from the recorded one.
        if slot >= baseline_count or stored.hash_bin() != baseline[slot]:
            changed.append(cast(IndirectObject, stored).indirect_reference)
    return changed

1439

def _write_increment(self, stream: StreamType) -> None:
    """
    Append the new or modified objects to the stream, followed by a
    cross-reference stream, ``startxref`` and the end-of-file marker.

    Args:
        stream: Stream to write to; it must support ``tell``.

    """
    offsets_by_id = {}
    id_runs = []  # [first id, count] for each run of consecutive object ids
    run_start = -1
    run_stop = -2
    baseline_count = len(self._original_hash)
    for slot, stored in enumerate(self._objects):
        if stored is None:
            continue
        if slot < baseline_count and stored.hash_bin() == self._original_hash[slot]:
            continue  # unchanged since the baseline was recorded
        idnum = slot + 1
        assert isinstance(stored, PdfObject), "mypy"
        # first write new/modified object
        offsets_by_id[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # NOTE: objects are written unencrypted here; the original carries a
        # disabled snippet marked "encryption is not operational".
        stored.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref
        if idnum != run_stop:
            if run_start > 0:
                id_runs.append([run_start, run_stop - run_start])
            run_start = idnum
        run_stop = idnum + 1
    assert run_start > 0, "for pytest only"
    id_runs.append([run_start, run_stop - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    index_entries = []
    for run in id_runs:
        for number in run:
            index_entries.append(NumberObject(number))
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(index_entries),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is referenced only when it is new or differs from the baseline.
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore[union-attr]
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore[union-attr]
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One entry per written object, laid out as declared in /W: a 1-byte
    # first field (value 1), the 4-byte offset and a 1-byte last field (0).
    xref_rows = b"".join(
        struct.pack(b">BIB", 1, offset, 0) for offset in offsets_by_id.values()
    )
    xr.set_data(xref_rows)
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1510

1511 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:

1512 object_positions = []

1513 free_objects = []

1514 stream.write(self.pdf_header.encode() + b"\n")

1515 stream.write(b"%\xE2\xE3\xCF\xD3\n")

1516

1517 for idnum, obj in enumerate(self._objects, start=1):

1518 if obj is not None:

1519 object_positions.append(stream.tell())

1520 stream.write(f"{idnum} 0 obj\n".encode())

1521 if self._encryption and obj != self._encrypt_entry:

1522 obj = self._encryption.encrypt_object(obj, idnum, 0)

1523 obj.write_to_stream(stream)

1524 stream.write(b"\nendobj\n")

1525 else:

1526 object_positions.append(-1)

1527 free_objects.append(idnum)

1528 free_objects.append(0) # add 0 to loop in accordance with specification

1529 return object_positions, free_objects

1530

1531 def _write_xref_table(

1532 self, stream: StreamType, object_positions: list[int], free_objects: list[int]

1533 ) -> int:

1534 xref_location = stream.tell()

1535 stream.write(b"xref\n")

1536 stream.write(f"0 {len(self._objects) + 1}\n".encode())

1537 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())

1538 free_idx = 1

1539 for offset in object_positions:

1540 if offset > 0:

1541 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())

1542 else:

1543 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())

1544 free_idx += 1

1545 return xref_location

1546

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Stream to write to.
        xref_location: Offset of the cross-reference table, written after
            ``startxref``.

    """
    stream.write(b"trailer\n")
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    # Optional entries are added only when the writer has them.
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    footer = f"\nstartxref\n{xref_location}\n%%EOF\n"
    stream.write(footer.encode())  # eof

1570

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Setting replaces the current entries: clear them before adding.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1598

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is converted with ``str`` and stored as a string object;
    an information dictionary is created when the writer has none yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    converted = {}
    for name, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        converted[NameObject(name)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1618

_UNSET = object()

def compress_identical_objects(
    self,
    remove_identicals: Any = _UNSET,
    remove_orphans: Any = _UNSET,
    *,
    remove_duplicates: bool = True,
    remove_unreferenced: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Deprecated.
        remove_orphans: Deprecated.
        remove_duplicates: Remove duplicate objects.
        remove_unreferenced: Remove unreferenced objects.

    """
    # Deprecated positional arguments override their keyword replacements.
    if remove_identicals != self._UNSET:
        deprecate_with_replacement("remove_identicals", "remove_duplicates", "7.0.0")
        assert isinstance(remove_identicals, bool)
        remove_duplicates = remove_identicals
    if remove_orphans != self._UNSET:
        deprecate_with_replacement("remove_orphans", "remove_unreferenced", "7.0.0")
        assert isinstance(remove_orphans, bool)
        remove_unreferenced = remove_orphans

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walks dictionaries and arrays recursively, marks every reference
        # found as used and redirects references to merged objects.
        if isinstance(obj, DictionaryObject):
            entries = obj.items()
        elif isinstance(obj, ArrayObject):
            entries = enumerate(obj)  # type: ignore[assignment]
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for slot, child in entries:
            if not isinstance(child, IndirectObject):
                # Other types are filtered out by the recursive call itself.
                replace_in_obj(child, crossref)
                continue
            unreferenced[child.idnum - 1] = False
            if child in crossref:
                obj[slot] = crossref[child]

    # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
    self._idnum_hash = {}
    unreferenced = [True] * len(self._objects)
    # look for similar objects
    for position, candidate in enumerate(self._objects):
        if is_null_or_none(candidate):
            continue
        assert candidate is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(candidate.indirect_reference, IndirectObject)
        digest = candidate.hash_value()
        if remove_duplicates and digest in self._idnum_hash:
            # Duplicate of an earlier object: record it and drop its slot.
            self._idnum_hash[digest][1].append(candidate.indirect_reference)
            self._objects[position] = None
        else:
            self._idnum_hash[digest] = (candidate.indirect_reference, [])

    # map every dropped duplicate to the first object with the same hash
    duplicate_to_first: dict[IndirectObject, IndirectObject] = {}
    for first_ref, duplicate_refs in self._idnum_hash.values():
        for duplicate_ref in duplicate_refs:
            duplicate_to_first[duplicate_ref] = first_ref

    # replace reference to merged objects
    for stored in self._objects:
        if isinstance(stored, (DictionaryObject, ArrayObject)):
            replace_in_obj(stored, duplicate_to_first)

    if not remove_unreferenced:
        return

    # Root, /Info and /ID are kept even when nothing references them.
    unreferenced[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

    if not is_null_or_none(self._info):
        unreferenced[self._info.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]

    try:
        unreferenced[self._ID.indirect_reference.idnum - 1] = False  # type: ignore[union-attr]
    except AttributeError:
        pass

    for orphan_index in compress(range(len(self._objects)), unreferenced):
        self._objects[orphan_index] = None

1710

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to an object stored in this writer.

    The object number is the 1-based position of ``obj`` in the writer's
    object list; ``list.index`` raises ``ValueError`` when it is absent.
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1716

def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document as a ``TreeObject``.

    A missing outline root is created and registered in the catalog; an
    existing one that is not a ``TreeObject`` is replaced by one.
    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        self._root_object[NameObject(CO.OUTLINES)] = self._add_object(outline)
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        upgraded = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, upgraded)
        outline = upgraded
    # Consistency check: the outline must be stored in this writer.
    stored_idnum = self._objects.index(outline) + 1
    stored_ref = IndirectObject(stored_idnum, 0, self)
    assert stored_ref.get_object() == outline
    return outline

1735

def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    An empty array is stored in the catalog when no threads entry exists.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        created = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = created
        return created
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1754

@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list

1766

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert a destination into the document outline.

    Args:
        page_destination: The outline entry to insert. A page is wrapped
            into a fit destination titled ``page #<page_number>``.
        parent: Outline item below which the entry is inserted; the
            outline root is used when None.
        before: Sibling in front of which the entry is inserted.
        is_open: Whether the entry is marked as open. When False, the
            parent counter callback passed to ``insert_child`` is a no-op.

    Returns:
        The indirect reference of the inserted outline entry.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open as well: previously they were
        # dropped here, so a page always ended up as an open entry directly
        # below the outline root regardless of what the caller requested.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1804

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then handed to :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping holding the entries of the outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The value returned by :meth:`add_outline_item_destination`.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # Disabled code ("currently unreachable"), preserved for reference:
    #     if "/A" in outline_item:
    #         action = DictionaryObject()
    #         a_dict = cast(DictionaryObject, outline_item["/A"])
    #         for k, v in list(a_dict.items()):
    #             action[NameObject(str(k))] = v
    #         action_ref = self._add_object(action)
    #         outline_item_object[NameObject("/A")] = action_ref
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1827

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject`` or an indirect reference. With ``None`` no
            ``/GoTo`` action is created for the item.
        parent: A reference to a parent outline item to create nested
            outline items. Defaults to the outline root.
        before: Forwarded to :meth:`add_outline_item_destination`
            (presumably the sibling before which the item is inserted).
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):
        # A Fit in the ``italic`` slot means the call uses the old
        # positional parameter order: shift the arguments and retry.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                "can not find reference of page %(page_number)s",
                source=__name__,
                page_number=page_number,
            )
            page_ref = NullObject()

        destination = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )
        goto_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): destination.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(goto_action)

    item_ref = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    target_parent = self.get_outline_root() if parent is None else parent
    return self.add_outline_item_destination(item_ref, target_parent, before, is_open)

1906

def add_outline(self) -> None:
    """
    Placeholder that is not implemented.

    Raises:
        NotImplementedError: Always.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1911

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register ``destination`` under ``title`` in the named destinations array.

    The array returned by ``get_named_dest_root`` is walked two entries at
    a time (name, destination). The new pair is inserted in front of the
    first name that compares greater than ``title``, or appended at the
    end when there is none.

    Args:
        title: Name of the destination.
        destination: The destination, or a reference to it.

    """
    names = self.get_named_dest_root()
    for position in range(0, len(names), 2):
        if title < names[position]:
            # Insert the value first so that the name lands in front of it.
            names.insert(position, destination)
            names.insert(position, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1925

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title`` entry.

    Returns:
        The reference obtained when registering ``dest_array``.

    """
    destination_ref = self._add_object(page_destination.dest_array)  # type: ignore[attr-defined]
    destination_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore[index]
    self.add_named_destination_array(destination_title, destination_ref)

    return destination_ref

1936

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a ``/GoTo`` action targeting a page and register it as a named destination.

    The target array is built from the page reference,
    ``TypFitArguments.FIT_H`` and the number 826.

    Args:
        title: Name of the destination; converted to a ``TextStringObject``
            when it is not one already.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The reference of the registered action dictionary.

    """
    kids = self.get_object(self._pages)[PagesAttributes.KIDS]  # type: ignore[index]
    target_page = kids[page_number]

    goto_action = DictionaryObject()
    goto_action.update(
        {
            NameObject(GoToActionArguments.D): ArrayObject(
                [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
            ),
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(goto_action)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, action_ref)
    return action_ref

1959

def remove_links(self) -> None:
    """Remove links and annotations from this output."""
    # ALL_ANNOTATIONS is used on purpose: every annotation goes, not only links.
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, everything)

1964

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    strip_page = self._remove_annots_from_page
    for current_page in self.pages:
        strip_page(current_page, subtypes)

1980

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Remove the annotations of a page whose subtype is listed in ``subtypes``.

    Annotations referenced indirectly are additionally replaced with a
    ``NullObject`` in ``self._objects``.

    Args:
        page: The page (or a reference to it) to clean up.
        subtypes: Subtypes to remove; ``None`` removes every annotation.
            An annotation whose ``/Subtype`` cannot be read is only
            removed when ``subtypes`` is ``None``.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    i = 0
    while i < len(cast(ArrayObject, page[PG.ANNOTS])):
        an = cast(ArrayObject, page[PG.ANNOTS])[i]
        obj = cast(DictionaryObject, an.get_object())
        if subtypes is None:
            matches = True
        else:
            try:
                subtype = obj["/Subtype"]
            except (KeyError, TypeError):
                # Missing /Subtype or an annotation that is not a
                # dictionary: it cannot match an explicit subtype, so keep
                # it instead of aborting the whole clean-up.
                matches = False
            else:
                matches = subtype in subtypes
        if matches:
            if isinstance(an, IndirectObject):
                self._objects[an.idnum - 1] = NullObject()  # to reduce PDF size
            # The next entry moves to index i, so do not advance.
            del page[PG.ANNOTS][i]  # type:ignore
        else:
            i += 1

1998

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Keep the text filters when processing each flag separately;
            # otherwise a filtered text removal would delete all text.
            self.remove_objects_from_page(page, to_d, text_filters=text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-based deletions: the first matching flag is handled and
    # the method returns right away.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped for the requested flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = [
            b"w", b"J", b"j", b"M", b"d", b"i",
            b"W", b"W*",
            b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
            b"m", b"l", b"c", b"v", b"y", b"h", b"re",
            b"sh"
        ]
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = self._remove_objects_from_page__clean_forms(
            elt=page, stack=[], jump_operators=jump_operators, to_delete=to_delete, text_filters=text_filters,
        )

        self._remove_objects_from_page__clean(
            content=content, images=images, forms=forms,
            jump_operators=jump_operators, to_delete=to_delete,
            text_filters=text_filters
        )
        page.replace_contents(content)
    # Return value left exactly as in the existing implementation.
    return [], []  # type: ignore[return-value]

2063

def _remove_objects_from_page__clean(
    self,
    content: ContentStream,
    images: list[str],
    forms: list[str],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> None:
    """
    Delete the selected operations from ``content.operations`` in place.

    An operation is a removal candidate when it is an inline image (and
    inline images are to be deleted), when its operator is part of
    ``jump_operators``, or when it is a ``Do`` on one of ``images`` (and
    XObject images are to be deleted). While deleting text with
    ``text_filters`` set, a candidate is only removed if the font selected
    by the most recent ``Tf`` operator is listed under ``font_ids``.

    Args:
        content: Content stream to modify.
        images: Names of the image XObjects to drop.
        forms: Names of form XObjects (not used by this method).
        jump_operators: Operators to drop.
        to_delete: Flags describing what has to be removed.
        text_filters: Optional filters; only the ``font_ids`` key is read.

    """
    deleting_text = to_delete & ObjectDeletionFlag.TEXT
    wanted_font_ids = []
    if text_filters and deleting_text:
        wanted_font_ids = text_filters.get("font_ids", [])

    current_font = None
    index = 0
    while index < len(content.operations):
        operands, operator = content.operations[index]
        if operator == b"Tf":
            current_font = operands[0]

        is_candidate = (
            (operator == b"INLINE IMAGE" and (to_delete & ObjectDeletionFlag.INLINE_IMAGES))
            or (operator in jump_operators)
            or (
                operator == b"Do"
                and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                and (operands[0] in images)
            )
        )
        if is_candidate and (
            not deleting_text
            or not text_filters
            or current_font in wanted_font_ids
        ):
            # Do not advance: the following operation moves to this index.
            del content.operations[index]
        else:
            index += 1
    content.get_data()  # this ensures ._data is rebuilt from the .operations

2106

def _remove_objects_from_page__clean_forms(
    self,
    elt: DictionaryObject,
    stack: list[DictionaryObject],
    jump_operators: list[bytes],
    to_delete: ObjectDeletionFlag,
    text_filters: Optional[dict[str, Any]] = None,
) -> tuple[list[str], list[str]]:
    """
    Clean the XObjects listed in the resources of ``elt``, recursing into forms.

    Image XObjects are replaced with a ``NullObject`` and removed from the
    ``/XObject`` dictionary when XObject images are to be deleted. Form
    XObjects are turned into ``ContentStream`` objects and cleaned
    recursively. When ``elt`` itself is a stream, its content is cleaned too.

    Args:
        elt: Page or form whose ``/Resources`` are inspected.
        stack: Elements already being processed (guards against loops).
        jump_operators: Operators to drop from the content streams.
        to_delete: Flags describing what has to be removed.
        text_filters: Optional text filters handed to the content clean-up.

    Returns:
        The names of the image XObjects and of the form XObjects found in ``elt``.

    """
    # elt in a recursive call is a new ContentStream object, so the
    # indirect_reference has to be compared as well
    already_seen = elt in stack
    if not already_seen and hasattr(elt, "indirect_reference"):
        already_seen = any(
            elt.indirect_reference == getattr(seen, "indirect_reference", -1)
            for seen in stack
        )
    if already_seen:
        # to prevent infinite looping
        return [], []  # pragma: no cover

    try:
        xobjects = cast(
            dict[Any, Any],
            cast(DictionaryObject, elt["/Resources"])["/XObject"],
        )
    except KeyError:
        xobjects = {}

    images = []
    forms = []
    for name, value in xobjects.items():
        target = value.get_object()
        try:
            replacement: Any = None
            if (
                to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                and target["/Subtype"] == "/Image"
            ):
                replacement = NullObject()  # to delete the image keeping the entry
                images.append(name)
            if target["/Subtype"] == "/Form":
                forms.append(name)
                if isinstance(target, ContentStream):
                    replacement = target
                else:
                    replacement = ContentStream(target, self)
                    replacement.update(
                        {
                            key: entry
                            for key, entry in target.items()
                            if key not in ["/Length", "/Filter", "/DecodeParms"]
                        }
                    )
                    try:
                        replacement.indirect_reference = target.indirect_reference
                    except AttributeError:  # pragma: no cover
                        pass
                stack.append(elt)

                # clean subforms
                self._remove_objects_from_page__clean_forms(
                    elt=replacement, stack=stack, jump_operators=jump_operators, to_delete=to_delete,
                    text_filters=text_filters,
                )
            if replacement is not None:
                if isinstance(value, IndirectObject):
                    self._objects[value.idnum - 1] = replacement
                else:
                    # should only occur in a PDF not respecting PDF spec
                    # where streams must be indirected.
                    xobjects[name] = self._add_object(replacement)  # pragma: no cover
        except (TypeError, KeyError):
            pass

    for image_name in images:
        del xobjects[image_name]  # for clean-up

    if isinstance(elt, StreamObject):  # for /Form
        if not isinstance(elt, ContentStream):  # pragma: no cover
            converted = ContentStream(elt, self)
            converted.update(elt.items())
            elt = converted
        # clean the content
        self._remove_objects_from_page__clean(
            content=elt, images=images, forms=forms, jump_operators=jump_operators,
            to_delete=to_delete, text_filters=text_filters
        )
    return images, forms

2189

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A boolean is treated
            as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType members into the ObjectDeletionFlag members
    # carrying the same name.
    deletion_flags = ObjectDeletionFlag.NONE
    for member_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member_name]:
            deletion_flags |= ObjectDeletionFlag[member_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)

2213

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    wanted_fonts = font_names if font_names else []

    def collect_fonts(
        obj: Any,
        found: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        # Recursively walk dictionaries and arrays to gather, per
        # normalized font name, the keys under which fonts are stored.
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                if normalized not in found:
                    found[normalized] = {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    }
                known_ids = found[normalized]["resource_ids"]
                if key not in known_ids:
                    known_ids.append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    for page in self.pages:
        text_filters: dict[str, Any] = {}
        if wanted_fonts:
            # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
            # Font names need to be converted to resource names/IDs for easier removal
            fonts_on_page = collect_fonts(page.get("/Resources"))
            resource_ids_to_remove = []
            for wanted in wanted_fonts:
                if wanted in fonts_on_page:
                    resource_ids_to_remove.extend(fonts_on_page[wanted]["resource_ids"])
            text_filters["font_ids"] = resource_ids_to_remove
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2270

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted, the
            border array ``[2, 2, 2]`` is written.

    Raises:
        ValueError: ``rect`` is a string that does not hold four numbers.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore[index]
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth entry: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse the documented "[ xLL yLL xUR yUR ]" form; a single number
        # object cannot represent a rectangle.
        coordinates = [float(part) for part in rect.strip().strip("[]").split()]
        if len(coordinates) != 4:
            raise ValueError(
                f"rect must hold four numbers '[ xLL yLL xUR yUR ]', got {rect!r}"
            )
        rect = RectangleObject(tuple(coordinates))
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2337

# Layout names accepted without a warning when setting the page layout.
_valid_layouts = (
    "/NoLayout", "/SinglePage", "/OneColumn",
    "/TwoColumnLeft", "/TwoColumnRight",
    "/TwoPageLeft", "/TwoPageRight",
)

2347

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the root object, or ``None`` when absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast("LayoutType", layout)

2353

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not a ``NameObject`` is converted into one; a warning
    is logged beforehand when it is not one of the valid layouts. The
    value is stored in any case.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            logger_warning(
                "Layout should be one of: %(layouts)s",
                source=__name__,
                # Comma-separated list, same format as the page mode warning.
                layouts=", ".join(self._valid_layouts),
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2389

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (delegates to ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)

2417

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading returns the stored layout or ``None`` when there is none;
    assigning stores the given layout.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Same code path as set_page_layout().
    self._set_page_layout(layout)

2446

# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone", "/UseOutlines", "/UseThumbs",
    "/FullScreen", "/UseOC", "/UseAttachments",
)

2455

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the root object, or ``None`` when absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast("PagemodeType", mode)

2461

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading returns the stored mode or ``None`` when there is none.
    Assigning a value that is not a ``NameObject`` converts it into one
    and logs a warning when it is not one of the valid modes.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            # Only a warning: the value is stored nevertheless.
            logger_warning(
                "Mode should be one of: %(modes)s",
                source=__name__,
                modes=", ".join(self._valid_modes),
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2498

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        dest_spec = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        link_destination = Destination(
            NameObject("/LinkName"),
            dest_spec["target_page_index"],
            Fit(
                fit_type=dest_spec["fit"], fit_args=dict(dest_spec)["fit_args"]
            ),  # I have no clue why this dict-hack is necessary
        )
        to_add[NameObject("/Dest")] = link_destination.dest_array

    page.annotations.append(self._add_object(to_add))

    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        # Link the parent annotation back to its popup.
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2552

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    Both the ``/Dest`` entry of an annotation and the ``/D`` entry of its
    ``/A`` action are handled.

    Args:
        page: The page, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annotation_ref in page.get("/Annots", []):
        annotation = annotation_ref.get_object()
        destination = annotation.get("/Dest", None)
        action = annotation.get("/A", None)
        if isinstance(destination, NameObject):
            annotation[NameObject("/Dest")] = TextStringObject(destination)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_destination = action.get("/D", None)
        if isinstance(action_destination, NameObject):
            action[NameObject("/D")] = TextStringObject(action_destination)
    return page

2579

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``) that gets opened and read,
            a ``PdfReader`` whose stream is copied, or any object
            providing ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and the encryption object of the reader
        (``None`` unless a ``PdfReader`` with a truthy ``_encryption`` was given).

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as source:
            return BytesIO(source.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption if fileobj._encryption else None
        reader_stream = fileobj.stream
        previous_position = reader_stream.tell()
        reader_stream.seek(0)
        copied = BytesIO(reader_stream.read())
        # reset the stream to its original location
        reader_stream.seek(previous_position)
        return copied, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2615

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    When ``outline_item`` holds a page selection (tuple, list or
    ``PageRange``) instead of a title, the arguments are shifted: it is
    used as ``pages`` and no outline item is created.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    outline_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # Shifted call: every argument after ``fileobj`` moved one slot left.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2683

2684 def merge(

2685 self,

2686 position: Optional[int],

2687 fileobj: Union[Path, StrByteType, PdfReader],

2688 outline_item: Optional[str] = None,

2689 pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,

2690 import_outline: bool = True,

2691 excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),

2692 ) -> None:

2693 """

2694 Merge the pages from the given file into the output file at the

2695 specified page number.

2696

2697 Args:

2698 position: The *page number* to insert this file. File will

2699 be inserted after the given number.

2700 fileobj: A File Object or an object that supports the standard

2701 read and seek methods similar to a File Object. Could also be a

2702 string representing a path to a PDF file.

2703 outline_item: Optionally, you may specify a string to build an outline

2704 (aka 'bookmark') to identify the

2705 beginning of the included file.

2706 pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`

2707 or a ``(start, stop[, step])`` tuple

2708 or a list of pages to be processed

2709 to merge only the specified range of pages from the source

2710 document into the output document.

2711 import_outline: You may prevent the source document's

2712 outline (collection of outline items, previously referred to as

2713 'bookmarks') from being imported by specifying this as ``False``.

2714 excluded_fields: provide the list of fields/keys to be ignored

2715 if ``/Annots`` is part of the list, the annotation will be ignored

2716 if ``/B`` is part of the list, the articles will be ignored

2717

2718 Raises:

2719 TypeError: The pages attribute is not configured properly

2720

2721 """

2722 if isinstance(fileobj, PdfDocCommon):

2723 reader = fileobj

2724 else:

2725 stream, _encryption_obj = self._create_stream(fileobj)

2726 # Create a new PdfReader instance using the stream

2727 # (either file or BytesIO or StringIO) created above

2728 reader = PdfReader(stream, strict=False) # type: ignore[arg-type]

2729

2730 if excluded_fields is None:

2731 excluded_fields = ()

2732 # Find the range of pages to merge.

2733 if pages is None:

2734 pages = list(range(len(reader.pages)))

2735 elif isinstance(pages, PageRange):

2736 pages = list(range(*pages.indices(len(reader.pages))))

2737 elif isinstance(pages, list):

2738 pass # keep unchanged

2739 elif isinstance(pages, tuple) and len(pages) <= 3:

2740 pages = list(range(*pages))

2741 elif not isinstance(pages, tuple):

2742 raise TypeError(

2743 '"pages" must be a tuple of (start, stop[, step]) or a list'

2744 )

2745

2746 srcpages = {}

2747 for page in pages:

2748 if isinstance(page, PageObject):

2749 pg = page

2750 else:

2751 pg = reader.pages[page]

2752 assert pg.indirect_reference is not None

2753 if position is None:

2754 # numbers in the exclude list identifies that the exclusion is

2755 # only applicable to 1st level of cloning

2756 srcpages[pg.indirect_reference.idnum] = self.add_page(

2757 pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore[list-item]

2758 )

2759 else:

2760 srcpages[pg.indirect_reference.idnum] = self.insert_page(

2761 pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore[list-item]

2762 )

2763 position += 1

2764 srcpages[pg.indirect_reference.idnum].original_page = pg

2765

2766 reader._named_destinations = (

2767 reader.named_destinations

2768 ) # need for the outline processing below

2769

2770 arr: Any

2771

2772 for dest in reader._named_destinations.values():

2773 self._merge__process_named_dests(dest=dest, reader=reader, srcpages=srcpages)

2774

2775 outline_item_typ: TreeObject

2776 if outline_item is not None:

2777 outline_item_typ = cast(

2778 "TreeObject",

2779 self.add_outline_item(

2780 TextStringObject(outline_item),

2781 next(iter(srcpages.values())).indirect_reference,

2782 fit=PAGE_FIT,

2783 ).get_object(),

2784 )

2785 else:

2786 outline_item_typ = self.get_outline_root()

2787

2788 _ro = reader.root_object

2789 if import_outline and CO.OUTLINES in _ro:

2790 outline = self._get_filtered_outline(

2791 _ro.get(CO.OUTLINES, None), srcpages, reader

2792 )

2793 self._insert_filtered_outline(

2794 outline, outline_item_typ, None

2795 ) # TODO: use before parameter

2796

2797 if "/Annots" not in excluded_fields:

2798 for pag in srcpages.values():

2799 lst = self._insert_filtered_annotations(

2800 pag.original_page.get("/Annots", []), pag, srcpages, reader

2801 )

2802 if len(lst) > 0:

2803 pag[NameObject("/Annots")] = lst

2804 self.clean_page(pag)

2805

2806 if "/AcroForm" in _ro and not is_null_or_none(_ro["/AcroForm"]):

2807 if "/AcroForm" not in self._root_object:

2808 self._root_object[NameObject("/AcroForm")] = self._add_object(

2809 cast(

2810 DictionaryObject,

2811 reader.root_object["/AcroForm"],

2812 ).clone(self, False, ("/Fields",))

2813 )

2814 arr = ArrayObject()

2815 else:

2816 arr = cast(

2817 ArrayObject,

2818 cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],

2819 )

2820 trslat = self._id_translated[id(reader)]

2821 try:

2822 for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore[index]

2823 try:

2824 ind = IndirectObject(trslat[f.idnum], 0, self)

2825 if ind not in arr:

2826 arr.append(ind)

2827 except KeyError:

2828 # for trslat[] which mean the field has not be copied

2829 # through the page

2830 pass

2831 except KeyError: # for /Acroform or /Fields are not existing

2832 arr = self._add_object(ArrayObject())

2833 cast(DictionaryObject, self._root_object["/AcroForm"])[

2834 NameObject("/Fields")

2835 ] = arr

2836

2837 if "/B" not in excluded_fields:

2838 self.add_filtered_articles("", srcpages, reader)

2839

def _merge__process_named_dests(self, dest: Any, reader: PdfDocCommon, srcpages: dict[int, PageObject]) -> None:
    """
    Register one named destination of ``reader`` in this writer.

    Nothing is added when the title is already listed under the writer's
    ``/Names`` -> ``/Dests`` -> ``/Names`` entry, when the destination has
    no page, or when its page is not among the merged pages.

    Args:
        dest: Named destination read from ``reader``.
        reader: Document the destination originates from.
        srcpages: Merged pages, keyed by the object number of the
            corresponding page in ``reader``.

    """
    target: Any = dest.dest_array

    # A destination with the same title already exists: do not duplicate it.
    if "/Names" in self._root_object:
        names_entry = cast(DictionaryObject, self._root_object["/Names"])
        dests_entry = cast(
            DictionaryObject, names_entry.get("/Dests", DictionaryObject())
        )
        known_titles = cast(list[Any], dests_entry.get("/Names", DictionaryObject()))
        if dest["/Title"] in known_titles:
            return

    page_entry = dest["/Page"]
    if page_entry is None or isinstance(page_entry, NullObject):
        return

    if isinstance(page_entry, int):
        # The page is given as a page number rather than a PDF reference;
        # integer page numbers are normally only accepted in external goto.
        try:
            source_page = reader.pages[page_entry]
        except IndexError:
            return
        assert source_page.indirect_reference is not None
        try:
            target[NumberObject(0)] = NumberObject(
                srcpages[source_page.indirect_reference.idnum].page_number
            )
            self.add_named_destination_array(dest["/Title"], target)
        except KeyError:
            pass
        return

    source_id = page_entry.indirect_reference.idnum
    if source_id in srcpages:
        target[NumberObject(0)] = srcpages[source_id].indirect_reference
        self.add_named_destination_array(dest["/Title"], target)

2873

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry, then the
    source articles are walked through their ``/N`` links. For each
    article whose ``/P`` page has a clone in ``pages``, a new article is
    created, linked to the previous one through ``/N`` and ``/V``, and
    appended to the ``/B`` array of the cloned page.

    Args:
        thread: Thread dictionary taken from the reader.
        pages: Merged pages, keyed by the object number of the page in
            the reader.
        reader: Document the thread originates from.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    # Bound up front so that the ring-closing step below never reads an
    # undefined name when no article could be cloned.
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the first article: close the ring of new articles.
            # When none of the articles sits on a merged page there is
            # nothing to link, so the cloned thread is left without /F.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2938

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern (or its source string) searched in the thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every ``/B`` entry of the original page behind each merged page is
    inspected. Its thread (``/T``) is added when the thread is not yet in
    the translation table kept for ``reader`` and the ``/Title`` found in
    its ``/I`` dictionary matches ``fltr``.

    Args:
        fltr: Compiled pattern or pattern string; any other value is
            replaced by the empty pattern.
        pages: Merged pages, keyed by the object number of the page in
            the reader.
        reader: Document the articles originate from.

    """
    if isinstance(fltr, str):
        title_pattern = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        title_pattern = fltr
    else:
        title_pattern = re.compile("")

    for merged_page in pages.values():
        source_page = merged_page.original_page
        for bead_ref in source_page.get("/B", ()):
            bead = bead_ref.get_object()
            if is_null_or_none(bead):
                continue
            thread = bead.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # Threads already present in the translation table are skipped.
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            if title_pattern.search((thread.get("/I", {})).get("/Title", "")):
                self._add_articles_thread(thread, pages, reader)

2974

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Look up the merged counterpart of a page of the source document.

    Args:
        page: Page of the source document, given either as a dictionary
            whose ``/Type`` is ``/Page`` or as an indirect reference.
        pages: Merged pages, keyed by the object number of the page in
            the source document.
        reader: Source document (not used by the lookup itself).

    Returns:
        The indirect reference of the matching entry of ``pages``, or
        ``None`` when ``page`` is a null object, is of an unsupported
        kind, carries no indirect reference, or is not part of ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # Neither a page dictionary nor a reference (e.g. ``None``):
        # there is nothing to look up.
        return None
    if source_ref is None:
        return None
    cloned = pages.get(source_ref.idnum)
    if cloned is None:
        return None
    return cloned.indirect_reference

2991

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations that remain valid in this writer.

    Annotations without a destination are cloned as they are. Annotations
    pointing to a named destination are kept only when that name is found
    in ``self.get_named_dest_root()``. Annotations pointing to a page are
    kept only when the page has a clone in ``pages``; the destination is
    then rewritten to reference the cloned page.

    Args:
        annots: Annotation list of the source page, possibly given as an
            indirect reference to it.
        page: Not used by this method.
        pages: Merged pages, keyed by the object number of the page in
            the reader.
        reader: Document the annotations originate from.

    Returns:
        An array holding the references of the cloned annotations; it is
        empty when ``annots`` is ``None`` or not a list.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(
            "Expected list of annotations, got %(annots)s of type %(annots_type)s.",
            source=__name__,
            annots=annots,
            annots_type=annots.__class__.__name__,
        )
        return kept

    for entry in annots:
        annotation = cast("DictionaryObject", entry.get_object())
        # A link whose target is carried by a /GoTo action and that has no
        # direct /Dest entry.
        is_goto_action_link = (
            annotation["/Subtype"] == "/Link"  # type: ignore[comparison-overlap]
            and "/A" in annotation
            and cast("DictionaryObject", annotation["/A"])["/S"] == "/GoTo"  # type: ignore[comparison-overlap]
            and "/Dest" not in annotation
        )

        if is_goto_action_link:
            target = cast("DictionaryObject", annotation["/A"]).get("/D", NullObject())
            if is_null_or_none(target):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annotation.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            cloned_page = self._get_cloned_page(target[0], pages, reader)
            if cloned_page is not None:
                copy = annotation.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", copy["/A"])[
                    NameObject("/D")
                ] = ArrayObject([cloned_page, *target[1:]])
                kept.append(self._add_object(copy))
            continue

        if "/Dest" not in annotation:
            kept.append(self._add_object(annotation.clone(self)))
            continue

        target = annotation["/Dest"]
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annotation.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        cloned_page = self._get_cloned_page(target[0], pages, reader)
        if cloned_page is not None:
            copy = annotation.clone(self, ignore_fields=("/Dest",))
            copy[NameObject("/Dest")] = ArrayObject([cloned_page, *target[1:]])
            kept.append(self._add_object(copy))
    return kept

3053

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is an ``/Outlines`` dictionary, or that has no ``/Title``,
    is treated as a container and only its ``/First`` chain is explored.
    Otherwise the node and its ``/Next`` siblings are walked; an item is
    kept when its page has a clone in ``pages`` or when at least one of
    its children has been kept.

    Args:
        node: Outline node of the reader (``None`` is accepted).
        pages: Merged pages, keyed by the object number of the page in
            the reader.
        reader: Document the outline originates from.

    Returns:
        A list of destination objects.

    """
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    is_container = node.get("/Type", "") == "/Outlines" or "/Title" not in node
    if is_container:
        first_child = node.get("/First", None)
        if first_child is None:
            return []
        return list(
            self._get_filtered_outline(first_child.get_object(), pages, reader)
        )

    kept = []
    current = node
    while current is not None:
        current = current.get_object()
        item = cast("Destination", reader._build_outline_item(current))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        # Items whose page was not merged point to a null page.
        item[NameObject("/Page")] = NullObject() if cloned_ref is None else cloned_ref
        if "/First" in current:
            item._filtered_children = self._get_filtered_outline(
                current["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        if (
            not isinstance(item["/Page"], NullObject)
            or len(item._filtered_children) > 0
        ):
            kept.append(item)
        current = current.get("/Next", None)
    return kept

3105

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Build a new outline item of this writer from a filtered destination.

    The title is always copied. When the destination page is not null,
    either the ``/A`` action of the original node is cloned or, without
    such an action, ``dest.dest_array`` is stored as ``/Dest``. When the
    original node is available, ``/F`` and ``/C`` are copied from it,
    defaulting to ``0`` and to three ``0.0`` components.

    Args:
        dest: Destination to clone.

    Returns:
        The new outline item, already added to this writer.

    """
    clone = TreeObject()
    self._add_object(clone)
    clone[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    has_page = not isinstance(dest["/Page"], NullObject)
    if has_page and source_node is not None and "/A" in source_node:
        clone[NameObject("/A")] = source_node["/A"].clone(self)
    elif has_page:
        clone[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        clone[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        clone[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return clone

3124

def _insert_filtered_outline(
    self,
    outlines: list[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, with their children, under ``parent``.

    Args:
        outlines: Items to insert; the children of each item are read
            from its ``_filtered_children`` attribute.
        parent: Outline node receiving the items.
        before: Passed to ``insert_child`` for the items of this level;
            children are always inserted with ``None``.

    """
    for entry in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_real_item = entry.get("/Type", "") != "/Outlines" and "/Title" in entry
        if is_real_item:
            children_parent = self._clone_outline(entry)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(children_parent, before, self)
        else:
            # Container entries add no level: their children go to ``parent``.
            children_parent = parent
        self._insert_filtered_outline(entry._filtered_children, children_parent, None)

3140

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3144

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position in the outline tree.

    A node matches when its indirect reference or its ``/Title`` compares
    equal to ``outline_item``.

    Args:
        outline_item: Value compared with the nodes.
        root: Node to start from; the outline root of this writer is used
            when omitted.

    Returns:
        The list of sibling indexes leading to the item, one per level
        (levels whose node has no ``/Title`` add no index), or ``None``
        when the item is not found.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        ):
            return [index]
        if "/First" in node:
            nested = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if nested:
                own_level = [index] if "/Title" in node else []
                return own_level + nested
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover

3174

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither ``None``, a PdfReader nor an
            IndirectObject.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # A reader without a table is not an error: nothing to reset.
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # f-string so that the offending value really appears in the message
        raise Exception(f"invalid parameter {reader}")

3202

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               The default, ``0``, is accepted as well and means that no
               explicit start value is requested.

    Raises:
        ValueError: Neither style nor prefix is given, the page indexes
            are inconsistent or out of range, or ``start`` is invalid.

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_to < page_index_from:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if page_index_to >= len(self.pages):
        raise ValueError("page_index_to exceeds number of pages")

    # ``None`` and ``0`` both pass; any other value must be at least one.
    start_is_given = start is not None and start != 0
    if start_is_given and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3253

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is performed here; the arguments are used as given.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` (the default) or ``None`` no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is allowed by the signature and means "not given", exactly
    # like 0: it must not reach NumberObject().
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # First label of the document: start from a decimal label on page 0.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Give the pages following the range the default label, unless a label
    # already starts right after the range or the range ends the document.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3318

3319 def _repr_mimebundle_(

3320 self,

3321 include: Union[None, Iterable[str]] = None,

3322 exclude: Union[None, Iterable[str]] = None,

3323 ) -> dict[str, Any]:

3324 """

3325 Integration into Jupyter Notebooks.

3326

3327 This method returns a dictionary that maps a mime-type to its

3328 representation.

3329

3330 .. seealso::

3331

3332 https://ipython.readthedocs.io/en/stable/config/integrating.html

3333 """

3334 pdf_data = BytesIO()

3335 self.write(pdf_data)

3336 data = {

3337 "application/pdf": pdf_data,

3338 }

3339

3340 if include is not None:

3341 # Filter representations based on include list

3342 data = {k: v for k, v in data.items() if k in include}

3343

3344 if exclude is not None:

3345 # Remove representations based on exclude list

3346 data = {k: v for k, v in data.items() if k not in exclude}

3347

3348 return data

3349

3350

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the corresponding PDF object.

    Values that already are PDF objects are returned unchanged.
    Dictionaries and lists are converted recursively, strings starting
    with ``/`` become names and other strings become text strings, and
    both floats and integers become float objects.

    Args:
        obj: Value to convert.

    Returns:
        The PDF object built from ``obj``.

    Raises:
        NotImplementedError: ``obj`` is of an unsupported type.

    """
    converted: PdfObject
    if isinstance(obj, PdfObject):
        converted = obj
    elif isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
    elif isinstance(obj, str):
        converted = NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    elif isinstance(obj, (float, int)):
        converted = FloatObject(obj)
    elif isinstance(obj, list):
        converted = ArrayObject([_pdf_objectify(item) for item in obj])
    else:
        raise NotImplementedError(
            f"{type(obj)=} could not be cast to a PdfObject"
        )
    return converted

3370

3371

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: Stored as ``/A`` when not ``None``.
        title: Text stored as ``/Title``.
        color: Three color components, or a string converted with
            ``hex_to_rgb``; stored as ``/C``. Falsy values add no color.
        italic: Adds ``OutlineFontFlag.italic`` to the ``/F`` entry.
        bold: Adds ``OutlineFontFlag.bold`` to the ``/F`` entry.

    Returns:
        The new outline item; it is not attached to any document.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        components = hex_to_rgb(color) if isinstance(color, str) else color
        item.update(
            {NameObject("/C"): ArrayObject([FloatObject(c) for c in components])}
        )

    if italic or bold:
        # /F is only written when at least one of the flags is requested.
        italic_bit = OutlineFontFlag.italic if italic else 0
        bold_bit = OutlineFontFlag.bold if bold else 0
        item.update({NameObject("/F"): NumberObject(italic_bit + bold_bit)})
    return item

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%

1444 statements