Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import uuid

36from collections.abc import Iterable, Mapping

37from io import BytesIO, FileIO, IOBase

38from itertools import compress

39from pathlib import Path

40from re import Pattern

41from types import TracebackType

42from typing import (

43 IO,

44 Any,

45 Callable,

46 Optional,

47 Union,

48 cast,

49)

51from ._cmap import _default_fonts_space_width, build_char_map_from_dict

52from ._doc_common import DocumentInformation, PdfDocCommon

53from ._encryption import EncryptAlgorithm, Encryption

54from ._page import PageObject, Transformation

55from ._page_labels import nums_clear_range, nums_insert, nums_next

56from ._reader import PdfReader

57from ._utils import (

58 StrByteType,

59 StreamType,

60 _get_max_pdf_version_header,

61 deprecation_no_replacement,

62 logger_warning,

63)

64from .constants import AnnotationDictionaryAttributes as AA

65from .constants import CatalogAttributes as CA

66from .constants import (

67 CatalogDictionary,

68 GoToActionArguments,

69 ImageType,

70 InteractiveFormDictEntries,

71 OutlineFontFlag,

72 PageLabelStyle,

73 PagesAttributes,

74 TypFitArguments,

75 UserAccessPermissions,

76)

77from .constants import Core as CO

78from .constants import FieldDictionaryAttributes as FA

79from .constants import PageAttributes as PG

80from .constants import TrailerKeys as TK

81from .errors import PdfReadError, PyPdfError

82from .generic import (

83 PAGE_FIT,

84 ArrayObject,

85 BooleanObject,

86 ByteStringObject,

87 ContentStream,

88 DecodedStreamObject,

89 Destination,

90 DictionaryObject,

91 EmbeddedFile,

92 Fit,

93 FloatObject,

94 IndirectObject,

95 NameObject,

96 NullObject,

97 NumberObject,

98 PdfObject,

99 RectangleObject,

100 ReferenceLink,

101 StreamObject,

102 TextStringObject,

103 TreeObject,

104 ViewerPreferences,

105 create_string_object,

106 extract_links,

107 hex_to_rgb,

108 is_null_or_none,

109)

110from .pagerange import PageRange, PageRangeSpec

111from .types import (

112 AnnotationSubtype,

113 BorderArrayType,

114 LayoutType,

115 OutlineItemType,

116 OutlineType,

117 PagemodeType,

118)

119from .xmp import XmpInformation

120

# Value returned by UserAccessPermissions.all(); presumably the flag set with
# every permission enabled -- confirm against the constants module.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
# Font height used for multiline form fields when the resolved font size is 0
# (see the font-height fallback in PdfWriter._update_field_annotation).
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12

123

124

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of PDF content.

    Each category owns one bit, so members can be combined with ``|``.
    How the flags are consumed is not visible in this part of the file;
    judging by the class name they select what to delete.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite member: the union of the three image categories above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

136

137

def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Return the MD5 hex digest of everything left to read from ``stream``.

    The stream is consumed from its current position in chunks of at most
    ``blocksize`` bytes until a read yields ``b""``. The digest is created
    with ``usedforsecurity=False``, i.e. it is a checksum, not a security
    primitive.

    Args:
        stream: Binary stream to read from; it is left positioned at its end.
        blocksize: Maximum number of bytes requested per read.

    Returns:
        The hexadecimal MD5 digest of the bytes read.

    """
    digest = hashlib.md5(usedforsecurity=False)
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()

143

144

145class PdfWriter(PdfDocCommon):

146 """

147 Write a PDF file out, given pages produced by another class or through

148 cloning a PDF file during initialization.

149

150 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

151

152 Args:

153 clone_from: identical to fileobj (for compatibility)

154

155 incremental: If true, loads the document and set the PdfWriter in incremental mode.

156

157 When writing incrementally, the original document is written first and new/modified

158 content is appended. To be used for signed document/forms to keep signature valid.

159

160 full: If true, loads all the objects (always full if incremental = True).

161 This parameter may allow loading large PDFs.

162

163 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.

164 If false, pypdf will try to be forgiving and do something reasonable, but it will log

165 a warning message. It is a best-effort approach.

166

167 """

168

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
    strict: bool = False,
) -> None:
    # Build the writer state: either an empty document (fresh page tree and
    # catalog) or a clone of the document designated by fileobj/clone_from.
    self.strict = strict
    """
    If true, pypdf will raise an exception if a PDF does not follow the specification.
    If false, pypdf will try to be forgiving and do something reasonable, but it will log
    a warning message. It is a best-effort approach.
    """

    # ``full`` takes the same loading path as ``incremental``; the flag is
    # reset to False at the end of __init__ when only ``full`` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: list[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: list[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: dict[int, dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Declared only; assigned in the non-incremental branch below
    # (presumably by the cloning step otherwise -- not visible here).
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
    "Tracks links in pages added to the writer for resolving later."
    self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
    "Tracks pages added to the writer and what page they turned into."

    if self.incremental:
        # Normalise the input step by step: path -> in-memory bytes -> PdfReader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header and an /Info dictionary naming the producer.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which source (if any) the writer should be cloned from.
        # An explicit clone_from wins; an empty-string fileobj means "no source".
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A path that does not exist or points at an empty file is not cloned.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        # An empty stream is not cloned either; the stream position is
        # restored after probing its size with seek(0, 2).
        if isinstance(fileobj, (IOBase, BytesIO)):
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
            NameObject(PagesAttributes.COUNT): NumberObject(0),
            NameObject(PagesAttributes.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        # Wrap anything that is not already a reader, then copy its document.
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # No source: register the empty page tree and a catalog pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # ``full`` alone must not leave the writer flagged as incremental.
    if full and not incremental:
        self.incremental = False
    # Convert text-string ID entries to byte strings built from their original bytes.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

306

307 # for commonality

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    This writer implementation always returns ``False``; it does not
    inspect any encryption state.
    """
    return False

317

@property
def root_object(self) -> DictionaryObject:
    """
    Provide direct access to PDF Structure.

    Note:
        Recommended only for read access.

    Returns:
        The dictionary stored in ``self._root_object`` (the same object,
        not a copy).

    """
    return self._root_object

328

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Resolve the writer's "/Info" entry, mirroring the PdfReader attribute.

    Returns:
        The resolved /Info dictionary, or None when no info object is set.

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    # Assigning a value replaces the *content* of the existing info
    # dictionary (creating one first if needed); assigning None drops it.
    if value is not None:
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        target = cast(DictionaryObject, self._info_obj.get_object())
        target.clear()
        target.update(cast(DictionaryObject, value.get_object()))
        return

    # Best effort: blank the slot held by the current info object, if any.
    try:
        self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
    except (KeyError, AttributeError):
        pass
    self._info_obj = None

358

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    catalog = self.root_object
    return cast(XmpInformation, catalog.xmp_metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:
    """
    Replace or remove the XMP metadata stream.

    ``None`` removes the "/Metadata" entry. Otherwise the data is written
    into the stream "/Metadata" refers to; a new stream is registered first
    when the entry is missing or is not an indirect reference.
    """
    catalog = self.root_object
    if value is None:
        if "/Metadata" in catalog:
            del catalog["/Metadata"]
        return

    current = catalog.get("/Metadata", None)
    if isinstance(current, IndirectObject):
        metadata_stream = cast(StreamObject, current.get_object())
    else:
        # A direct (non-referenced) entry is discarded and replaced.
        if current is not None:
            del catalog["/Metadata"]
        metadata_stream = StreamObject()
        catalog[NameObject("/Metadata")] = self._add_object(metadata_stream)

    payload = value.stream.get_data() if isinstance(value, XmpInformation) else value
    metadata_stream.set_data(payload)

387

@property
def with_as_usage(self) -> bool:
    # Deprecated accessor for the flag that __enter__ sets to True.
    # deprecation_no_replacement is called on every read (its effect is
    # defined in ._utils; presumably it warns or raises -- confirm there).
    deprecation_no_replacement("with_as_usage", "5.0")
    return self._with_as_usage

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    # Deprecated: same deprecation call as the getter, then store the flag.
    deprecation_no_replacement("with_as_usage", "5.0")
    self._with_as_usage = value

397

def __enter__(self) -> "PdfWriter":
    """Store how writer is initialized by 'with'."""
    # Remember the clone flag and the file object originally handed to
    # __init__, because the re-initialisation below resets both.
    c: bool = self._cloned
    t = self.temp_fileobj
    # Re-run __init__ with default arguments; this resets the state that
    # __init__ sets up.
    self.__init__()  # type: ignore
    self._cloned = c
    self._with_as_usage = True
    # fileobj is the target __exit__ writes to (unless the writer was cloned).
    self.fileobj = t  # type: ignore
    return self

407

def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Write the document to ``self.fileobj`` when leaving the ``with`` block.

    Nothing is written when no target is set or when the writer was created
    by cloning. The exception arguments are not inspected, and exceptions
    are not suppressed (the method returns None).
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)

417

@property
def pdf_header(self) -> str:
    """
    Read/Write property holding the PDF header that is written.

    The value should look like ``'%PDF-1.5'``. It is recommended to choose
    the lowest version that supports every feature used in the PDF file.

    Note: reading `pdf_header` yields a str, while assignment accepts
    either bytes or str.
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    # The header is stored as bytes; str input is encoded, bytes kept as-is.
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

436

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    An object that already carries a reference belonging to this writer is
    not registered again; its existing reference is returned.

    Args:
        obj: The object to register.

    Returns:
        The indirect reference under which ``obj`` is stored.

    """
    known_ref = getattr(obj, "indirect_reference", None)
    if known_ref is not None and known_ref.pdf == self:  # type: ignore
        return known_ref  # type: ignore
    # check for /Contents in Pages (/Contents in annotations are strings):
    # a directly embedded array/dictionary is registered on its own and
    # replaced by the reference it receives.
    if isinstance(obj, DictionaryObject):
        embedded = obj.get(PG.CONTENTS, None)
        if isinstance(embedded, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    new_ref = IndirectObject(len(self._objects), 0, self)
    obj.indirect_reference = new_ref
    return new_ref

451

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Look up an object stored in this writer.

    Args:
        indirect_reference: Either the 1-based object number or an
            IndirectObject belonging to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If an IndirectObject of another document is passed.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    found = self._objects[idnum - 1]
    assert found is not None, "mypy"
    return found

464

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Store ``obj`` in the slot of an existing indirect object.

    The generation number of the object currently in the slot is kept.
    An ``obj`` that belongs to another document is cloned into this writer
    first.

    Args:
        indirect_reference: 1-based object number or an IndirectObject of
            this writer.
        obj: The replacement object.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: If an IndirectObject of another document is passed.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    generation = self._objects[slot].indirect_reference.generation  # type: ignore
    owner_ref = getattr(obj, "indirect_reference", None)
    if owner_ref is not None and obj.indirect_reference.pdf != self:  # type: ignore
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, generation, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

485

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    # Clone ``page`` into this writer, hook the clone into the page tree at
    # ``index``, update the /Count of the containing node and its ancestors,
    # and return the clone.
    #
    # Raises ValueError when ``page`` is not a PageObject whose type entry
    # equals CO.PAGE, and PyPdfError when the parent chain exceeds 1000 nodes.
    if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    page_org = page
    # The parent link and /StructParents are never copied from the source page;
    # the parent is set explicitly below.
    excluded_keys = list(excluded_keys)
    excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore
            page_org.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        # Best effort: any failure here (e.g. no reference, no entry) is ignored.
        pass

    page = cast(
        "PageObject", page_org.clone(self, False, excluded_keys).get_object()
    )
    # Update this writer's header from the source document's header via
    # _get_max_pdf_version_header (presumably the higher version of the two).
    if page_org.pdf is not None:
        other = page_org.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

    # node is the page-tree node receiving the page; a negative idx means "append".
    node, idx = self._get_page_in_node(index)
    page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

    if idx >= 0:
        cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
        self.flattened_pages.append(page)
    # Walk up the parent chain, incrementing each node's count; the
    # counter guards against a cyclic parent chain.
    recurse = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
        node = node.get(PagesAttributes.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        recurse += 1
        if recurse > 1000:
            raise PyPdfError("Too many recursive calls!")

    if page_org.pdf is not None:
        # the page may contain links to other pages, and those other
        # pages may or may not already be added. we store the
        # information we need, so that we can resolve the references
        # later.
        self._unresolved_links.extend(extract_links(page, page_org))
        self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference

    return page

543

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag in the document's AcroForm dictionary.

    The flag tells a PDF viewer whether it should generate the appearance
    of form fields itself or use the embedded appearance. An AcroForm
    dictionary is created when the catalog has none yet. Failures are not
    raised; they are reported through ``logger_warning``.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # Make sure the AcroForm tree exists before writing into it.
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

576

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ViewerPreferences object and attach it to the catalog.

    Any existing viewer-preferences entry of the catalog is overwritten.

    Returns:
        The newly created ViewerPreferences object.

    """
    preferences = ViewerPreferences()
    reference = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = reference
    return preferences

583

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys handed to the page-cloning step as fields to
            leave out of the copy.

    Returns:
        The added PageObject.

    """
    current_pages = self.flattened_pages
    assert current_pages is not None, "mypy"
    end_position = len(current_pages)
    return self._add_page(page, end_position, excluded_keys)

608

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page at a given position of this PDF file. The page is usually
    acquired from a :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. Negative
            values count from the end; values at or past the end append.
        excluded_keys: Keys handed to the page-cloning step as fields to
            leave out of the copy.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative index reaches before the first page.

    """
    current_pages = self.flattened_pages
    assert current_pages is not None, "mypy"
    page_count = len(current_pages)
    if index < 0:
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index < page_count:
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)

636

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page an indirect reference points to.

    Offers the same function as the PdfReader counterpart.

    Args:
        indirect_reference: An object number of this writer, an
            IndirectObject, or a null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a PageObject.

    """
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    # A bare object number is interpreted as generation 0 of this writer.
    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

660

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))

685

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page already exists at ``index``, the
    size of that page is used for both dimensions. Otherwise the size is
    resolved by ``PageObject.create_blank_page``.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # The bounds check must guard the page lookup whenever *either*
    # dimension is missing. Previously a missing width with an index at or
    # past the end indexed self.pages out of range instead of reaching
    # create_blank_page.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # NOTE(review): the object returned is the one created above, not the
    # value returned by insert_page -- confirm callers expect that.
    self.insert_page(page, index)
    return page

719

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    # Delegates to the parent class; redefined so a setter can be attached.
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    # Store the destination under "/OpenAction" of the catalog; None
    # removes the entry. Values of any other type are ignored.
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[NameObject("/OpenAction")] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        # A page without an indirect reference is stored as a null target.
        if dest.indirect_reference is not None:
            page_target = dest.indirect_reference
        else:
            page_target = NullObject()
        opening = Destination("Opening", page_target, PAGE_FIT)
        self._root_object[NameObject("/OpenAction")] = opening.dest_array

745

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is stored as a JavaScript action in the catalog's name
    tree under a randomly generated name, so several scripts can be added.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, js_tree["/Names"])
    # The entry needs a name in the PDF file, but any name will do,
    # so a random UUID is used.
    entry_name = create_string_object(str(uuid.uuid4()))
    js_list.append(entry_name)

    action = DictionaryObject(
        {
            NameObject(PagesAttributes.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    js_list.append(self._add_object(action))

780

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.

    """
    # All work is delegated to EmbeddedFile._create_new, which receives this writer.
    return EmbeddedFile._create_new(self, filename, data)

798

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of a reader to the end of this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are added to this writer.
        after_page_append:
            Optional callback invoked after each page has been appended.
            Its single argument is the page as added to this writer (the
            value returned by ``add_page``).

    """
    # The page count is taken once, before any page is copied.
    total = len(reader.pages)
    for position in range(total):
        appended = self.add_page(reader.pages[position])
        # Hand the writer-side page to the callback, if one was given.
        if callable(after_page_append):
            after_page_append(appended)

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) after the page's existing content stream(s).

    * No "/Contents" entry: a new stream holding the data is created.
    * "/Contents" is an array: a new stream is appended to the array.
    * "/Contents" is a single stream: it is replaced by a new stream with
      the old data, a newline and the new data.
    * Any other kind of "/Contents": the page is left unchanged.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    if NameObject("/Contents") not in page:
        # Empty page: the new data becomes its only content stream.
        only_stream = StreamObject()
        only_stream.set_data(new_content_data)
        page[NameObject("/Contents")] = self._add_object(only_stream)
        return

    # Resolve the existing page content (usually an indirect reference):
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    existing_content = page[NameObject("/Contents")].get_object()

    if isinstance(existing_content, ArrayObject):
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        existing_content.append(self._add_object(extra_stream))
        page[NameObject("/Contents")] = self._add_object(existing_content)
    elif isinstance(existing_content, StreamObject):
        combined = StreamObject()
        combined.set_data(existing_content.get_data() + b"\n" + new_content_data)
        page[NameObject("/Contents")] = self._add_object(combined)

869

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Draw an appearance stream on a page as a form XObject.

    The stream is registered in this writer, referenced from the page's
    "/XObject" resources as ``/Fm_<object_name>`` (sanitized), and a
    translated ``Do`` command is added to the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    # Make the font known to the page, keyed by its /BaseFont name
    # (a "/Name" entry often also exists, but is deprecated).
    if font_res is not None:
        font_name = font_res["/BaseFont"]
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res

    # Always register the stream with this writer so the reference used
    # below is one managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()
    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        # An entry of that name is kept as is; only a warning is logged.
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    # Paint the XObject at the requested offset, isolated by q ... Q.
    xobject_cm = Transformation().translate(x_offset, y_offset)
    drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, drawing_commands)

917

def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Regenerate the normal appearance stream (``/AP`` ``/N``) of a widget.

    The text is taken from the ``/V`` entry of ``field`` (or from the ``/Opt``
    entries for a non-combo ``/Ch`` field) and rendered with the font named
    in the default appearance string (``/DA``).

    Args:
        page: Page holding the widget; only used when ``flatten`` is set.
        field: Field dictionary providing the flags (``/Ff``), the type
            (``/FT``) and the value (``/V``).
        annotation: Widget annotation whose appearance is rebuilt.
        font_name: Font resource name overriding the one from ``/DA``;
            an empty string keeps the name found in ``/DA``.
        font_size: Font size overriding the one from ``/DA``; a negative
            value keeps the ``/DA`` size and ``0`` requests automatic sizing.
        flatten: If True, the generated appearance stream is also added
            to the page contents.

    """
    # Calculate rectangle dimensions
    _rct = cast(RectangleObject, annotation[AA.Rect])
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    # Extract font information
    da = annotation.get_inherited(
        AA.DA,
        cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
            AA.DA, None
        ),
    )
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    # In "/Name size Tf" the font name is two tokens before the operator
    # and the size one token before it.
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    font_height = (
        font_size
        if font_size >= 0
        else float(font_properties[font_properties.index("Tf") - 1])
    )
    if font_height == 0:
        # Size 0 means "auto": pick a size from the field kind / widget height.
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                cast(
                    DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
                ).get("/DR", DictionaryObject()),
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary
        # The default must be a DictionaryObject: a plain dict has no
        # get_object() and would raise when the AcroForm has no /DR entry.
        dr = cast(
            DictionaryObject,
            cast(
                DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
            ).get("/DR", DictionaryObject()),
        )
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    if not is_null_or_none(font_res):
        font_res = cast(DictionaryObject, font_res.get_object())
        _font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        # remove width stored in -1 key
        font_map.pop(-1, None)
        font_full_rev: dict[str, bytes]
        if isinstance(font_encoding, str):
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}
        # Normalise a null object to None so that no null font is written
        # into the resources or handed over when flattening.
        font_res = None

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over the extra entries of the previous normal appearance.
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary
    if font_res is not None:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        # Reuse the object number of the existing stream instead of adding one.
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)

1071

# Flag value with no bit set; used as the default of the ``flags``
# parameter of ``update_page_form_field_values``.
FFBITS_NUL = FA.FfBits(0)

1073

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `list[PageObject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document root has no /AcroForm dictionary, or
            the /AcroForm dictionary has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in af:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate is passed as None: the flag was already
                # handled above and must stay unchanged in the recursion.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is treated as its own field;
        # otherwise the field data is looked up in its parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            # Match either the qualified field name or the plain /T entry.
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Drop the /I entry of a choice field before its value is replaced.
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    # Tuple form (text, font id, font size): only the text is
                    # stored here, and it is stored on the widget itself.
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                # Fall back to /Off when the value names no normal appearance.
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
                if flatten and appearance_stream_obj is not None:
                    # We basically copy the entire appearance stream, which should be an XObject that
                    # is already registered. No need to add font resources.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # textbox
                if isinstance(value, tuple):
                    # value[1] is the font id and value[2] the font size.
                    self._update_field_annotation(
                        page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1195

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Find widget fields that are not listed in the AcroForm ``/Fields``
    array and append them to it.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created
    when they are missing.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached
    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # A direct object must be registered to get a reference first.
            annotations[position] = self._add_object(widget)
        fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1245

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Clone the document root of ``reader``, with all its sub-elements
    (pages, threads, outlines,...), into this writer. Use ``append``
    when only part of a document should be inserted.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        slot_count = len(self._objects)
        for slot in range(slot_count):
            source_obj = reader.get_object(slot + 1)
            if source_obj is not None:
                self._objects[slot] = source_obj.replicate(self)
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    if len(self._objects) > cast(int, reader.trailer["/Size"]):
        message = (
            f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}"
        )
        if self.strict:
            raise PdfReadError(message)
        logger_warning(message, __name__)

    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if entry is None else entry.hash_bin() for entry in self._objects
        ]

    try:
        self._flatten()
    except IndexError:
        raise PdfReadError("Got index error while flattening.")

    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_ref = cast(IndirectObject, flat_page.indirect_reference)
        self._replace_object(page_ref.idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        pages_root = cast(DictionaryObject, self._pages.get_object())
        kid_refs = [flat_page.indirect_reference for flat_page in self.flattened_pages]
        pages_root[NameObject("/Kids")] = ArrayObject(kid_refs)

1298

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a PDF reader: the '/Root', '/Info'
    and '/ID' sections of the pdf are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback function invoked once per page of the cloned
            document. Its single parameter is a reference to the
            page that has just been added to the document.

    """
    self.clone_reader_document_root(reader)
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            info_hash = self._info.hash_bin()
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = info_hash
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())

1346

def _compute_document_identifier(self) -> ByteStringObject:
    """Return the rolling checksum of the serialized PDF body as a byte string."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1352

def generate_file_identifiers(self) -> None:
    """
    Set the pair of file identifiers (``/ID``) of the PDF to be written.

    Only uniqueness matters here; the result does not need to be
    reproducible.
    On the first write both identifiers get the same value. Afterwards
    the first identifier is kept and only the second one is recomputed:
    two matching identifiers indicate the correct, unchanged file, while a
    match on the first identifier only indicates a different version of it.
    see §14.4 "File Identifiers".
    """
    if not self._ID:
        first_id = self._compute_document_identifier()
        second_id = first_id
    else:
        first_id = self._ID[0]
        second_id = self._compute_document_identifier()
    self._ID = ArrayObject((first_id, second_id))

1372

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` does not name a supported algorithm.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256-R5" style names map to attributes spelled with underscores.
        attribute_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, attribute_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    entry = self._encryption.write_entry(user_password, owner_password)
    previous_entry = self._encrypt_entry
    if not previous_entry:
        self._add_object(entry)
    else:
        # `encrypt` was called before: the new entry takes over the object
        # number of the old one.
        assert previous_entry.indirect_reference is not None
        entry.indirect_reference = previous_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    self._encrypt_entry = entry

1431

1432 def _resolve_links(self) -> None:

1433 """Patch up links that were added to the document earlier, to

1434 make sure they still point to the same pages.

1435 """

1436 for (new_link, old_link) in self._unresolved_links:

1437 old_page = old_link.find_referenced_page()

1438 if not old_page:

1439 continue

1440 new_page = self._merged_in_pages.get(old_page)

1441 if new_page is None:

1442 continue

1443 new_link.patch_reference(self, new_page)

1444

def write_stream(self, stream: StreamType) -> None:
    """
    Write the document to an already opened stream.

    A warning is logged when the stream exposes a ``mode`` without ``"b"``.
    In incremental mode, the reader's original bytes are copied first and
    an increment is appended only if some object is new or modified;
    otherwise the body, the cross-reference table and the trailer are
    written.

    Args:
        stream: The stream to write to.

    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    self._reader.stream.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1465

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened by this method
        (it is then already closed on return), and the stream written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must be closed even if writing fails,
        # otherwise the handle leaks.
        if my_file:
            stream.close()

    if not my_file:
        # The caller owns the stream: only flush it.
        stream.flush()

    return my_file, stream

1497

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    List the objects that an incremental write would output: those
    stored beyond the originally hashed range (new) and those whose
    hash differs from the recorded one (modified).
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    known_count = len(self._original_hash)
    changed: list[IndirectObject] = []
    for index, entry in enumerate(self._objects):
        if entry is None:
            continue
        if index >= known_count or entry.hash_bin() != self._original_hash[index]:
            changed.append(cast(IndirectObject, entry).indirect_reference)
    return changed

1521

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an increment to ``stream``: the new/modified objects, followed
    by a cross-reference stream, ``startxref`` and the end-of-file marker.

    An object is written when it lies beyond the originally hashed range
    or when its hash differs from the recorded one.

    Args:
        stream: The stream to write to; positions are taken from ``tell()``.

    """
    object_positions = {}
    # [first object number, count] pairs of consecutive written objects;
    # flattened into the /Index array below.
    object_blocks = []
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for i, obj in enumerate(self._objects):
        if obj is not None and (
            i >= original_hash_count
            or obj.hash_bin() != self._original_hash[i]
        ):
            idnum = i + 1
            assert isinstance(obj, PdfObject), "mypy"
            # first write new/modified object
            object_positions[idnum] = stream.tell()
            stream.write(f"{idnum} 0 obj\n".encode())
            """ encryption is not operational
            if self._encryption and obj != self._encrypt_entry:
                obj = self._encryption.encrypt_object(obj, idnum, 0)
            """
            obj.write_to_stream(stream)
            stream.write(b"\nendobj\n")

            # prepare xref
            # current_stop is one past the last written id: a different
            # idnum means a gap, so the running block is closed (if one
            # was started) and a new one begins.
            if idnum != current_stop:
                if current_start > 0:
                    object_blocks.append(
                        [current_start, current_stop - current_start]
                    )
                current_start = idnum
            current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    # close the last running block
    object_blocks.append([current_start, current_stop - current_start])
    # write incremented xref
    xref_location = stream.tell()
    # the xref stream takes the next object number after the existing ones
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(
            [NumberObject(_it) for _su in object_blocks for _it in _su]
        ),
        # field widths in bytes; they match the ">BIB" packing used below
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # /Info is only added when the info object is new or its hash changed
    if self._info is not None and (
        self._info.indirect_reference.idnum - 1  # type: ignore
        >= len(self._original_hash)
        or cast(IndirectObject, self._info).hash_bin()  # kept for future
        != self._original_hash[
            self._info.indirect_reference.idnum - 1  # type: ignore
        ]
    ):
        init_data[NameObject(TK.INFO)] = self._info.indirect_reference
    # chain to the cross-reference section of the original file
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # one entry per written object: 1, byte offset, 0
    xr.set_data(
        b"".join(
            [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1592

1593 def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:

1594 object_positions = []

1595 free_objects = []

1596 stream.write(self.pdf_header.encode() + b"\n")

1597 stream.write(b"%\xE2\xE3\xCF\xD3\n")

1598

1599 for idnum, obj in enumerate(self._objects, start=1):

1600 if obj is not None:

1601 object_positions.append(stream.tell())

1602 stream.write(f"{idnum} 0 obj\n".encode())

1603 if self._encryption and obj != self._encrypt_entry:

1604 obj = self._encryption.encrypt_object(obj, idnum, 0)

1605 obj.write_to_stream(stream)

1606 stream.write(b"\nendobj\n")

1607 else:

1608 object_positions.append(-1)

1609 free_objects.append(idnum)

1610 free_objects.append(0) # add 0 to loop in accordance with specification

1611 return object_positions, free_objects

1612

1613 def _write_xref_table(

1614 self, stream: StreamType, object_positions: list[int], free_objects: list[int]

1615 ) -> int:

1616 xref_location = stream.tell()

1617 stream.write(b"xref\n")

1618 stream.write(f"0 {len(self._objects) + 1}\n".encode())

1619 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())

1620 free_idx = 1

1621 for offset in object_positions:

1622 if offset > 0:

1623 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())

1624 else:

1625 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())

1626 free_idx += 1

1627 return xref_location

1628

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    The trailer dictionary always holds the size and root entries; the
    info, ID and encrypt entries are added when they are set. It is
    followed by ``startxref``, the given location and the end-of-file
    marker.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: The stream to write to.
        xref_location: Position of the cross-reference table.

    """
    stream.write(b"trailer\n")
    entry_count = NumberObject(len(self._objects) + 1)
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): entry_count,
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1652

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # None drops the info dictionary altogether.
    if value is None:
        self._info = None
        return

    # Otherwise the existing entries are discarded before the new ones are added.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1680

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is converted with ``str`` and stored as a string object;
    the info dictionary is created when it does not exist yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    entries = {}
    for name, raw_value in list(infos.items()):
        resolved = (
            raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        )
        entries[NameObject(name)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(entries)

1700

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk dictionaries and arrays: every reference met is marked as
        # used, and references to merged objects are redirected.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        return

    # Objects that are only referenced from the trailer must be kept.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # There may be no document information dictionary at all.
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    # The encryption dictionary, when present, is referenced by the trailer only.
    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1775

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to ``obj``, found by searching the
    writer's object list.

    Args:
        obj: The object to look up.

    Returns:
        A reference with generation 0 whose number is the list position
        of the object plus one.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1781

def get_outline_root(self) -> TreeObject:
    """
    Return the outline tree registered in the document root.

    A new empty tree is added to the writer and to the root when the
    root has no outlines entry; an existing entry that is not a
    ``TreeObject`` is replaced by one built from it.

    Returns:
        The outline root object.

    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    position = self._objects.index(outline)
    outline_ref = IndirectObject(position + 1, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1800

def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    An empty array is stored in the document root when it has no
    threads entry yet.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = threads
        return threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1819

@property
def threads(self) -> ArrayObject:
    """
    The list of threads (read-only property).

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary holding an ``/F`` key and, optionally,
    thread information under the ``/I`` or ``/Metadata`` keys.
    """
    thread_list = self.get_threads_root()
    return thread_list

1831

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item object into the outline tree.

    Args:
        page_destination: The outline item to insert. When it resolves to a
            ``PageObject``, a ``Destination`` titled ``page #<n>`` with the
            default fit is built for it and inserted instead.
        parent: Outline item under which to insert; the outline root is
            used when ``None``.
        before: Forwarded (as its indirect reference) to
            ``parent.insert_child``; presumably the sibling in front of
            which the new item is placed.
        is_open: Stored on the item under ``/%is_open%``; when ``False``
            the parent counters are not incremented on insertion.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open as well; they used to be dropped
        # here, so the item always ended up open at the end of the root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        # A closed item must not change the visible-descendant counters.
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1869

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``,
    which is then inserted through :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping with the outline item's entries.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference of the inserted outline item.

    """
    item_tree = TreeObject()
    item_tree.update(outline_item)
    # NOTE: the original carried a disabled ("currently unreachable") snippet
    # here that copied an "/A" action dictionary into its own indirect object.
    return self.add_outline_item_destination(item_tree, parent, before, is_open)

1892

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject``, an ``IndirectObject``, or ``None`` for an
            item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination`;
            presumably the sibling in front of which the item is inserted.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    # Start from None so that an unsupported ``page_number`` type reaches the
    # "can not find reference" warning below instead of an UnboundLocalError.
    page_ref: Union[None, NullObject, IndirectObject, NumberObject] = None
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional call: re-dispatch with the arguments shifted
        # into the positions of the current signature.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Page does not exist (yet): keep the raw index.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1970

def add_outline(self) -> None:
    """
    Placeholder that is not implemented.

    Raises:
        NotImplementedError: Always.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1975

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``(title, destination)`` pair into the named destination list.

    The list returned by ``get_named_dest_root()`` is walked two entries at
    a time (name, value). The pair is inserted in front of the first name
    that compares greater than ``title``, or appended if there is none.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Insert the value first so the name ends up in front of it.
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1989

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title``
            entry; the title is used as the destination name.

    Returns:
        The indirect reference of the stored destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_array_ref)
    return dest_array_ref

2000

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Add a named destination pointing at a page.

    A GoTo action dictionary is created whose destination is
    ``[page, FIT_H, 826]`` and registered under ``title``.

    Args:
        title: Name of the destination.
        page_number: Index into the ``KIDS`` array of the root pages node.

    Returns:
        The indirect reference of the created action dictionary.

    """
    page_ref = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    go_to = DictionaryObject()
    go_to.update(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    go_to_ref = self._add_object(go_to)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, go_to_ref)
    return go_to_ref

2023

def remove_links(self) -> None:
    """Remove links and annotations from this output."""
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page, ObjectDeletionFlag.ALL_ANNOTATIONS
        )

2028

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            Pass ``None`` to remove all annotations.

    """
    for current_page in self.pages:
        self._remove_annots_from_page(current_page, subtypes)

2044

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations of the given subtypes from one page.

    Args:
        page: Page (or reference to it) whose annotations array is edited.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    index = 0
    while index < len(cast(ArrayObject, page[PG.ANNOTS])):
        annot = cast(ArrayObject, page[PG.ANNOTS])[index]
        annot_dict = cast(DictionaryObject, annot.get_object())
        if subtypes is not None and cast(str, annot_dict["/Subtype"]) not in subtypes:
            # Not selected: keep it and move on to the next entry.
            index += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        # The index is not advanced: the next entry slides into this slot.
        del page[PG.ANNOTS][index]  # type:ignore

2062

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list/tuple of ObjectDeletionFlag (each one is processed
            in turn).
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward text_filters too; it used to be dropped here, so font
            # filtering was silently ignored when flags came as a list.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-type flags are handled on the annotations array only.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped for the requested flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: list[str],
        forms: list[str],
        text_filters: Optional[dict[str, Any]] = None
    ) -> None:
        # Delete the selected operations from ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # Remember the font currently selected in the stream.
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: list[DictionaryObject]
    ) -> tuple[list[str], list[str]]:
        # Walk the XObjects of ``elt``, cleaning nested forms recursively;
        # returns the names of the image and form XObjects found.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                # Best effort: XObjects without the expected entries are skipped.
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2235

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` value is treated
            as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits to the equally named deletion flags.
    flags = ObjectDeletionFlag.NONE
    for member_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member_name]:
            flags |= ObjectDeletionFlag[member_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, flags)

2259

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    wanted_fonts = font_names if font_names else []

    # Recursively walk an object tree and collect, per normalized font
    # name, the resource keys under which a font dictionary was found.
    def get_font_info(
        obj: Any,
        font_info: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        if font_info is None:
            font_info = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = obj.get("/BaseFont", "").lstrip("/").split("+")[-1]
                entry = font_info.setdefault(
                    normalized,
                    {"normalized_font_name": normalized, "resource_ids": []},
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                font_info = get_font_info(obj[child_key], font_info, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                font_info = get_font_info(child, font_info)
        return font_info

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted_fonts:
            fonts_on_page = get_font_info(page.get("/Resources"))
            for wanted in wanted_fonts:
                if wanted in fonts_on_page:
                    resource_ids_to_remove.extend(fonts_on_page[wanted]["resource_ids"])

        text_filters = {"font_ids": resource_ids_to_remove} if wanted_fonts else {}
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2316

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted, the
            border array ``[2, 2, 2]`` is used.

    Raises:
        ValueError: ``rect`` is a string that does not contain exactly
            four numbers.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into a real rectangle. The string used
        # to be passed to NumberObject, which cannot represent an array.
        coordinates = rect.replace("[", " ").replace("]", " ").split()
        if len(coordinates) != 4:
            raise ValueError(
                f"rect must contain exactly four numbers, got {rect!r}"
            )
        rect = RectangleObject(tuple(float(c) for c in coordinates))
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    # Attach the link to the page, creating the annotations array if needed.
    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2383

# Page layout names that ``_set_page_layout`` accepts without a warning.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)

2393

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the catalog's ``/PageLayout`` entry, or ``None`` if absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)

2399

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is
    still written to the catalog.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # The former ``{'', ''.join(...)}`` rendered a 2-tuple; join the
            # names with ", " as the page mode setter does.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2434

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (delegates to ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    self._set_page_layout(layout)
    return None

2462

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    The page layout stored in the catalog, or ``None`` when not set.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

2487

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same path as ``set_page_layout``.
    self._set_page_layout(layout)
    return None

2491

# Page mode names that the ``page_mode`` setter accepts without a warning.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)

2500

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the catalog's ``/PageMode`` entry, or ``None`` if absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)

2506

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The page mode stored in the catalog, or ``None`` when not set.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

2529

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # A plain string is checked against ``_valid_modes`` (warning only)
    # and wrapped in a NameObject; a NameObject is stored as given.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2541

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
        assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    if to_add.get("/Subtype") == "/Link" and "/Dest" in to_add:
        raw_dest = cast(dict[Any, Any], to_add[NameObject("/Dest")])
        # The plain-dict copy for "fit_args" is kept from the original code.
        fit = Fit(fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"])
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            fit,
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    # A popup is registered on its parent annotation as well.
    if to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add:
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2595

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    Args:
        page: The page, or a reference to it, whose annotations are
            inspected.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        # No name destination on the annotation itself: check its action.
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2622

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``) that is opened and read,
            a ``PdfReader`` whose stream is copied (its position is
            restored afterwards), or any object with ``seek`` and
            ``read`` methods.

    Returns:
        A ``(stream, encryption)`` tuple; ``encryption`` is the reader's
        ``_encryption`` when it is truthy, otherwise ``None``.

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as source:
            return BytesIO(source.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption if fileobj._encryption else None
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        copied = BytesIO(fileobj.stream.read())

        # reset the stream to its original location
        fileobj.stream.seek(saved_position)
        return copied, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2658

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file. A tuple, list or ``PageRange`` given here is
            taken as ``pages`` and the remaining arguments are shifted.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    if isinstance(outline_item, (tuple, list, PageRange)):
        # The page selection was passed in the ``outline_item`` slot:
        # shift the following arguments one position to the left.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_item = None

    self.merge(
        None,
        fileobj,
        outline_item,
        pages,
        import_outline,
        excluded_fields,
    )

2726

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    Besides the pages themselves, the named destinations, the outline,
    the annotations, the ``/AcroForm`` fields and the article threads that
    refer to the merged pages are carried over (subject to
    ``import_outline`` and ``excluded_fields``).

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. When ``None``, the pages
            are added with ``add_page`` instead of ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file. An object that is
            already a ``PdfDocCommon`` instance is used directly as the
            source reader.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document. ``None`` selects every page
            of the source.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored.
            ``None`` is treated like an empty tuple.

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )
    # NOTE(review): a tuple with more than three items matches none of the
    # branches above, so it is neither converted nor rejected and is
    # iterated as given by the loop below.

    # Maps the object number of each source page to its clone in this writer.
    srcpages = {}
    for page in pages:
        # Entries may be PageObject instances or indexes into reader.pages.
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            # Advance so that the next page is inserted behind this one.
            position += 1
        # Remember the source page; it is read again below for the
        # annotations and by add_filtered_articles for the article beads.
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    # Declared here for the /AcroForm section further down; the ``arr``
    # assigned inside the nested function is a separate local variable.
    arr: Any

    def _process_named_dests(dest: Any) -> None:
        # Copy one named destination of the source into this writer when it
        # is not already present and its target page has been merged.
        arr = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            list[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            pass
        elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
            pass
        elif isinstance(dest["/Page"], int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[dest["/Page"]]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                # The referenced page is not part of the merged pages.
                pass
        elif dest["/Page"].indirect_reference.idnum in srcpages:
            # Retarget the destination to the cloned page.
            arr[NumberObject(0)] = srcpages[
                dest["/Page"].indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Parent under which the imported outline is inserted: either a new
    # outline item named ``outline_item`` or the outline root.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # NOTE(review): next(iter(...)) raises StopIteration when no page
        # was merged (srcpages is empty).
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            # Annotations were excluded from the first cloning level above;
            # they are re-created here from the source page.
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # Clone the source form dictionary without its /Fields entry;
            # the field list is rebuilt below.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # Translation table from source object numbers to object numbers in
        # this writer, for the objects of ``reader`` cloned so far.
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        # An empty filter string matches every thread title.
        self.add_filtered_articles("", srcpages, reader)

2916

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The beads of the source thread are walked through their ``/N`` links,
    starting at ``/F``. For every bead whose page has a clone in ``pages``
    a new bead is created, linked to the previously created one and
    registered in the ``/B`` array of the cloned page. When the walk comes
    back to the first bead, the new beads are closed into a ring.

    Args:
        thread: thread dictionary of the source document.
        pages: cloned pages, keyed by the object number of their source page.
        reader: the document ``thread`` belongs to.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # First bead kept: it becomes the head of the new thread.
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # Later bead: link it behind the previously created one.
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source ring. Close the new ring only
            # if at least one bead was kept; otherwise there is nothing to
            # link and the previous code failed on ``None``.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2981

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (or its source string) applied to thread titles
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    For each bead listed in the ``/B`` array of the source pages, the
    owning thread is added when it has no entry yet in the translation
    table of ``reader`` and its ``/I`` ``/Title`` matches ``fltr``.

    Args:
        fltr: pattern searched in the thread title; a string is compiled,
            any other non-pattern value is replaced by the empty pattern.
        pages: cloned pages, each carrying its source page as
            ``original_page``.
        reader: the document the source pages belong to.

    """
    if isinstance(fltr, str):
        matcher = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        matcher = fltr
    else:
        matcher = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thread_ref = bead.get_object().get("/T")
            if thread_ref is None:
                continue
            thread_obj = thread_ref.get_object()
            if thread_obj.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thread_obj.get("/I", {})).get("/Title", "")
            if matcher.search(title):
                self._add_articles_thread(thread_obj, pages, reader)

3014

3015 def _get_cloned_page(

3016 self,

3017 page: Union[None, IndirectObject, PageObject, NullObject],

3018 pages: dict[int, PageObject],

3019 reader: PdfReader,

3020 ) -> Optional[IndirectObject]:

3021 if isinstance(page, NullObject):

3022 return None

3023 if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":

3024 _i = page.indirect_reference

3025 elif isinstance(page, IndirectObject):

3026 _i = page

3027 try:

3028 return pages[_i.idnum].indirect_reference # type: ignore

3029 except Exception:

3030 return None

3031

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations of a source page that remain valid after merging.

    Annotations without any destination are cloned as they are. Annotations
    with a destination (a direct ``/Dest`` entry, or the ``/D`` entry of a
    ``/GoTo`` action on a ``/Link``) are kept only when the destination is
    a name known to this writer or targets a page that has a clone in
    ``pages``; explicit destinations are rewritten to the cloned page.

    Args:
        annots: the ``/Annots`` value of the source page, possibly given as
            an indirect reference.
        page: the cloned page (not used by the visible code).
        pages: cloned pages, keyed by the object number of their source page.
        reader: the source document.

    Returns:
        The array of annotations to store on the cloned page.
        NOTE(review): the value returned is an ``ArrayObject``, although the
        annotation says ``list[Destination]``.

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist
    for an in annots:
        ano = cast("DictionaryObject", an.get_object())
        # First branch: everything that is NOT a /Link carrying a /GoTo
        # action without a direct /Dest entry.
        if (
            ano["/Subtype"] != "/Link"
            or "/A" not in ano
            or cast("DictionaryObject", ano["/A"])["/S"] != "/GoTo"
            or "/Dest" in ano
        ):
            if "/Dest" not in ano:
                # No destination to fix: plain clone.
                outlist.append(self._add_object(ano.clone(self)))
            else:
                d = ano["/Dest"]
                if isinstance(d, str):
                    # it is a named dest
                    if str(d) in self.get_named_dest_root():
                        outlist.append(ano.clone(self).indirect_reference)
                else:
                    # Explicit destination: first item is the target page.
                    d = cast("ArrayObject", d)
                    p = self._get_cloned_page(d[0], pages, reader)
                    if p is not None:
                        anc = ano.clone(self, ignore_fields=("/Dest",))
                        anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
                        outlist.append(self._add_object(anc))
        else:
            # /Link with a /GoTo action: the destination is in /A -> /D.
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if d is None or isinstance(d, NullObject):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
            else:
                d = cast("ArrayObject", d)
                p = self._get_cloned_page(d[0], pages, reader)
                if p is not None:
                    anc = ano.clone(self, ignore_fields=("/D",))
                    cast("DictionaryObject", anc["/A"])[
                        NameObject("/D")
                    ] = ArrayObject([p, *d[1:]])
                    outlist.append(self._add_object(anc))
    return outlist

3088

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is the outline root (``/Type`` ``/Outlines``) or has no
    ``/Title`` is only descended into through ``/First``. Otherwise the
    node and its ``/Next`` siblings are converted one by one; an item is
    kept when its page has a clone in ``pages`` or when at least one of
    its children was kept.

    Args:
        node: outline node of the source document (may be ``None``).
        pages: cloned pages, keyed by the object number of their source page.
        reader: the source document, used to build the outline items.

    Returns:
        A list of destination objects, each carrying its kept children
        in ``_filtered_children``.

    """
    kept = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # Container node: continue with its first child, if any.
        node = node.get("/First", None)
        if node is not None:
            node = node.get_object()
            kept.extend(self._get_filtered_outline(node, pages, reader))
        return kept

    cloned_ref: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        # Items whose page was not merged point to a null page.
        item[NameObject("/Page")] = NullObject() if cloned_ref is None else cloned_ref
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept

3140

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create a new outline item in this writer from a filtered destination.

    The title is always copied. When the destination has a page, the
    ``/A`` action of the source node is cloned if present, else the
    destination array is stored as ``/Dest``. The ``/F`` and ``/C``
    entries are taken from the source node, with ``0`` and three ``0.0``
    values as fallbacks.

    Args:
        dest: the destination to copy.

    Returns:
        The new outline item, already registered in this writer.

    """
    clone = TreeObject()
    self._add_object(clone)
    clone[NameObject("/Title")] = TextStringObject(dest["/Title"])

    points_to_page = not isinstance(dest["/Page"], NullObject)
    if points_to_page:
        if dest.node is None or "/A" not in dest.node:
            clone[NameObject("/Dest")] = dest.dest_array
        else:
            clone[NameObject("/A")] = dest.node["/A"].clone(self)
    # TODO: /SE
    if dest.node is None:
        return clone

    clone[NameObject("/F")] = NumberObject(dest.node.get("/F", 0))
    fallback_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
    clone[NameObject("/C")] = ArrayObject(dest.node.get("/C", fallback_color))
    return clone

3159

3160 def _insert_filtered_outline(

3161 self,

3162 outlines: list[Destination],

3163 parent: Union[TreeObject, IndirectObject],

3164 before: Union[None, TreeObject, IndirectObject] = None,

3165 ) -> None:

3166 for dest in outlines:

3167 # TODO: can be improved to keep A and SE entries (ignored for the moment)

3168 # with np=self.add_outline_item_destination(dest,parent,before)

3169 if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest:

3170 np = parent

3171 else:

3172 np = self._clone_outline(dest)

3173 cast(TreeObject, parent.get_object()).insert_child(np, before, self)

3174 self._insert_filtered_outline(dest._filtered_children, np, None)

3175

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3179

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Locate an outline item and return its position as a list of indexes.

    The search starts at ``root`` (the outline root of the writer when
    ``root`` is ``None``), follows the ``/Next`` links and descends into
    ``/First``. A node matches when its indirect reference or its
    ``/Title`` equals ``outline_item``.

    Args:
        outline_item: value compared with the reference and the title of
            each node.
        root: node to start from.

    Returns:
        The index of the item at each level, or ``None`` if not found.
        Nodes without ``/Title`` do not contribute an index.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        same_reference = node.indirect_reference == outline_item
        if same_reference or node.get("/Title", None) == outline_item:
            return [index]
        if "/First" in node:
            nested = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if nested:
                own_level = [index] if "/Title" in node else []
                return own_level + nested
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None

3208

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither ``None``, a ``PdfReader`` nor an
            ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        source = reader
    elif isinstance(reader, IndirectObject):
        source = reader.pdf
    else:
        # The message is an f-string so that the offending value is shown.
        raise Exception(f"invalid parameter {reader}")
    # A source without a table is not an error: nothing to reset.
    self._id_translated.pop(id(source), None)

3236

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the argument default) and ``None`` pass validation
               without a start value being required.

    Raises:
        ValueError: no style and no prefix given, or an index or ``start``
            is out of range.

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_from > page_index_to:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    page_count = len(self.pages)
    if page_index_to >= page_count:
        raise ValueError("page_index_to exceeds number of pages")
    start_given = start is not None and start != 0
    if start_given and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3287

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No argument validation is done here.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` (the argument default) or ``None`` no ``/St``
               entry is written.

    """
    # Label applied to the pages outside of any explicitly labelled range.
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is allowed by the signature and must not reach NumberObject.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # First label of the document: create the number tree with the
        # default label starting at page 0.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # Restore the default label right behind the range, unless a label
    # already starts there or the range reaches the last page.
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3352

3353 def _repr_mimebundle_(

3354 self,

3355 include: Union[None, Iterable[str]] = None,

3356 exclude: Union[None, Iterable[str]] = None,

3357 ) -> dict[str, Any]:

3358 """

3359 Integration into Jupyter Notebooks.

3360

3361 This method returns a dictionary that maps a mime-type to its

3362 representation.

3363

3364 .. seealso::

3365

3366 https://ipython.readthedocs.io/en/stable/config/integrating.html

3367 """

3368 pdf_data = BytesIO()

3369 self.write(pdf_data)

3370 data = {

3371 "application/pdf": pdf_data,

3372 }

3373

3374 if include is not None:

3375 # Filter representations based on include list

3376 data = {k: v for k, v in data.items() if k in include}

3377

3378 if exclude is not None:

3379 # Remove representations based on exclude list

3380 data = {k: v for k, v in data.items() if k not in exclude}

3381

3382 return data

3383

3384

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    Values that already are PDF objects are returned unchanged.
    Dictionaries and lists are converted recursively, strings starting
    with ``/`` become names and other strings text strings, and numbers
    (``float`` as well as ``int``) become ``FloatObject``.

    Raises:
        NotImplementedError: the type of ``obj`` is not supported.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        result = DictionaryObject()
        for name, item in obj.items():
            result[NameObject(name)] = _pdf_objectify(item)
        return result
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if not isinstance(obj, list):
        raise NotImplementedError(
            f"{type(obj)=} could not be cast to a PdfObject"
        )
    return ArrayObject(_pdf_objectify(element) for element in obj)

3404

3405

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: reference stored as ``/A``; omitted when ``None``.
        title: text stored as ``/Title``.
        color: stored as ``/C`` when truthy; a string is first converted
            with ``hex_to_rgb``.
        italic: adds the italic flag to ``/F``.
        bold: adds the bold flag to ``/F``.

    Returns:
        The new outline item (not yet attached to any document).

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        # /F is only written when at least one style flag is requested.
        flags = 0
        for wanted, flag in ((italic, OutlineFontFlag.italic), (bold, OutlineFontFlag.bold)):
            if wanted:
                flags += flag
        item.update({NameObject("/F"): NumberObject(flags)})
    return item

3435

3436

def generate_appearance_stream(
    txt: str,
    sel: list[str],
    da: str,
    font_full_rev: dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream that draws the text of a form field.

    The text is split into lines on ``\\r`` and ``\\n``; each line is
    shown with a ``Tj`` operator, 1.4 times ``font_height`` below the
    previous one. Lines listed in ``sel`` get a box drawn around them.

    Args:
        txt: the text to show.
        sel: the lines to mark as selected.
        da: default appearance string, inserted after ``BT`` and after
            each selection box.
        font_full_rev: maps each character to its encoded bytes;
            characters not in the map are encoded as UTF-16-BE.
        rct: rectangle of the field; its ``width`` and ``height`` are used.
        font_height: font size used for the line spacing and box height.
        y_offset: vertical position of the first line.

    Returns:
        The content stream as bytes.

    """
    chunks: list[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            chunks.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            chunks.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            chunks.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: list[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        encoded = b"".join(enc_line)
        if any(len(c) >= 2 for c in enc_line):
            # Multi-byte codes are written as a hexadecimal string.
            chunks.append(b"<" + encoded.hex().encode() + b"> Tj\n")
        else:
            # In a literal string the backslash and the parentheses are
            # syntax; they must be escaped (backslash first) or the
            # content stream cannot be parsed.
            escaped = (
                encoded.replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            chunks.append(b"(" + escaped + b") Tj\n")
    chunks.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(chunks)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 20%

1481 statements