Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import uuid

36from io import BytesIO, FileIO, IOBase

37from itertools import compress

38from pathlib import Path

39from types import TracebackType

40from typing import (

41 IO,

42 Any,

43 Callable,

44 Dict,

45 Iterable,

46 List,

47 Optional,

48 Pattern,

49 Tuple,

50 Type,

51 Union,

52 cast,

53)

55from ._cmap import _default_fonts_space_width, build_char_map_from_dict

56from ._doc_common import DocumentInformation, PdfDocCommon

57from ._encryption import EncryptAlgorithm, Encryption

58from ._page import PageObject, Transformation

59from ._page_labels import nums_clear_range, nums_insert, nums_next

60from ._reader import PdfReader

61from ._utils import (

62 StrByteType,

63 StreamType,

64 _get_max_pdf_version_header,

65 deprecate,

66 deprecate_no_replacement,

67 deprecation_with_replacement,

68 logger_warning,

69)

70from .constants import AnnotationDictionaryAttributes as AA

71from .constants import CatalogAttributes as CA

72from .constants import (

73 CatalogDictionary,

74 GoToActionArguments,

75 ImageType,

76 InteractiveFormDictEntries,

77 OutlineFontFlag,

78 PageLabelStyle,

79 TypFitArguments,

80 UserAccessPermissions,

81)

82from .constants import Core as CO

83from .constants import FieldDictionaryAttributes as FA

84from .constants import PageAttributes as PG

85from .constants import PagesAttributes as PA

86from .constants import TrailerKeys as TK

87from .errors import PyPdfError

88from .generic import (

89 PAGE_FIT,

90 ArrayObject,

91 BooleanObject,

92 ByteStringObject,

93 ContentStream,

94 DecodedStreamObject,

95 Destination,

96 DictionaryObject,

97 EmbeddedFile,

98 Fit,

99 FloatObject,

100 IndirectObject,

101 NameObject,

102 NullObject,

103 NumberObject,

104 PdfObject,

105 RectangleObject,

106 StreamObject,

107 TextStringObject,

108 TreeObject,

109 ViewerPreferences,

110 create_string_object,

111 hex_to_rgb,

112 is_null_or_none,

113)

114from .pagerange import PageRange, PageRangeSpec

115from .types import (

116 AnnotationSubtype,

117 BorderArrayType,

118 LayoutType,

119 OutlineItemType,

120 OutlineType,

121 PagemodeType,

122)

123from .xmp import XmpInformation

124

# Value of UserAccessPermissions.all(); presumably every permission flag
# combined - TODO confirm against UserAccessPermissions.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
# Font height applied to multiline text fields whose declared font size is 0.
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12

127

128

class ObjectDeletionFlag(enum.IntFlag):
    """Bit flags naming kinds of objects (text, links, images, ...)."""

    NONE = 0
    # One bit per kind so that members can be OR-ed together.
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite of the three image-related flags above.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

140

141

def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Compute the MD5 hex digest of a stream, reading it block by block.

    Args:
        stream: Binary stream; it is read from its current position until
            a read returns ``b""``.
        blocksize: Number of bytes requested per read.

    Returns:
        The hexadecimal MD5 digest of the bytes that were read.

    """
    digest = hashlib.md5()
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()

147

148

149class PdfWriter(PdfDocCommon):

150 """

151 Write a PDF file out, given pages produced by another class or through

152 cloning a PDF file during initialization.

153

154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

155

156 Args:

157 clone_from: identical to fileobj (for compatibility)

158

159 incremental: If true, loads the document and set the PdfWriter in incremental mode.

160

161 When writing incrementally, the original document is written first and new/modified

162 content is appended. To be used for signed document/forms to keep signature valid.

163

164 full: If true, loads all the objects (always full if incremental = True).

165 This parameter may allow loading large PDFs.

166

167 """

168

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Set up the writer state and optionally clone an existing document.

    Args:
        fileobj: Path, stream or PdfReader. When ``incremental`` or ``full``
            is set it is the document to load and must be a path, a BytesIO
            or a PdfReader. Otherwise it is cloned when it designates
            existing, non-empty content. It is also stored in
            ``temp_fileobj``.
        clone_from: Document to clone. It is used as-is when ``fileobj`` is a
            str, Path or BytesIO and either ``fileobj`` is ``""`` or
            ``clone_from`` is not None.
        incremental: Start the writer in incremental mode.
        full: Follow the same loading path as ``incremental``;
            ``self.incremental`` is reset to False at the end when
            ``incremental`` itself is False.

    Raises:
        PyPdfError: If ``incremental`` or ``full`` is set and ``fileobj`` is
            not a path, a BytesIO or a PdfReader.

    """
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Only declared here; it is assigned in the non-incremental branch below.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if self.incremental:
        # Normalise the input step by step: path -> BytesIO -> PdfReader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header and an /Info dictionary with /Producer.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which source (if any) the writer is cloned from.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            # No fileobj given, or an explicit clone_from: keep clone_from.
            return clone_from
        cloning = True
        # A path that does not exist or is empty has nothing to clone.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        if isinstance(fileobj, (IOBase, BytesIO)):
            # Seek to the end to detect an empty stream, then restore the
            # original position.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # Nothing to clone: register the empty page tree and a catalog
        # pointing at it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # ``full`` alone only borrowed the incremental loading path above.
    if full and not incremental:
        self.incremental = False
    # Replace text-string entries of the two-element ID by byte strings
    # built from their original bytes.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

293

294 # for commonality

@property
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    This implementation unconditionally reports ``False``.
    """
    encrypted = False
    return encrypted

304

@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the writer's root dictionary.

    Note:
        Meant for read access; changes made through it act on the
        writer's own structure.

    """
    root = self._root_object
    return root

315

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Return the "/Info" dictionary, mirroring the PdfReader accessor.

    Returns:
        The resolved /Info dictionary, or None when no such entry is set.

    """
    if self._info_obj is None:
        return None
    info_dict = self._info_obj.get_object()
    return cast(DictionaryObject, info_dict)

330

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace or remove the "/Info" dictionary.

    Args:
        value: New content for /Info. ``None`` clears the slot held by the
            current /Info object and forgets it; any other value has its
            resolved entries copied into the (possibly newly created)
            /Info dictionary.

    """
    if value is not None:
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        target = cast(DictionaryObject, self._info_obj.get_object())
        # Copy the content instead of rebinding so the object slot is reused.
        target.clear()
        target.update(cast(DictionaryObject, value.get_object()))
        return

    try:
        self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
    except (KeyError, AttributeError):
        # Best effort: there may be no registered /Info object to clear.
        pass
    self._info_obj = None

345

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data taken from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

350

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Set or remove the XMP (Extensible Metadata Platform) data.

    Args:
        value: Metadata stored under "/Metadata" in the root object;
            ``None`` deletes an existing "/Metadata" entry.

    """
    if value is not None:
        self.root_object[NameObject("/Metadata")] = value
    elif "/Metadata" in self.root_object:
        del self.root_object["/Metadata"]

    # The value returned from a property setter is discarded by Python.
    return self.root_object.xmp_metadata  # type: ignore

361

@property
def with_as_usage(self) -> bool:
    """
    Return the stored ``_with_as_usage`` flag.

    ``deprecate_no_replacement("with_as_usage", "6.0")`` is called on
    every access.
    """
    deprecate_no_replacement("with_as_usage", "6.0")
    flag = self._with_as_usage
    return flag

366

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """
    Store ``value`` as the ``_with_as_usage`` flag.

    ``deprecate_no_replacement("with_as_usage", "6.0")`` is called first.
    """
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value

371

def __enter__(self) -> "PdfWriter":
    """
    Enter the ``with`` block and remember that the writer is used this way.

    The writer is re-initialized with default arguments; the cloned flag
    is preserved and the file object given at construction becomes the
    output target.
    """
    was_cloned: bool
    was_cloned, target = self._cloned, self.temp_fileobj
    self.__init__()  # type: ignore
    self._cloned = was_cloned
    self._with_as_usage = True
    self.fileobj = target  # type: ignore
    return self

381

def __exit__(
    self,
    exc_type: Optional[Type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Leave the ``with`` block, writing the document to ``fileobj``.

    Nothing is written when no file object is set or when the writer was
    cloned from an existing document. The exception arguments are not
    inspected.
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)

391

@property
def pdf_header(self) -> str:
    """
    PDF header that will be written (read/write property).

    The value should look like ``'%PDF-1.5'``. Prefer the lowest version
    supporting every feature used in the PDF file.

    Note: reading yields a string, while writing accepts bytes or str.
    """
    raw_header = self._header
    return raw_header.decode()

404

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """
    Store the PDF header as bytes.

    Args:
        new_header: Header value; a ``str`` is encoded before being stored.

    """
    self._header = (
        new_header.encode() if isinstance(new_header, str) else new_header
    )

410

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Args:
        obj: Object to register.

    Returns:
        The indirect reference of ``obj``. The existing reference is
        returned unchanged when the object already belongs to this writer.

    """
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and obj.indirect_reference.pdf == self:  # type: ignore
        return obj.indirect_reference  # type: ignore

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        raw_contents = obj.get(PG.CONTENTS, None)
        if isinstance(raw_contents, (ArrayObject, DictionaryObject)):
            # A directly embedded /Contents is registered as well and
            # replaced by its reference.
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference

425

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Fetch an object of this writer by number or by indirect reference.

    Args:
        indirect_reference: 1-based object number, or an IndirectObject
            belonging to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the reference belongs to another PDF.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj

438

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` in the slot of an existing indirect object.

    Args:
        indirect_reference: 1-based object number, or an IndirectObject
            belonging to this writer, naming the slot to overwrite.
        obj: Replacement object; it is cloned into this writer first when
            it is attached to another PDF.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: If the reference belongs to another PDF.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    # The replacement keeps the generation number of the object it replaces.
    gen = self._objects[slot].indirect_reference.generation  # type: ignore

    foreign = (
        getattr(obj, "indirect_reference", None) is not None
        and obj.indirect_reference.pdf != self  # type: ignore
    )
    if foreign:
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

459

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link it into the page tree.

    Args:
        page: Page to add; must be a PageObject whose type entry is the
            page type.
        index: Position handed to ``_get_page_in_node`` and used when
            inserting into ``flattened_pages``.
        excluded_keys: Keys passed to ``clone`` as exclusions; the parent
            key and "/StructParents" are always added to them.

    Returns:
        The cloned page that was linked into this writer.

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If more than 1000 nodes are visited while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    page_org = page
    excluded_keys = list(excluded_keys)
    excluded_keys += [PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore
            page_org.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        # Best effort: any failure to remove the entry is ignored.
        pass
    page = cast(
        "PageObject", page_org.clone(self, False, excluded_keys).get_object()
    )
    if page_org.pdf is not None:
        # Update our header from the source document's header via
        # _get_max_pdf_version_header.
        other = page_org.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    # A negative idx is handled as "append to this node".
    if idx >= 0:
        cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference)
        self.flattened_pages.append(page)
    # Walk up through the parent links, incrementing each node's count.
    recurse = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        recurse += 1
        # Guard against cyclic or excessively deep parent chains.
        if recurse > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page

506

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag of the interactive form dictionary.

    The flag tells whether the PDF viewer should generate the appearance
    of form fields itself or use the embedded appearance.

    An AcroForm dictionary is created in the root object when missing.
    Any exception raised while doing so is reported through
    ``logger_warning`` instead of being propagated.

    Args:
        state: The value to store for the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        acro_form_key = CatalogDictionary.ACRO_FORM
        # Make sure the AcroForm tree exists before touching it.
        if acro_form_key not in self._root_object:
            new_form = self._add_object(DictionaryObject())
            self._root_object[NameObject(acro_form_key)] = new_form

        flag_name = NameObject(InteractiveFormDictEntries.NeedAppearances)
        flag_value = BooleanObject(state)
        acro_form = cast(DictionaryObject, self._root_object[acro_form_key])
        acro_form[flag_name] = flag_value
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

539

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ViewerPreferences object and attach it to the root object.

    Returns:
        The newly created ViewerPreferences object.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[
        NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    ] = preferences_ref
    return preferences

546

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page usually comes from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys forwarded to ``_add_page`` as exclusions.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    end_index = len(self.flattened_pages)
    return self._add_page(page, end_index, excluded_keys)

571

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page at a given position of this PDF file.

    The page usually comes from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or beyond the number of
            pages appends the page.
        excluded_keys: Keys forwarded as exclusions when adding the page.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative index reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
        if index < 0:
            raise ValueError("Invalid index value")
    if index >= page_count:
        return self.add_page(page, excluded_keys)
    return self._add_page(page, index, excluded_keys)

599

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Find the page number of the page an indirect reference points to.

    Offers the same function as the one in PdfReader.

    Args:
        indirect_reference: Object number, indirect reference, or a
            null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a page.

    """
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    # A bare object number is wrapped into a reference to this writer.
    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

623

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    blank = PageObject.create_blank_page(self, width, height)
    return self.add_page(blank)

648

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page exists at ``index``, both
    dimensions are taken from that page's mediabox. Otherwise the given
    values are passed unchanged to ``PageObject.create_blank_page``.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Both "a dimension is missing" checks must be guarded by the index
    # test. Without the parentheses, ``and`` binds tighter than ``or``, so
    # a missing width indexed ``self.pages`` even when ``index`` was out of
    # range, raising IndexError instead of the documented error.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    # NOTE(review): this returns the page handed to insert_page rather than
    # insert_page's return value - confirm this is intended.
    return page

682

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the open destination as provided by the parent class."""
    destination = super().open_destination
    return destination

688

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the root object.

    Args:
        dest: ``None`` removes the entry. A ``str`` is stored as a text
            string, a Destination as its destination array, and a
            PageObject as a page-fit destination on that page. Any other
            value leaves the root object untouched.

    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            # No entry to remove.
            pass
        return

    if isinstance(dest, str):
        action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        action = dest.dest_array
    elif isinstance(dest, PageObject):
        # A page without indirect reference is represented by a null object.
        page_ref = (
            dest.indirect_reference
            if dest.indirect_reference is not None
            else NullObject()
        )
        action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        return
    self._root_object[NameObject("/OpenAction")] = action

708

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is registered as a "/JavaScript" action in the
    "/JavaScript" names list of the root object, under a random UUID
    name, so several scripts can coexist.

    Args:
        javascript: Your JavaScript.

    Example, launching the print window when the PDF is opened::

        output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    """
    # Names / JavaScript preferred to be able to add multiple scripts
    root = self._root_object
    if "/Names" not in root:
        root[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, root[CA.NAMES])

    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, js_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    js_list.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    js_list.append(self._add_object(action))

743

def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file.

    Returns:
        The EmbeddedFile instance created for the new embedded file.

    """
    embedded = EmbeddedFile._create_new(self, filename, data)
    return embedded

761

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Append every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader object whose pages are copied.
        after_page_append:
            Optional callback invoked after each page is appended. Its
            single argument is the page just appended to the writer.

    """
    page_total = len(reader.pages)
    for page_index in range(page_total):
        appended = self.add_page(reader.pages[page_index])
        # Hand the writer's copy of the page (not the reader's) to the callback.
        if callable(after_page_append):
            after_page_append(appended)

793

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) to the content stream(s) of a page.

    The page's "/Contents" entry is updated in place; nothing is returned.

    * No "/Contents": a new stream holding the data is stored.
    * "/Contents" is an array: a new stream is appended to the array.
    * "/Contents" is a stream: a new stream made of the existing data, a
      newline and the new data replaces it.
    * Any other "/Contents" value is left unchanged.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """

    def _stream_with(payload: bytes) -> StreamObject:
        # Build a StreamObject carrying ``payload``.
        stream = StreamObject()
        stream.set_data(payload)
        return stream

    contents_key = NameObject("/Contents")
    if contents_key not in page:
        # If no existing content, then we have an empty page.
        page[NameObject("/Contents")] = self._add_object(
            _stream_with(new_content_data)
        )
        return

    # Resolve the existing page content before inspecting its type.
    current = page[contents_key].get_object()
    if isinstance(current, ArrayObject):
        current.append(self._add_object(_stream_with(new_content_data)))
        page[NameObject("/Contents")] = self._add_object(current)
    elif isinstance(current, StreamObject):
        combined = current.get_data() + b"\n" + new_content_data
        page[NameObject("/Contents")] = self._add_object(_stream_with(combined))

833

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Draw an appearance stream on a page as an XObject.

    The stream is registered in the page's "/XObject" resources under
    the sanitized name ``/Fm_<object_name>``, and drawing commands
    translating by the given offsets are merged into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    # Register the font on the page, unless it is already listed there.
    if font_res is not None:
        base_font = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if base_font not in page_fonts:
            page_fonts[NameObject(base_font)] = font_res

    # Always add the resolved stream object to the writer to get a new IndirectObject.
    # This ensures we have a valid IndirectObject managed by *this* writer.
    stream_ref = self._add_object(appearance_stream_obj)
    form_name = NameObject(f"/Fm_{object_name}")._sanitize()

    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if form_name in page_xobjects:
        logger_warning(
            f"XObject {form_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[form_name] = stream_ref

    # Wrap the drawing in q/Q, with the translation applied before "Do".
    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{form_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

881

882 def _update_field_annotation(

883 self,

884 page: PageObject,

885 field: DictionaryObject,

886 annotation: DictionaryObject,

887 font_name: str = "",

888 font_size: float = -1,

889 flatten: bool = False,

890 ) -> None:

891 # Calculate rectangle dimensions

892 _rct = cast(RectangleObject, annotation[AA.Rect])

893 rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

894

895 # Extract font information

896 da = annotation.get_inherited(

897 AA.DA,

898 cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(

899 AA.DA, None

900 ),

901 )

902 if da is None:

903 da = TextStringObject("/Helv 0 Tf 0 g")

904 else:

905 da = da.get_object()

906 font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")

907 font_properties = [x for x in font_properties if x != ""]

908 if font_name:

909 font_properties[font_properties.index("Tf") - 2] = font_name

910 else:

911 font_name = font_properties[font_properties.index("Tf") - 2]

912 font_height = (

913 font_size

914 if font_size >= 0

915 else float(font_properties[font_properties.index("Tf") - 1])

916 )

917 if font_height == 0:

918 if field.get(FA.Ff, 0) & FA.FfBits.Multiline:

919 font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE

920 else:

921 font_height = rct.height - 2

922 font_properties[font_properties.index("Tf") - 1] = str(font_height)

923 da = " ".join(font_properties)

924 y_offset = rct.height - 1 - font_height

925

926 # Retrieve font information from local DR ...

927 dr: Any = cast(

928 DictionaryObject,

929 cast(

930 DictionaryObject,

931 annotation.get_inherited(

932 "/DR",

933 cast(

934 DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]

935 ).get("/DR", DictionaryObject()),

936 ),

937 ).get_object(),

938 )

939 dr = dr.get("/Font", DictionaryObject()).get_object()

940 # _default_fonts_space_width keys is the list of Standard fonts

941 if font_name not in dr and font_name not in _default_fonts_space_width:

942 # ...or AcroForm dictionary

943 dr = cast(

944 Dict[Any, Any],

945 cast(

946 DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]

947 ).get("/DR", {}),

948 )

949 dr = dr.get_object().get("/Font", DictionaryObject()).get_object()

950 font_res = dr.get(font_name, None)

951 if not is_null_or_none(font_res):

952 font_res = cast(DictionaryObject, font_res.get_object())

953 font_subtype, _, font_encoding, font_map = build_char_map_from_dict(

954 200, font_res

955 )

956 try: # remove width stored in -1 key

957 del font_map[-1]

958 except KeyError:

959 pass

960 font_full_rev: Dict[str, bytes]

961 if isinstance(font_encoding, str):

962 font_full_rev = {

963 v: k.encode(font_encoding) for k, v in font_map.items()

964 }

965 else:

966 font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}

967 font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}

968 for key, value in font_map.items():

969 font_full_rev[value] = font_encoding_rev.get(key, key)

970 else:

971 logger_warning(f"Font dictionary for {font_name} not found.", __name__)

972 font_full_rev = {}

973

974 # Retrieve field text and selected values

975 field_flags = field.get(FA.Ff, 0)

976 if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:

977 txt = "\n".join(annotation.get_inherited(FA.Opt, []))

978 sel = field.get("/V", [])

979 if not isinstance(sel, list):

980 sel = [sel]

981 else: # /Tx

982 txt = field.get("/V", "")

983 sel = []

984 # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)

985 txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

986 # Generate appearance stream

987 ap_stream = generate_appearance_stream(

988 txt, sel, da, font_full_rev, rct, font_height, y_offset

989 )

990

991 # Create appearance dictionary

992 dct = DecodedStreamObject.initialize_from_dictionary(

993 {

994 NameObject("/Type"): NameObject("/XObject"),

995 NameObject("/Subtype"): NameObject("/Form"),

996 NameObject("/BBox"): rct,

997 "__streamdata__": ByteStringObject(ap_stream),

998 "/Length": 0,

999 }

1000 )

1001 if AA.AP in annotation:

1002 for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():

1003 if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:

1004 dct[k] = v

1005

1006 # Update Resources with font information if necessary

1007 if font_res is not None:

1008 dct[NameObject("/Resources")] = DictionaryObject(

1009 {

1010 NameObject("/Font"): DictionaryObject(

1011 {

1012 NameObject(font_name): getattr(

1013 font_res, "indirect_reference", font_res

1014 )

1015 }

1016 )

1017 }

1018 )

1019 if AA.AP not in annotation:

1020 annotation[NameObject(AA.AP)] = DictionaryObject(

1021 {NameObject("/N"): self._add_object(dct)}

1022 )

1023 elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):

1024 cast(DictionaryObject, annotation[NameObject(AA.AP)])[

1025 NameObject("/N")

1026 ] = self._add_object(dct)

1027 else: # [/AP][/N] exists

1028 n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore

1029 self._objects[n - 1] = dct

1030 dct.indirect_reference = IndirectObject(n, 0, self)

1031

1032 if flatten:

1033 field_name = self._get_qualified_field_name(annotation)

1034 self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)

1035

# Default for ``flags``: an empty flag set, meaning "leave /Ff untouched".
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Write the given values into the form fields found on a page.

    Each widget annotation of the page is matched against ``fields`` by its
    qualified field name or by its ``/T`` entry. When the widget is only a
    kid of the field, the value is stored on the parent field.

    Args:
        page: `PageObject` - the **writer's page** whose annotations and
            field data are updated.
            `List[PageObject]` - several pages to process.
            `None` - every page of the writer.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.
            Written to the widget's /Ff entry when non-empty.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the
            annotation's appearance stream to the page contents. Note that this
            option does not remove the annotation itself.

    Raises:
        PyPdfError: the document has no /AcroForm, or the /AcroForm has no
            /Fields entry.

    """
    root = self._root_object
    if CatalogDictionary.ACRO_FORM not in root:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, root[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)

    # Fan out over several pages; need_appearances was handled above, so the
    # recursive calls pass None for auto_regenerate.
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for single_page in page:
            if PG.ANNOTS in single_page:  # skip silently: avoids the warning below
                self.update_page_form_field_values(
                    single_page, fields, flags, None, flatten=flatten
                )
        return

    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for raw_annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, raw_annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue

        # A widget carrying both /FT and /T is its own field; otherwise the
        # field data is looked up on /Parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            is_target = (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            )
            if not is_target:
                continue

            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)

            # Only change values if given by user and not flattening.
            keep_existing_value = value is None and flatten
            if not keep_existing_value:
                if isinstance(value, list):
                    choices = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = choices
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(value[0])
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)

            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                state = NameObject(value)
                appearances = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, appearances["/N"])
                if state not in normal_ap:
                    state = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(state)
                # other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = state
                annotation[NameObject(FA.V)] = state
                if flatten and appearance_stream_obj is not None:
                    # The appearance stream is copied as-is (expected to be
                    # an already registered XObject), so no font resources
                    # are passed along.
                    rct = cast(RectangleObject, annotation[AA.Rect])
                    self._add_apstream_object(
                        page, appearance_stream_obj, field, rct[0], rct[1]
                    )
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Text box or choice field: rebuild the appearance.
                if isinstance(value, tuple):
                    self._update_field_annotation(
                        page,
                        parent_annotation,
                        annotation,
                        value[1],
                        value[2],
                        flatten=flatten,
                    )
                else:
                    self._update_field_annotation(
                        page, parent_annotation, annotation, flatten=flatten
                    )
            elif annotation.get(FA.FT) == "/Sig":  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1159

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Look for orphan form fields among a page's annotations and reattach
    them to the /AcroForm /Fields array.

    The /AcroForm dictionary and its /Fields array are created when missing.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached.extend(self.reattach_fields(single_page))
        return reattached

    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached

    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        annotation = cast(DictionaryObject, entry.get_object())
        if annotation.get("/Subtype", "") != "/Widget" or "/FT" not in annotation:
            continue
        already_listed = (
            "indirect_reference" in annotation.__dict__
            and annotation.indirect_reference in fields
        )
        if already_listed:
            continue
        if not was_indirect:
            # Direct annotation: register it so that it can be referenced.
            annotations[position] = self._add_object(annotation)
        fields.append(annotation.indirect_reference)
        reattached.append(annotation)
    return reattached

1209

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Clone the reader's document root, with everything reachable from it
    (pages, threads, outlines, ...), into this writer.

    For partial insertion, ``append`` should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if not self.incremental:
        self._objects.clear()
    else:
        # Incremental mode keeps the reader's object numbering: one slot per
        # object number below /Size, filled with a replica when it exists.
        slot_count = cast(int, reader.trailer["/Size"]) - 1
        self._objects = [None] * slot_count
        for slot in range(len(self._objects)):
            source = reader.get_object(slot + 1)
            if source is not None:
                self._objects[slot] = source.replicate(self)

    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if obj is None else obj.hash_bin() for obj in self._objects
        ]

    self._flatten()
    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_idnum = cast(IndirectObject, flat_page.indirect_reference).idnum
        self._replace_object(page_idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Rebuild the page tree as a single flat /Kids array.
        kids = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )
        cast(DictionaryObject, self._pages.get_object())[NameObject("/Kids")] = kids

1248

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: its '/Root', '/Info' and '/ID'.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once for each page of the cloned page tree.
            Its single parameter is a reference to that page.

    """
    self.clone_reader_document_root(reader)

    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info as its original state.
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_node = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_node["/Kids"]):
            after_page_append(kid.get_object())

1296

def _compute_document_identifier(self) -> ByteStringObject:
    """Return a checksum of the serialized PDF structure as a byte string."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1302

def generate_file_identifiers(self) -> None:
    """
    Generate the pair of identifiers (``/ID``) for the PDF to be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same
    value; when an identifier already exists, the first one is kept and only
    the second one is recomputed.
    If both identifiers match when a file reference is resolved, it is very
    likely that the correct and unchanged file has been found. If only the
    first identifier matches, a different version of the correct file has
    been found.
    see §14.4 "File Identifiers".
    """
    existing = self._ID
    if existing:
        identifiers = (existing[0], self._compute_document_identifier())
    else:
        fresh = self._compute_document_identifier()
        identifiers = (fresh, fresh)
    self._ID = ArrayObject(identifiers)

1322

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: ``algorithm`` does not name a supported algorithm.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        try:
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    new_entry = self._encryption.write_entry(user_password, owner_password)

    previous_entry = self._encrypt_entry
    if previous_entry:
        # `encrypt` was called before: reuse the object slot of the old entry.
        assert previous_entry.indirect_reference is not None
        new_entry.indirect_reference = previous_entry.indirect_reference
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry

1381

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document into an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when at least one object is new or modified.
    Otherwise the full structure, cross-reference table and trailer are
    written.

    Args:
        stream: destination stream; a warning is logged when it exposes a
            ``mode`` that is not binary.

    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    # The former `_root` / `_sweep_indirect_references` step is deprecated
    # (to be removed in pypdf 6.0.0) and intentionally not performed here.

    if not self.incremental:
        positions, free_ids = self._write_pdf_structure(stream)
        xref_offset = self._write_xref_table(stream, positions, free_ids)
        self._write_trailer(stream, xref_offset)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1405

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened (and closed) by this
        method, and the stream that was written to.

    Raises:
        ValueError: ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    my_file = isinstance(stream, (str, Path))
    if my_file:
        stream = FileIO(stream, "wb")

    try:
        self.write_stream(stream)
        if not my_file:
            # Caller-owned stream: make the data visible, leave it open.
            stream.flush()
    finally:
        if my_file:
            # A file opened here must be closed even when writing fails,
            # otherwise the handle is leaked.
            stream.close()

    return my_file, stream

1437

def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.

    List the new or modified objects that will be written in the increment:
    objects without a recorded original hash, or whose current hash differs
    from the recorded one.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    known_count = len(self._original_hash)
    changed = []
    for slot, obj in enumerate(self._objects):
        if obj is None:
            continue
        if slot >= known_count or obj.hash_bin() != self._original_hash[slot]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1461

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an incremental update to ``stream``.

    Every new or modified object is written, followed by a cross-reference
    stream covering those objects, ``startxref`` and the end-of-file marker.

    Args:
        stream: destination stream, positioned after the original content.

    """
    positions = {}
    blocks = []  # [first object number, count] of each run of consecutive ids
    run_start = -1
    run_stop = -2
    known_count = len(self._original_hash)

    for slot, obj in enumerate(self._objects):
        if obj is None:
            continue
        if slot < known_count and obj.hash_bin() == self._original_hash[slot]:
            continue  # unchanged since loading

        idnum = slot + 1
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        positions[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # encryption is not operational:
        # if self._encryption and obj != self._encrypt_entry:
        #     obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: close the current run when the ids stop being consecutive
        if idnum != run_stop:
            if run_start > 0:
                blocks.append([run_start, run_stop - run_start])
            run_start = idnum
        run_stop = idnum + 1

    assert run_start > 0, "for pytest only"
    blocks.append([run_start, run_stop - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())

    index_entries = []
    for block in blocks:
        index_entries.extend(NumberObject(number) for number in block)
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(index_entries),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }

    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is listed only when it is new or was modified.
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID

    xr = StreamObject.initialize_from_dictionary(init_data)
    # One (type=1, offset, generation=0) record per written object.
    xr.set_data(
        b"".join(struct.pack(b">BIB", 1, offset, 0) for offset in positions.values())
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1532

def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
    """
    Write the PDF header and every object of the document to ``stream``.

    Args:
        stream: destination stream.

    Returns:
        A tuple ``(object_positions, free_objects)``: the offset of each
        object (``-1`` for an empty slot) and the object numbers of the empty
        slots, terminated by ``0``.

    """
    positions = []
    free_ids = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            positions.append(-1)
            free_ids.append(idnum)
            continue
        positions.append(stream.tell())
        stream.write(f"{idnum} 0 obj\n".encode())
        # The encryption dictionary itself is never encrypted.
        if self._encryption and obj != self._encrypt_entry:
            obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

    free_ids.append(0)  # add 0 to loop in accordance with specification
    return positions, free_ids

1552

def _write_xref_table(
    self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
    """
    Write the classic cross-reference table.

    Args:
        stream: destination stream.
        object_positions: offset of each object; a non-positive offset marks
            a free entry.
        free_objects: object numbers of the free entries, consumed in order
            to chain each free entry to the next one.

    Returns:
        The stream offset at which the table starts.

    """
    table_start = stream.tell()
    entry_count = len(self._objects) + 1
    stream.write(b"xref\n")
    stream.write(f"0 {entry_count}\n".encode())
    # Object 0 heads the free list.
    stream.write(f"{free_objects[0]:0>10} 65535 f \n".encode())

    next_free = 1
    for offset in object_positions:
        if offset > 0:
            line = f"{offset:0>10} 00000 n \n"
        else:
            line = f"{free_objects[next_free]:0>10} 00001 f \n"
            next_free += 1
        stream.write(line.encode())
    return table_start

1568

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer, ``startxref`` and the end-of-file marker.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: destination stream.
        xref_location: offset of the cross-reference table in the stream.

    """
    stream.write(b"trailer\n")
    entries = {
        NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(TK.ROOT): self.root_object.indirect_reference,
    }
    trailer = DictionaryObject(entries)

    # Optional entries are added only when the document has them.
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference

    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1592

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    When setting, the value is a dict with the entries to be set; ``None``
    removes the /Info entry from the pdf. Any existing entries are cleared
    before the new ones are added.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    if self._info is not None:
        # Replace, do not merge: drop the previous entries first.
        self._info.clear()
    self.add_metadata(value)

1620

def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Values are converted with ``str()`` and stored as PDF strings; the /Info
    dictionary is created when the document has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for key, raw in list(infos.items()):
        plain = raw.get_object() if isinstance(raw, PdfObject) else raw
        converted[NameObject(key)] = create_string_object(str(plain))

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1640

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """
    # orphans[i] stays True while no reference to object number i + 1 is seen.
    orphans = [True] * len(self._objects)

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        """Redirect references to merged objects and record what is referenced."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                if v in crossref:
                    v = crossref[v]
                    obj[k] = v
                # Mark the object that is referenced *after* redirection:
                # the surviving copy must not be swept as an orphan when all
                # references originally pointed at its duplicates.
                orphans[v.idnum - 1] = False
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for first, duplicates in self._idnum_hash.values():
        for duplicate in duplicates:
            cnv_rev[duplicate] = first

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    if not remove_orphans:
        return

    # Objects referenced only from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    info = self._info
    if info is not None and getattr(info, "indirect_reference", None) is not None:
        # A document without /Info has nothing to protect here.
        orphans[info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        # /Encrypt is referenced from the trailer only.
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1715

def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Removed helper, kept only to emit a deprecation notice.

    It used to resolve circular references to Page objects (such as
    annotations referring back to their page) so that a written file did not
    contain several copies of the same Page object. The implementation has
    been removed; calling this method only reports the deprecation.

    Args:
        root: The root of the PDF object tree to sweep (ignored).

    """
    message = "_sweep_indirect_references has been removed, please report to dev team if this warning is observed"
    deprecate(message)

1750

def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Removed helper, kept only to emit a deprecation notice.

    It used to map an indirect object, possibly owned by another PDF, to an
    indirect object of this PDF file. The implementation has been removed.

    Args:
        data: The `IndirectObject` to resolve (ignored).

    Returns:
        A placeholder ``IndirectObject(0, 0, self)``, returned after the
        deprecation has been reported.

    """
    message = "_resolve_indirect_object has been removed, please report to dev team if this warning is observed"
    deprecate(message)
    return IndirectObject(0, 0, self)

1783

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return the indirect reference of an object stored in this writer.

    Args:
        obj: object to look up in the writer's object list.

    Returns:
        The reference (generation 0) built from the object's position.

    Raises:
        ValueError: the object is not in the writer's object list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1789

def get_outline_root(self) -> TreeObject:
    """
    Return the root of the outline tree, creating it when missing.

    An existing /Outlines entry that is not a ``TreeObject`` is converted
    into one, which replaces the stored object.

    Returns:
        The outline root.

    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    # Sanity check: the outline must be one of the writer's objects.
    outline_idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(outline_idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1808

def get_threads_root(self) -> ArrayObject:
    """
    The list of threads; an empty list is stored in the catalog when the
    document has none yet.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = threads
        return threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1827

@property
def threads(self) -> ArrayObject:
    """
    Read-only view of the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary with an ``/F`` key; the ``/I`` and
    ``/Metadata`` keys may carry additional information about the thread.
    This simply delegates to :meth:`get_threads_root`.
    """
    threads_root = self.get_threads_root()
    return threads_root

1839

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    Args:
        page_destination: The outline item to insert. When it resolves to
            a ``PageObject``, a ``Destination`` titled ``page #<n>`` that
            points to this page is created and inserted instead.
        parent: Outline item under which the new item is inserted.
            Defaults to the outline root.
        before: Sibling in front of which the new item is inserted.
            ``None`` is handed to ``insert_child`` unchanged.
        is_open: Stored on the item under ``/%is_open%``; when ``False``
            a no-op counter callback is handed to ``insert_child``.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward parent/before/is_open: they used to be dropped here, so a
        # page passed with an explicit parent always ended up at the root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1877

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then handed to :meth:`add_outline_item_destination` together
    with ``parent``, ``before`` and ``is_open``.

    Returns:
        The indirect reference returned by
        :meth:`add_outline_item_destination`.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # Disabled code, kept for reference (was marked "currently unreachable"):
    #   if "/A" in outline_item:
    #       action = DictionaryObject()
    #       a_dict = cast(DictionaryObject, outline_item["/A"])
    #       for k, v in list(a_dict.items()):
    #           action[NameObject(str(k))] = v
    #       action_ref = self._add_object(action)
    #       outline_item_object[NameObject("/A")] = action_ref
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1900

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject`` or an ``IndirectObject``. With ``None`` no
            GoTo action is created.
        parent: A reference to a parent outline item to create nested
            outline items. Defaults to the outline root.
        before: Passed through to :meth:`add_outline_item_destination`.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Passed through to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional layout: shift the arguments and call again.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown page index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )
        goto_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): dest.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(goto_action)

    item_object = _create_outline_item(action_ref, title, color, italic, bold)
    outline_item = self._add_object(item_object)

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1978

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always; the message points to
            :meth:`add_outline_item`.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1983

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register ``destination`` under ``title`` in the named-destination array.

    The array returned by ``get_named_dest_root()`` is walked in steps of
    two (name, destination pairs). The new pair is inserted in front of the
    first name that compares greater than ``title``; if there is none it is
    appended at the end.

    Args:
        title: Name of the destination.
        destination: The destination (array or reference) to register.

    """
    named_dest = self.get_named_dest_root()
    for position in range(0, len(named_dest), 2):
        if title < named_dest[position]:
            # Insert the value first, then the name at the same index, so
            # that the pair ends up as (name, destination).
            named_dest.insert(position, destination)
            named_dest.insert(position, TextStringObject(title))
            return
    named_dest.extend([TextStringObject(title), destination])

1997

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    The object's ``dest_array`` is added to the writer and registered
    under the object's ``/Title`` entry.

    Args:
        page_destination: Object providing ``dest_array`` and ``/Title``.

    Returns:
        The indirect reference of the stored destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_array_ref)
    return dest_array_ref

2008

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named GoTo destination pointing at a page.

    Args:
        title: Name of the destination; converted to a
            ``TextStringObject`` when it is not one already.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The indirect reference of the created action dictionary.

    """
    # NOTE(review): only the direct kids of the page tree root are indexed.
    page_ref = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    dest = DictionaryObject(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    dest_ref = self._add_object(dest)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, dest_ref)
    return dest_ref

2031

def remove_links(self) -> None:
    """Remove links and annotations from this output."""
    # Despite the name, every annotation is removed (ALL_ANNOTATIONS flag).
    for current_page in self.pages:
        self.remove_objects_from_page(
            current_page, ObjectDeletionFlag.ALL_ANNOTATIONS
        )

2036

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations by annotation subtype.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    if subtypes is not None:
        if isinstance(subtypes, str):
            # A bare string would turn the later ``in`` test into a
            # substring match; wrap it so that only exact names match.
            subtypes = (subtypes,)
        else:
            # Materialise once: a one-shot iterator would be exhausted
            # after the first membership test.
            subtypes = tuple(subtypes)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)

2052

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations of the given subtypes from one page.

    Args:
        page: The page (or a reference to it) to clean up.
        subtypes: Subtypes to delete; ``None`` deletes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    index = 0
    while index < len(cast(ArrayObject, page[PG.ANNOTS])):
        annotation = cast(ArrayObject, page[PG.ANNOTS])[index]
        annotation_dict = cast(DictionaryObject, annotation.get_object())
        keep = (
            subtypes is not None
            and cast(str, annotation_dict["/Subtype"]) not in subtypes
        )
        if keep:
            index += 1
            continue
        if isinstance(annotation, IndirectObject):
            self._objects[annotation.idnum - 1] = NullObject()  # to reduce PDF size
        # The array shrinks, so the index stays where it is.
        del page[PG.ANNOTS][index]  # type:ignore

2070

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward text_filters: they used to be dropped here, so a list
            # containing TEXT removed all text regardless of the filter.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation-related flags are handled by the annotation helper; the
    # first matching flag wins and the method returns.
    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators that are dropped unconditionally.
    # NOTE(review): when both DRAWING_IMAGES and TEXT are set, the text
    # operators replace the drawing operators instead of extending them.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        """Drop the matching operations from ``content`` in place."""
        nonlocal jump_operators, to_delete

        font_id = None  # resource name set by the most recent ``Tf`` operator
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # With a text filter, only text drawn in a listed font goes.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        """Clean the XObjects of ``elt`` recursively; return (images, forms)."""
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                # Best effort: entries without a usable /Subtype are skipped.
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2243

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` is treated like
            ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits to the equally named deletion flags.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags |= ObjectDeletionFlag[flag_name]

    for page in self.pages:
        self.remove_objects_from_page(page, deletion_flags)

2267

def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    if not font_names:
        font_names = []

    # Recursively walk the given object to gather font info: maps each
    # normalized font name to the resource keys it was found under.
    def collect_fonts(
        node: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(node, IndirectObject):
            node = node.get_object()
        if isinstance(node, dict):
            if node.get("/Type") == "/Font":
                base_font = node.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                if normalized not in found:
                    found[normalized] = {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    }
                if key not in found[normalized]["resource_ids"]:
                    found[normalized]["resource_ids"].append(key)
            for child_key in node:
                found = collect_fonts(node[child_key], found, child_key)
        elif isinstance(node, (list, ArrayObject)):
            for child in node:
                found = collect_fonts(child, found)
        return found

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if font_names:
            page_fonts = collect_fonts(page.get("/Resources"))
            # Add relevant resource names for removal
            for wanted in font_names:
                if wanted in page_fonts:
                    resource_ids_to_remove.extend(page_fonts[wanted]["resource_ids"])

        text_filters = {"font_ids": resource_ids_to_remove} if font_names else {}
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2324

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. If this argument is
            omitted, the border array ``[2, 2, 2]`` is used.

    Raises:
        ValueError: If ``rect`` is a string containing a token that cannot
            be converted to a number.

    """
    # NOTE(review): only the direct kids of the page tree root are indexed.
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into its four coordinates. The string
        # used to be passed to NumberObject, which cannot represent it.
        coordinates = rect.replace(",", " ").strip().strip("[]").split()
        rect = RectangleObject([float(value) for value in coordinates])
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2391

# Layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout", "/SinglePage", "/OneColumn",
    "/TwoColumnLeft", "/TwoColumnRight",
    "/TwoPageLeft", "/TwoPageRight",
)

2401

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the catalog's ``/PageLayout`` entry, or ``None`` if it is absent."""
    try:
        stored_layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, stored_layout)

2407

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not a ``NameObject`` is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is
    stored anyway.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Fixed: the old expression ``'', ''.join(...)`` built a tuple
            # and printed its repr instead of a comma-separated list.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2442

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (public wrapper around ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    self._set_page_layout(layout=layout)

2470

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property.

    Reading delegates to ``_get_page_layout``; assigning delegates to
    ``_set_page_layout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    self._set_page_layout(layout=layout)

2499

# Page mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone", "/UseOutlines", "/UseThumbs",
    "/FullScreen", "/UseOC", "/UseAttachments",
)

2508

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the catalog's ``/PageMode`` entry, or ``None`` if it is absent."""
    try:
        stored_mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, stored_mode)

2514

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property.

    Reading delegates to ``_get_page_mode``. On assignment, a value that
    is not a ``NameObject`` is checked against ``_valid_modes``; an
    unknown value only triggers a warning and is stored anyway.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    needs_wrapping = not isinstance(mode, NameObject)
    if needs_wrapping and mode not in self._valid_modes:
        logger_warning(
            f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
        )
    mode_name: NameObject = (
        NameObject(mode) if needs_wrapping else cast(NameObject, mode)
    )
    self._root_object.update({NameObject("/PageMode"): mode_name})

2549

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a
            ``PageObject``.

    """
    page = page_number
    if isinstance(page, int):
        page = self.pages[page]
    elif not isinstance(page, PageObject):
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    # Make sure the page has an annotation array to append to.
    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
        assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = to_add.get("/Subtype") == "/Link" and "/Dest" in to_add
    if is_internal_link:
        tmp = cast(Dict[Any, Any], to_add[NameObject("/Dest")])
        dest = Destination(
            NameObject("/LinkName"),
            tmp["target_page_index"],
            Fit(
                fit_type=tmp["fit"], fit_args=dict(tmp)["fit_args"]
            ),  # I have no clue why this dict-hack is necessary
        )
        to_add[NameObject("/Dest")] = dest.dest_array

    page.annotations.append(self._add_object(to_add))

    # A popup registers itself on its parent annotation.
    if to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add:
        parent_annotation = cast(DictionaryObject, to_add["/Parent"].get_object())
        parent_annotation[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2603

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For every annotation, a ``/Dest`` that is a ``NameObject`` is
    converted; otherwise the ``/D`` entry of its ``/A`` action, if any,
    is converted.

    Args:
        page: The page, or a reference to it, to clean in place.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annotation_ref in page.get("/Annots", []):
        annotation = annotation_ref.get_object()
        destination = annotation.get("/Dest", None)
        action = annotation.get("/A", None)
        if isinstance(destination, NameObject):
            annotation[NameObject("/Dest")] = TextStringObject(destination)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_destination = action.get("/D", None)
        if isinstance(action_destination, NameObject):
            action[NameObject("/D")] = TextStringObject(action_destination)
    return page

2630

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory ``BytesIO``.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or an
            object offering ``seek`` and ``read``.

    Returns:
        A tuple of the ``BytesIO`` copy and the reader's encryption object;
        the latter is ``None`` unless ``fileobj`` is a ``PdfReader`` with a
        truthy ``_encryption``.

    Raises:
        NotImplementedError: If ``fileobj`` is none of the supported kinds.

    """
    # A string or Path is taken as a file location to read from.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as source:
            return BytesIO(source.read()), None

    # For a PdfReader, copy its stream and keep its encryption object.
    if isinstance(fileobj, PdfReader):
        encryption_obj = None
        if fileobj._encryption:
            encryption_obj = fileobj._encryption
        previous_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        copied = BytesIO(fileobj.stream.read())
        # reset the stream to its original location
        fileobj.stream.seek(previous_position)
        return copied, encryption_obj

    # Any file-like object is rewound and read completely.
    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2666

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file. A tuple, list or ``PageRange`` given here is
            treated as the ``pages`` argument instead.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    outline_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # The page selection was given in the ``outline_item`` slot: shift
        # the following positional arguments by one place.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2734

2735 def merge(

2736 self,

2737 position: Optional[int],

2738 fileobj: Union[Path, StrByteType, PdfReader],

2739 outline_item: Optional[str] = None,

2740 pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,

2741 import_outline: bool = True,

2742 excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),

2743 ) -> None:

2744 """

2745 Merge the pages from the given file into the output file at the

2746 specified page number.

2747

2748 Args:

2749 position: The *page number* to insert this file. File will

2750 be inserted after the given number.

2751 fileobj: A File Object or an object that supports the standard

2752 read and seek methods similar to a File Object. Could also be a

2753 string representing a path to a PDF file.

2754 outline_item: Optionally, you may specify a string to build an outline

2755 (aka 'bookmark') to identify the

2756 beginning of the included file.

2757 pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`

2758 or a ``(start, stop[, step])`` tuple

2759 or a list of pages to be processed

2760 to merge only the specified range of pages from the source

2761 document into the output document.

2762 import_outline: You may prevent the source document's

2763 outline (collection of outline items, previously referred to as

2764 'bookmarks') from being imported by specifying this as ``False``.

2765 excluded_fields: provide the list of fields/keys to be ignored

2766 if ``/Annots`` is part of the list, the annotation will be ignored

2767 if ``/B`` is part of the list, the articles will be ignored

2768

2769 Raises:

2770 TypeError: The pages attribute is not configured properly

2771

2772 """

2773 if isinstance(fileobj, PdfDocCommon):

2774 reader = fileobj

2775 else:

2776 stream, encryption_obj = self._create_stream(fileobj)

2777 # Create a new PdfReader instance using the stream

2778 # (either file or BytesIO or StringIO) created above

2779 reader = PdfReader(stream, strict=False) # type: ignore[arg-type]

2780

2781 if excluded_fields is None:

2782 excluded_fields = ()

2783 # Find the range of pages to merge.

2784 if pages is None:

2785 pages = list(range(len(reader.pages)))

2786 elif isinstance(pages, PageRange):

2787 pages = list(range(*pages.indices(len(reader.pages))))

2788 elif isinstance(pages, list):

2789 pass # keep unchanged

2790 elif isinstance(pages, tuple) and len(pages) <= 3:

2791 pages = list(range(*pages))

2792 elif not isinstance(pages, tuple):

2793 raise TypeError(

2794 '"pages" must be a tuple of (start, stop[, step]) or a list'

2795 )

2796

2797 srcpages = {}

2798 for page in pages:

2799 if isinstance(page, PageObject):

2800 pg = page

2801 else:

2802 pg = reader.pages[page]

2803 assert pg.indirect_reference is not None

2804 if position is None:

2805 # numbers in the exclude list identifies that the exclusion is

2806 # only applicable to 1st level of cloning

2807 srcpages[pg.indirect_reference.idnum] = self.add_page(

2808 pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2809 )

2810 else:

2811 srcpages[pg.indirect_reference.idnum] = self.insert_page(

2812 pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2813 )

2814 position += 1

2815 srcpages[pg.indirect_reference.idnum].original_page = pg

2816

2817 reader._named_destinations = (

2818 reader.named_destinations

2819 ) # need for the outline processing below

2820

2821 arr: Any

2822

2823 def _process_named_dests(dest: Any) -> None:

2824 arr = dest.dest_array

2825 if "/Names" in self._root_object and dest["/Title"] in cast(

2826 List[Any],

2827 cast(

2828 DictionaryObject,

2829 cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),

2830 ).get("/Names", DictionaryObject()),

2831 ):

2832 # already exists: should not duplicate it

2833 pass

2834 elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):

2835 pass

2836 elif isinstance(dest["/Page"], int):

2837 # the page reference is a page number normally not a PDF Reference

2838 # page numbers as int are normally accepted only in external goto

2839 try:

2840 p = reader.pages[dest["/Page"]]

2841 except IndexError:

2842 return

2843 assert p.indirect_reference is not None

2844 try:

2845 arr[NumberObject(0)] = NumberObject(

2846 srcpages[p.indirect_reference.idnum].page_number

2847 )

2848 self.add_named_destination_array(dest["/Title"], arr)

2849 except KeyError:

2850 pass

2851 elif dest["/Page"].indirect_reference.idnum in srcpages:

2852 arr[NumberObject(0)] = srcpages[

2853 dest["/Page"].indirect_reference.idnum

2854 ].indirect_reference

2855 self.add_named_destination_array(dest["/Title"], arr)

2856

2857 for dest in reader._named_destinations.values():

2858 _process_named_dests(dest)

2859

2860 outline_item_typ: TreeObject

2861 if outline_item is not None:

2862 outline_item_typ = cast(

2863 "TreeObject",

2864 self.add_outline_item(

2865 TextStringObject(outline_item),

2866 next(iter(srcpages.values())).indirect_reference,

2867 fit=PAGE_FIT,

2868 ).get_object(),

2869 )

2870 else:

2871 outline_item_typ = self.get_outline_root()

2872

2873 _ro = reader.root_object

2874 if import_outline and CO.OUTLINES in _ro:

2875 outline = self._get_filtered_outline(

2876 _ro.get(CO.OUTLINES, None), srcpages, reader

2877 )

2878 self._insert_filtered_outline(

2879 outline, outline_item_typ, None

2880 ) # TODO: use before parameter

2881

2882 if "/Annots" not in excluded_fields:

2883 for pag in srcpages.values():

2884 lst = self._insert_filtered_annotations(

2885 pag.original_page.get("/Annots", []), pag, srcpages, reader

2886 )

2887 if len(lst) > 0:

2888 pag[NameObject("/Annots")] = lst

2889 self.clean_page(pag)

2890

2891 if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:

2892 if "/AcroForm" not in self._root_object:

2893 self._root_object[NameObject("/AcroForm")] = self._add_object(

2894 cast(

2895 DictionaryObject,

2896 reader.root_object["/AcroForm"],

2897 ).clone(self, False, ("/Fields",))

2898 )

2899 arr = ArrayObject()

2900 else:

2901 arr = cast(

2902 ArrayObject,

2903 cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],

2904 )

2905 trslat = self._id_translated[id(reader)]

2906 try:

2907 for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore

2908 try:

2909 ind = IndirectObject(trslat[f.idnum], 0, self)

2910 if ind not in arr:

2911 arr.append(ind)

2912 except KeyError:

2913 # for trslat[] which mean the field has not be copied

2914 # through the page

2915 pass

2916 except KeyError: # for /Acroform or /Fields are not existing

2917 arr = self._add_object(ArrayObject())

2918 cast(DictionaryObject, self._root_object["/AcroForm"])[

2919 NameObject("/Fields")

2920 ] = arr

2921

2922 if "/B" not in excluded_fields:

2923 self.add_filtered_articles("", srcpages, reader)

2924

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry, then the
    source article chain is walked through ``/N`` starting at ``/F``.
    For every article whose ``/P`` page has a clone in ``pages``, a new
    article dictionary is created in the writer, chained to the previous
    new one (``/N`` / ``/V``) and appended to the ``/B`` array of the
    cloned page.

    Args:
        thread: thread dictionary coming from the reader.
        pages: cloned pages indexed by the object number of their source
            page (passed through to ``_get_cloned_page``).
        reader: the document the thread is taken from.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first kept article: it becomes the head of the new chain
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # later kept article: link it both ways with the previous one
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source chain: close the new ring.
            # When no article was kept there is nothing to link; skipping
            # avoids subscripting ``None`` / reading an unset ``new_first``.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2989

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (or its source string) matched against thread titles
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every entry found in the ``/B`` array of the original page of each
    cloned page is inspected. Its thread (``/T``) is added through
    ``_add_articles_thread`` when the thread is not yet listed in the
    translation table of ``reader`` and ``fltr`` matches the ``/Title``
    stored under the thread's ``/I`` entry (an empty string when absent).

    Args:
        fltr: compiled pattern or pattern string; any other value is
            replaced with an empty pattern.
        pages: cloned pages; each one exposes ``original_page``.
        reader: the document the pages were cloned from.

    """
    if not isinstance(fltr, Pattern):
        # a string is compiled as is, anything else falls back to ""
        fltr = re.compile(fltr if isinstance(fltr, str) else "")
    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thread = bead.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # looked up on every pass: adding a thread may extend the table
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if fltr.search(title):
                self._add_articles_thread(thread, pages, reader)

3022

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the clone made for a source page.

    Args:
        page: the source page, given as a ``/Page`` dictionary, as an
            indirect reference, or as ``None`` / ``NullObject``.
        pages: cloned pages indexed by the object number (``idnum``) of
            the source page.
        reader: not used by this method.

    Returns:
        The indirect reference of the cloned page, or ``None`` when
        ``page`` is of an unsupported kind, has no indirect reference,
        or has no entry in ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # None or a dictionary that is not a page: nothing to look up
        return None
    if source_ref is None:
        return None
    cloned = pages.get(source_ref.idnum)
    if cloned is None:
        return None
    return cloned.indirect_reference

3039

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations that are still meaningful in the writer.

    Annotations without any destination are always cloned. Annotations
    pointing to a named destination are cloned only when that name is
    present in ``get_named_dest_root()``. Annotations pointing to an
    explicit destination array are cloned only when its first element
    resolves to a cloned page; the array is then rewritten to target
    that clone.

    Args:
        annots: list of annotations, or an indirect reference to it.
        page: not used by this method.
        pages: cloned pages, passed to ``_get_cloned_page``.
        reader: source document, passed to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` holding the references of the kept annotations
        (empty when ``annots`` is ``None`` or not a list).

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist

    for entry in annots:
        annot = cast("DictionaryObject", entry.get_object())
        # a /Link whose /GoTo action carries the target and that has no /Dest
        goto_action_only = (
            annot["/Subtype"] == "/Link"
            and "/A" in annot
            and cast("DictionaryObject", annot["/A"])["/S"] == "/GoTo"
            and "/Dest" not in annot
        )

        if goto_action_only:
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    outlist.append(annot.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            new_page = self._get_cloned_page(target[0], pages, reader)
            if new_page is not None:
                copied = annot.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", copied["/A"])[
                    NameObject("/D")
                ] = ArrayObject([new_page, *target[1:]])
                outlist.append(self._add_object(copied))
            continue

        if "/Dest" not in annot:
            # nothing to retarget: plain copy
            outlist.append(self._add_object(annot.clone(self)))
            continue

        target = annot["/Dest"]
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                outlist.append(annot.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        new_page = self._get_cloned_page(target[0], pages, reader)
        if new_page is not None:
            copied = annot.clone(self, ignore_fields=("/Dest",))
            copied[NameObject("/Dest")] = ArrayObject([new_page, *target[1:]])
            outlist.append(self._add_object(copied))
    return outlist

3096

def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is an ``/Outlines`` root, or that has no ``/Title``, is
    only a container: the result is the filtered content of its
    ``/First`` child. Otherwise the node and its ``/Next`` siblings are
    visited; an item is kept when its page has a clone in ``pages`` or
    when at least one of its children is kept.

    Args:
        node: outline node (or a reference to it); ``None`` and null
            objects are treated as an empty dictionary.
        pages: cloned pages, passed to ``_get_cloned_page``.
        reader: source document used to build the outline items.

    Returns:
        A list of destination objects, each carrying its kept children
        in ``_filtered_children``.

    """
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        first_child = node.get("/First", None)
        if first_child is None:
            return []
        return [*self._get_filtered_outline(first_child.get_object(), pages, reader)]

    kept = []
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned_ref = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        # an item whose page was not merged points to a null page
        item[NameObject("/Page")] = NullObject() if cloned_ref is None else cloned_ref
        item._filtered_children = (
            self._get_filtered_outline(node["/First"], pages, reader)
            if "/First" in node
            else []
        )
        page_is_null = isinstance(item["/Page"], NullObject)
        if not page_is_null or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept

3148

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Build a new outline entry in the writer from a destination.

    The new entry receives the title of ``dest``. When the page of
    ``dest`` is not null it also receives either a clone of the ``/A``
    entry of the source node (when there is one) or the destination
    array. ``/F`` and ``/C`` are copied from the source node when a node
    is attached, defaulting to ``0`` and ``[0.0, 0.0, 0.0]``.

    Args:
        dest: the destination to copy.

    Returns:
        The new outline entry, already registered in the writer.

    """
    entry = TreeObject()
    self._add_object(entry)
    entry[NameObject("/Title")] = TextStringObject(dest["/Title"])

    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            entry[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            entry[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is None:
        return entry

    black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
    entry[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
    entry[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return entry

3167

def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert filtered outline items, and their kept children, under a parent.

    Args:
        outlines: items to insert; each one carries its children in
            ``_filtered_children``.
        parent: outline entry receiving the new items.
        before: passed to ``insert_child`` for the items of this level;
            children are always inserted with ``None``.

    """
    for item in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_container = item.get("/Type", "") == "/Outlines" or "/Title" not in item
        if is_container:
            # nothing to create: children are attached to the current parent
            children_parent = parent
        else:
            children_parent = self._clone_outline(item)
            parent_tree = cast(TreeObject, parent.get_object())
            parent_tree.insert_child(children_parent, before, self)
        self._insert_filtered_outline(item._filtered_children, children_parent, None)

3183

def close(self) -> None:
    """Do nothing; implemented for API harmonization."""
    return None

3187

def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search the outline for an item and return the index path leading to it.

    A node matches when its indirect reference or its ``/Title`` equals
    ``outline_item``. Nodes are visited through ``/Next``; the subtree
    under ``/First`` is searched recursively.

    Args:
        outline_item: value compared with the reference and the title of
            each node.
        root: node to start from; the outline root of the writer is used
            when omitted.

    Returns:
        A list of sibling indexes, one per level (levels whose node has
        no ``/Title`` are omitted), or ``None`` when nothing matches.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        matches = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if matches:
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                own_level = [index] if "/Title" in node else []
                return own_level + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None

3216

def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.

    Both arguments are ignored; the method only reports the deprecation.
    """
    old_name, new_name = "find_bookmark", "find_outline_item"
    deprecation_with_replacement(old_name, new_name, "5.0.0")

3227

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is neither ``None``, a PdfReader nor an
            IndirectObject.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        source = reader
    elif isinstance(reader, IndirectObject):
        source = reader.pdf
    else:
        raise Exception(f"invalid parameter {reader}")
    # a reader without a table is silently accepted
    self._id_translated.pop(id(source), None)

3255

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               The default ``0`` means that no explicit start value is stored.

    Raises:
        ValueError: no style and no prefix given, an index is out of
            range, or ``start`` is below 1 (and not 0).

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_from > page_index_to:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    page_count = len(self.pages)
    if page_index_to >= page_count:
        raise ValueError("page_index_to exceeds number of pages")
    if start is not None and start != 0 and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3306

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done here; see ``set_page_label`` for the checked
    entry point.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``None`` and ``0`` both mean that no ``/St`` entry is written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``start`` is Optional: None must not reach NumberObject()
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no label tree yet: create one where page 0 uses the default label
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # pages after the range fall back to the default label unless a label
    # already starts right there or the range reaches the last page
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3371

3372 def _repr_mimebundle_(

3373 self,

3374 include: Union[None, Iterable[str]] = None,

3375 exclude: Union[None, Iterable[str]] = None,

3376 ) -> Dict[str, Any]:

3377 """

3378 Integration into Jupyter Notebooks.

3379

3380 This method returns a dictionary that maps a mime-type to its

3381 representation.

3382

3383 .. seealso::

3384

3385 https://ipython.readthedocs.io/en/stable/config/integrating.html

3386 """

3387 pdf_data = BytesIO()

3388 self.write(pdf_data)

3389 data = {

3390 "application/pdf": pdf_data,

3391 }

3392

3393 if include is not None:

3394 # Filter representations based on include list

3395 data = {k: v for k, v in data.items() if k in include}

3396

3397 if exclude is not None:

3398 # Remove representations based on exclude list

3399 data = {k: v for k, v in data.items() if k not in exclude}

3400

3401 return data

3402

3403

def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    Args:
        obj: a PdfObject (returned unchanged), a dict (keys become names,
            values are converted recursively), a string (a name when it
            starts with ``/``, a text string otherwise), a number
            (converted to a FloatObject) or a list (converted item by item).

    Returns:
        The PDF object representing ``obj``.

    Raises:
        NotImplementedError: ``obj`` is of none of the supported types.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3423

3424

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline entry from its title, action and formatting.

    Args:
        action_ref: stored under ``/A`` when not ``None``.
        title: stored under ``/Title``.
        color: RGB triple, or a string converted with ``hex_to_rgb``;
            stored under ``/C`` when truthy.
        italic: adds the italic flag to ``/F``.
        bold: adds the bold flag to ``/F``.

    Returns:
        The new outline entry; ``/F`` is only present when at least one
        of ``italic`` and ``bold`` is set.

    """
    entry = TreeObject()
    if action_ref is not None:
        entry[NameObject("/A")] = action_ref
    entry.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        entry.update({NameObject("/C"): components})

    if italic or bold:
        flags = 0
        flags += OutlineFontFlag.italic if italic else 0
        flags += OutlineFontFlag.bold if bold else 0
        entry.update({NameObject("/F"): NumberObject(flags)})
    return entry

3454

3455

def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing the text of a form field.

    The text is split into lines on ``\\n`` and ``\\r``. Each line is
    encoded character by character through ``font_full_rev`` (UTF-16-BE
    when a character is missing) and shown with ``Tj``, as a hexadecimal
    string when any character needs two bytes or more, as a literal
    string otherwise. Lines listed in ``sel`` get a box drawn around them.

    Args:
        txt: text to show.
        sel: lines to mark as selected.
        da: default appearance operators, emitted before the text and
            after each selection box.
        font_full_rev: mapping from character to its encoded bytes.
        rct: rectangle of the field; only ``width`` and ``height`` are read.
        font_height: font size; lines are ``font_height * 1.4`` apart.
        y_offset: vertical position of the first line.

    Returns:
        The content stream.

    """
    parts: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            parts.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            parts.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            parts.append(f"0 {-font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        if any(len(c) >= 2 for c in enc_line):
            parts.append(b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n")
        else:
            # In a literal string, a backslash or an unbalanced parenthesis
            # would end or corrupt the string: escape them (backslash first
            # so the escapes themselves are not doubled).
            literal = (
                b"".join(enc_line)
                .replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            parts.append(b"(" + literal + b") Tj\n")
    parts.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(parts)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%

1451 statements