Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import uuid

36from collections.abc import Iterable, Mapping

37from io import BytesIO, FileIO, IOBase

38from itertools import compress

39from pathlib import Path

40from re import Pattern

41from types import TracebackType

42from typing import (

43 IO,

44 Any,

45 Callable,

46 Optional,

47 Union,

48 cast,

49)

51from ._doc_common import DocumentInformation, PdfDocCommon

52from ._encryption import EncryptAlgorithm, Encryption

53from ._page import PageObject, Transformation

54from ._page_labels import nums_clear_range, nums_insert, nums_next

55from ._reader import PdfReader

56from ._utils import (

57 StrByteType,

58 StreamType,

59 _get_max_pdf_version_header,

60 deprecation_no_replacement,

61 logger_warning,

62)

63from .constants import AnnotationDictionaryAttributes as AA

64from .constants import CatalogAttributes as CA

65from .constants import (

66 CatalogDictionary,

67 GoToActionArguments,

68 ImageType,

69 InteractiveFormDictEntries,

70 OutlineFontFlag,

71 PageLabelStyle,

72 PagesAttributes,

73 TypFitArguments,

74 UserAccessPermissions,

75)

76from .constants import Core as CO

77from .constants import FieldDictionaryAttributes as FA

78from .constants import PageAttributes as PG

79from .constants import TrailerKeys as TK

80from .errors import PdfReadError, PyPdfError

81from .generic import (

82 PAGE_FIT,

83 ArrayObject,

84 BooleanObject,

85 ByteStringObject,

86 ContentStream,

87 Destination,

88 DictionaryObject,

89 EmbeddedFile,

90 Fit,

91 FloatObject,

92 IndirectObject,

93 NameObject,

94 NullObject,

95 NumberObject,

96 PdfObject,

97 RectangleObject,

98 ReferenceLink,

99 StreamObject,

100 TextStringObject,

101 TreeObject,

102 ViewerPreferences,

103 create_string_object,

104 extract_links,

105 hex_to_rgb,

106 is_null_or_none,

107)

108from .generic._appearance_stream import TextStreamAppearance

109from .pagerange import PageRange, PageRangeSpec

110from .types import (

111 AnnotationSubtype,

112 BorderArrayType,

113 LayoutType,

114 OutlineItemType,

115 OutlineType,

116 PagemodeType,

117)

118from .xmp import XmpInformation

119

# Module-level constant holding the value returned by
# UserAccessPermissions.all() (presumably every permission bit set --
# confirm against the definition in .constants).
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

121

122

class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming categories of page objects.

    Each member occupies its own bit, so members can be combined with ``|``.
    ``IMAGES`` is the union of the three image-related flags.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Convenience alias covering every kind of image.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

134

135

def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Return the hexadecimal MD5 digest of the data readable from ``stream``.

    The stream is read from its current position in chunks of at most
    ``blocksize`` bytes until ``read`` returns ``b""``; it is not rewound
    afterwards. The digest is created with ``usedforsecurity=False``.

    Args:
        stream: Binary stream to read.
        blocksize: Maximum number of bytes requested per read.

    Returns:
        The MD5 hex digest of the bytes read.

    """
    digest = hashlib.md5(usedforsecurity=False)
    while True:
        chunk = stream.read(blocksize)
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()

141

142

143class PdfWriter(PdfDocCommon):

144 """

145 Write a PDF file out, given pages produced by another class or through

146 cloning a PDF file during initialization.

147

148 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

149

150 Args:

151 clone_from: identical to fileobj (for compatibility)

152

153 incremental: If true, loads the document and set the PdfWriter in incremental mode.

154

155 When writing incrementally, the original document is written first and new/modified

156 content is appended. To be used for signed document/forms to keep signature valid.

157

158 full: If true, loads all the objects (always full if incremental = True).

159 This parameter may allow loading large PDFs.

160

161 strict: If true, pypdf will raise an exception if a PDF does not follow the specification.

162 If false, pypdf will try to be forgiving and do something reasonable, but it will log

163 a warning message. It is a best-effort approach.

164

165 """

166

    def __init__(
        self,
        fileobj: Union[None, PdfReader, StrByteType, Path] = "",
        clone_from: Union[None, PdfReader, StrByteType, Path] = None,
        incremental: bool = False,
        full: bool = False,
        strict: bool = False,
    ) -> None:
        """
        Set up the writer state and, when a source is available, clone it.

        Args:
            fileobj: Source and/or target of the document. It is remembered
                in ``temp_fileobj`` and, depending on its type and content,
                may also be used as the document to clone.
            clone_from: Explicit document to clone.
            incremental: Start the writer in incremental mode.
            full: Load through the incremental code path, then reset
                ``self.incremental`` to False unless ``incremental`` is set.
            strict: Stored in ``self.strict``.

        Raises:
            PyPdfError: In incremental/full mode, if ``fileobj`` is not a
                path, a ``BytesIO`` or a ``PdfReader``.

        """
        self.strict = strict
        """
        If true, pypdf will raise an exception if a PDF does not follow the specification.
        If false, pypdf will try to be forgiving and do something reasonable, but it will log
        a warning message. It is a best-effort approach.
        """

        # "full" temporarily goes through the incremental branch below; the
        # flag is reset near the end of this method when only "full" was asked.
        self.incremental = incremental or full
        """
        Returns if the PdfWriter object has been started in incremental mode.
        """

        self._objects: list[Optional[PdfObject]] = []
        """
        The indirect objects in the PDF.
        For the incremental case, it will be filled with None
        in clone_reader_document_root.
        """

        self._original_hash: list[int] = []
        """
        List of hashes after import; used to identify changes.
        """

        self._idnum_hash: dict[bytes, tuple[IndirectObject, list[IndirectObject]]] = {}
        """
        Maps hash values of indirect objects to the list of IndirectObjects.
        This is used for compression.
        """

        self._id_translated: dict[int, dict[int, int]] = {}
        """List of already translated IDs.
        dict[id(pdf)][(idnum, generation)]
        """

        # Annotation only: no value is assigned here. The non-incremental
        # branch below assigns it; in the incremental case it is presumably
        # set while cloning -- TODO confirm.
        self._info_obj: Optional[PdfObject]
        """The PDF files's document information dictionary,
        the Info entry in the PDF file's trailer dictionary."""

        self._ID: Union[ArrayObject, None] = None
        """The PDF file identifier,
        defined by the ID in the PDF file's trailer dictionary."""

        self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
        "Tracks links in pages added to the writer for resolving later."
        self._merged_in_pages: dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
        "Tracks pages added to the writer and what page they turned into."

        if self.incremental:
            # Normalise the source to a PdfReader: path -> bytes in memory ->
            # reader. Anything else is rejected.
            if isinstance(fileobj, (str, Path)):
                with open(fileobj, "rb") as f:
                    fileobj = BytesIO(f.read(-1))
            if isinstance(fileobj, BytesIO):
                fileobj = PdfReader(fileobj)
            if not isinstance(fileobj, PdfReader):
                raise PyPdfError("Invalid type for incremental mode")
            self._reader = fileobj  # prev content is in _reader.stream
            self._header = fileobj.pdf_header.encode()
            self._readonly = True  # TODO: to be analysed
        else:
            # Fresh document: default header and an /Info dictionary that
            # only names the producer.
            self._header = b"%PDF-1.3"
            self._info_obj = self._add_object(
                DictionaryObject(
                    {NameObject("/Producer"): create_string_object("pypdf")}
                )
            )

        def _get_clone_from(
            fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
            clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
            """
            Decide which document, if any, has to be cloned.

            ``clone_from`` is returned unchanged when ``fileobj`` is a
            str/Path/IO/BytesIO and either equals ``""`` or ``clone_from``
            was given. Otherwise ``fileobj`` is returned, unless it is a
            path that does not exist or is empty, or a stream of length 0;
            in those cases ``clone_from`` is returned.
            """
            if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
                fileobj == "" or clone_from is not None
            ):
                return clone_from
            cloning = True
            if isinstance(fileobj, (str, Path)) and (
                not Path(str(fileobj)).exists()
                or Path(str(fileobj)).stat().st_size == 0
            ):
                cloning = False
            if isinstance(fileobj, (IOBase, BytesIO)):
                # Measure the stream length by seeking to the end, then
                # restore the caller's position.
                t = fileobj.tell()
                if fileobj.seek(0, 2) == 0:
                    cloning = False
                fileobj.seek(t, 0)
            if cloning:
                clone_from = fileobj
            return clone_from

        clone_from = _get_clone_from(fileobj, clone_from)
        # To prevent overwriting
        self.temp_fileobj = fileobj
        self.fileobj = ""
        self._with_as_usage = False
        self._cloned = False
        # The root of our page tree node
        pages = DictionaryObject(
            {
                NameObject(PagesAttributes.TYPE): NameObject("/Pages"),
                NameObject(PagesAttributes.COUNT): NumberObject(0),
                NameObject(PagesAttributes.KIDS): ArrayObject(),
            }
        )
        self.flattened_pages = []
        self._encryption: Optional[Encryption] = None
        self._encrypt_entry: Optional[DictionaryObject] = None

        if clone_from is not None:
            if not isinstance(clone_from, PdfReader):
                clone_from = PdfReader(clone_from)
            self.clone_document_from_reader(clone_from)
            self._cloned = True
        else:
            # Nothing to clone: register the empty page tree and a catalog
            # pointing at it.
            self._pages = self._add_object(pages)
            self._root_object = DictionaryObject(
                {
                    NameObject(PagesAttributes.TYPE): NameObject(CO.CATALOG),
                    NameObject(CO.PAGES): self._pages,
                }
            )
            self._add_object(self._root_object)
        # "full" without "incremental" only borrowed the incremental loading
        # path above; the writer itself is not incremental.
        if full and not incremental:
            self.incremental = False
        # Convert text-string identifiers in /ID to byte strings.
        if isinstance(self._ID, list):
            if isinstance(self._ID[0], TextStringObject):
                self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
            if isinstance(self._ID[1], TextStringObject):
                self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

304

305 # for commonality

    @property
    def is_encrypted(self) -> bool:
        """
        Read-only boolean property showing whether this PDF file is encrypted.

        This implementation always returns ``False``; the property exists so
        that the writer offers the same attribute as the reader.
        """
        return False

315

    @property
    def root_object(self) -> DictionaryObject:
        """
        Provide direct access to PDF Structure.

        Returns the dictionary stored in ``self._root_object`` itself, not a
        copy, so any modification affects the writer.

        Note:
            Recommended only for read access.

        """
        return self._root_object

326

327 @property

328 def _info(self) -> Optional[DictionaryObject]:

329 """

330 Provide access to "/Info". Standardized with PdfReader.

331

332 Returns:

333 /Info Dictionary; None if the entry does not exist

334

335 """

336 return (

337 None

338 if self._info_obj is None

339 else cast(DictionaryObject, self._info_obj.get_object())

340 )

341

342 @_info.setter

343 def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:

344 if value is None:

345 try:

346 self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore

347 except (KeyError, AttributeError):

348 pass

349 self._info_obj = None

350 else:

351 if self._info_obj is None:

352 self._info_obj = self._add_object(DictionaryObject())

353 obj = cast(DictionaryObject, self._info_obj.get_object())

354 obj.clear()

355 obj.update(cast(DictionaryObject, value.get_object()))

356

357 @property

358 def xmp_metadata(self) -> Optional[XmpInformation]:

359 """XMP (Extensible Metadata Platform) data."""

360 return cast(XmpInformation, self.root_object.xmp_metadata)

361

362 @xmp_metadata.setter

363 def xmp_metadata(self, value: Union[XmpInformation, bytes, None]) -> None:

364 """XMP (Extensible Metadata Platform) data."""

365 if value is None:

366 if "/Metadata" in self.root_object:

367 del self.root_object["/Metadata"]

368 return

369

370 metadata = self.root_object.get("/Metadata", None)

371 if not isinstance(metadata, IndirectObject):

372 if metadata is not None:

373 del self.root_object["/Metadata"]

374 metadata_stream = StreamObject()

375 stream_reference = self._add_object(metadata_stream)

376 self.root_object[NameObject("/Metadata")] = stream_reference

377 else:

378 metadata_stream = cast(StreamObject, metadata.get_object())

379

380 if isinstance(value, XmpInformation):

381 bytes_data = value.stream.get_data()

382 else:

383 bytes_data = value

384 metadata_stream.set_data(bytes_data)

385

    @property
    def with_as_usage(self) -> bool:
        """
        Return the ``_with_as_usage`` flag (set to True by ``__enter__``).

        ``deprecation_no_replacement("with_as_usage", "5.0")`` is called
        first -- presumably it reports the deprecation; confirm in ``_utils``.
        """
        deprecation_no_replacement("with_as_usage", "5.0")
        return self._with_as_usage

    @with_as_usage.setter
    def with_as_usage(self, value: bool) -> None:
        """
        Store ``value`` in ``_with_as_usage``.

        ``deprecation_no_replacement("with_as_usage", "5.0")`` is called
        first -- presumably it reports the deprecation; confirm in ``_utils``.
        """
        deprecation_no_replacement("with_as_usage", "5.0")
        self._with_as_usage = value

395

396 def __enter__(self) -> "PdfWriter":

397 """Store how writer is initialized by 'with'."""

398 c: bool = self._cloned

399 t = self.temp_fileobj

400 self.__init__() # type: ignore

401 self._cloned = c

402 self._with_as_usage = True

403 self.fileobj = t # type: ignore

404 return self

405

406 def __exit__(

407 self,

408 exc_type: Optional[type[BaseException]],

409 exc: Optional[BaseException],

410 traceback: Optional[TracebackType],

411 ) -> None:

412 """Write data to the fileobj."""

413 if self.fileobj and not self._cloned:

414 self.write(self.fileobj)

415

416 @property

417 def pdf_header(self) -> str:

418 """

419 Read/Write property of the PDF header that is written.

420

421 This should be something like ``'%PDF-1.5'``. It is recommended to set

422 the lowest version that supports all features which are used within the

423 PDF file.

424

425 Note: `pdf_header` returns a string but accepts bytes or str for writing

426 """

427 return self._header.decode()

428

429 @pdf_header.setter

430 def pdf_header(self, new_header: Union[str, bytes]) -> None:

431 if isinstance(new_header, str):

432 new_header = new_header.encode()

433 self._header = new_header

434

435 def _add_object(self, obj: PdfObject) -> IndirectObject:

436 if (

437 getattr(obj, "indirect_reference", None) is not None

438 and obj.indirect_reference.pdf == self # type: ignore

439 ):

440 return obj.indirect_reference # type: ignore

441 # check for /Contents in Pages (/Contents in annotations are strings)

442 if isinstance(obj, DictionaryObject) and isinstance(

443 obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)

444 ):

445 obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

446 self._objects.append(obj)

447 obj.indirect_reference = IndirectObject(len(self._objects), 0, self)

448 return obj.indirect_reference

449

450 def get_object(

451 self,

452 indirect_reference: Union[int, IndirectObject],

453 ) -> PdfObject:

454 if isinstance(indirect_reference, int):

455 obj = self._objects[indirect_reference - 1]

456 elif indirect_reference.pdf != self:

457 raise ValueError("PDF must be self")

458 else:

459 obj = self._objects[indirect_reference.idnum - 1]

460 assert obj is not None, "mypy"

461 return obj

462

463 def _replace_object(

464 self,

465 indirect_reference: Union[int, IndirectObject],

466 obj: PdfObject,

467 ) -> PdfObject:

468 if isinstance(indirect_reference, IndirectObject):

469 if indirect_reference.pdf != self:

470 raise ValueError("PDF must be self")

471 indirect_reference = indirect_reference.idnum

472 gen = self._objects[indirect_reference - 1].indirect_reference.generation # type: ignore

473 if (

474 getattr(obj, "indirect_reference", None) is not None

475 and obj.indirect_reference.pdf != self # type: ignore

476 ):

477 obj = obj.clone(self)

478 self._objects[indirect_reference - 1] = obj

479 obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

480

481 assert isinstance(obj, PdfObject), "mypy"

482 return obj

483

    def _add_page(
        self,
        page: PageObject,
        index: int,
        excluded_keys: Iterable[str] = (),
    ) -> PageObject:
        """
        Clone ``page`` into this writer and link it into the page tree.

        Args:
            page: The page to add; must be a ``PageObject`` whose type entry
                equals ``CO.PAGE``.
            index: Position of the new page in ``self.flattened_pages``.
            excluded_keys: Keys passed to ``clone`` in addition to the parent
                entry and "/StructParents", which are always passed.

        Returns:
            The cloned page that now belongs to this writer.

        Raises:
            ValueError: If ``page`` is not a valid page object.
            PyPdfError: If more than 1000 ancestors are visited while the
                page counts are updated.

        """
        if not isinstance(page, PageObject) or page.get(PagesAttributes.TYPE, None) != CO.PAGE:
            raise ValueError("Invalid page object")
        assert self.flattened_pages is not None, "for mypy"
        page_org = page
        excluded_keys = list(excluded_keys)
        excluded_keys += [PagesAttributes.PARENT, "/StructParents"]
        # Acrobat does not accept two indirect references pointing on the same
        # page; therefore in order to add multiple copies of the same
        # page, we need to create a new dictionary for the page, however the
        # objects below (including content) are not duplicated:
        try:  # delete an already existing page
            del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore
                page_org.indirect_reference.idnum  # type: ignore
            ]
        except Exception:
            # Best effort: any failure here (for instance a page without an
            # indirect reference, or no recorded translation) is ignored.
            pass

        page = cast(
            "PageObject", page_org.clone(self, False, excluded_keys).get_object()
        )
        if page_org.pdf is not None:
            # Update this writer's header from both headers (presumably to
            # the higher PDF version -- see _get_max_pdf_version_header).
            other = page_org.pdf.pdf_header
            self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)

        # Find the page-tree node that receives the page and the position
        # inside its kids array; a negative position means "append".
        node, idx = self._get_page_in_node(index)
        page[NameObject(PagesAttributes.PARENT)] = node.indirect_reference

        if idx >= 0:
            cast(ArrayObject, node[PagesAttributes.KIDS]).insert(idx, page.indirect_reference)
            self.flattened_pages.insert(index, page)
        else:
            cast(ArrayObject, node[PagesAttributes.KIDS]).append(page.indirect_reference)
            self.flattened_pages.append(page)
        # Increment the page count of the node and of each of its ancestors;
        # the counter guards against cyclic parent links.
        recurse = 0
        while not is_null_or_none(node):
            node = cast(DictionaryObject, node.get_object())
            node[NameObject(PagesAttributes.COUNT)] = NumberObject(cast(int, node[PagesAttributes.COUNT]) + 1)
            node = node.get(PagesAttributes.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
            recurse += 1
            if recurse > 1000:
                raise PyPdfError("Too many recursive calls!")

        if page_org.pdf is not None:
            # the page may contain links to other pages, and those other
            # pages may or may not already be added. we store the
            # information we need, so that we can resolve the references
            # later.
            self._unresolved_links.extend(extract_links(page, page_org))
            self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference

        return page

541

542 def set_need_appearances_writer(self, state: bool = True) -> None:

543 """

544 Sets the "NeedAppearances" flag in the PDF writer.

545

546 The "NeedAppearances" flag indicates whether the appearance dictionary

547 for form fields should be automatically generated by the PDF viewer or

548 if the embedded appearance should be used.

549

550 Args:

551 state: The actual value of the NeedAppearances flag.

552

553 Returns:

554 None

555

556 """

557 # See §12.7.2 and §7.7.2 for more information:

558 # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf

559 try:

560 # get the AcroForm tree

561 if CatalogDictionary.ACRO_FORM not in self._root_object:

562 self._root_object[

563 NameObject(CatalogDictionary.ACRO_FORM)

564 ] = self._add_object(DictionaryObject())

565

566 need_appearances = NameObject(InteractiveFormDictEntries.NeedAppearances)

567 cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])[

568 need_appearances

569 ] = BooleanObject(state)

570 except Exception as exc: # pragma: no cover

571 logger_warning(

572 f"set_need_appearances_writer({state}) catch : {exc}", __name__

573 )

574

575 def create_viewer_preferences(self) -> ViewerPreferences:

576 o = ViewerPreferences()

577 self._root_object[

578 NameObject(CatalogDictionary.VIEWER_PREFERENCES)

579 ] = self._add_object(o)

580 return o

581

582 def add_page(

583 self,

584 page: PageObject,

585 excluded_keys: Iterable[str] = (),

586 ) -> PageObject:

587 """

588 Add a page to this PDF file.

589

590 Recommended for advanced usage including the adequate excluded_keys.

591

592 The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`

593 instance.

594

595 Args:

596 page: The page to add to the document. Should be

597 an instance of :class:`PageObject<pypdf._page.PageObject>`

598 excluded_keys:

599

600 Returns:

601 The added PageObject.

602

603 """

604 assert self.flattened_pages is not None, "mypy"

605 return self._add_page(page, len(self.flattened_pages), excluded_keys)

606

607 def insert_page(

608 self,

609 page: PageObject,

610 index: int = 0,

611 excluded_keys: Iterable[str] = (),

612 ) -> PageObject:

613 """

614 Insert a page in this PDF file. The page is usually acquired from a

615 :class:`PdfReader<pypdf.PdfReader>` instance.

616

617 Args:

618 page: The page to add to the document.

619 index: Position at which the page will be inserted.

620 excluded_keys:

621

622 Returns:

623 The added PageObject.

624

625 """

626 assert self.flattened_pages is not None, "mypy"

627 if index < 0:

628 index = len(self.flattened_pages) + index

629 if index < 0:

630 raise ValueError("Invalid index value")

631 if index >= len(self.flattened_pages):

632 return self.add_page(page, excluded_keys)

633 return self._add_page(page, index, excluded_keys)

634

635 def _get_page_number_by_indirect(

636 self, indirect_reference: Union[None, int, NullObject, IndirectObject]

637 ) -> Optional[int]:

638 """

639 Generate _page_id2num.

640

641 Args:

642 indirect_reference:

643

644 Returns:

645 The page number or None

646

647 """

648 # To provide same function as in PdfReader

649 if is_null_or_none(indirect_reference):

650 return None

651 assert indirect_reference is not None, "mypy"

652 if isinstance(indirect_reference, int):

653 indirect_reference = IndirectObject(indirect_reference, 0, self)

654 obj = indirect_reference.get_object()

655 if isinstance(obj, PageObject):

656 return obj.page_number

657 return None

658

659 def add_blank_page(

660 self, width: Optional[float] = None, height: Optional[float] = None

661 ) -> PageObject:

662 """

663 Append a blank page to this PDF file and return it.

664

665 If no page size is specified, use the size of the last page.

666

667 Args:

668 width: The width of the new page expressed in default user

669 space units.

670 height: The height of the new page expressed in default

671 user space units.

672

673 Returns:

674 The newly appended page.

675

676 Raises:

677 PageSizeNotDefinedError: if width and height are not defined

678 and previous page does not exist.

679

680 """

681 page = PageObject.create_blank_page(self, width, height)

682 return self.add_page(page)

683

684 def insert_blank_page(

685 self,

686 width: Optional[Union[float, decimal.Decimal]] = None,

687 height: Optional[Union[float, decimal.Decimal]] = None,

688 index: int = 0,

689 ) -> PageObject:

690 """

691 Insert a blank page to this PDF file and return it.

692

693 If no page size is specified, use the size of the last page.

694

695 Args:

696 width: The width of the new page expressed in default user

697 space units.

698 height: The height of the new page expressed in default

699 user space units.

700 index: Position to add the page.

701

702 Returns:

703 The newly inserted page.

704

705 Raises:

706 PageSizeNotDefinedError: if width and height are not defined

707 and previous page does not exist.

708

709 """

710 if width is None or (height is None and index < self.get_num_pages()):

711 oldpage = self.pages[index]

712 width = oldpage.mediabox.width

713 height = oldpage.mediabox.height

714 page = PageObject.create_blank_page(self, width, height)

715 self.insert_page(page, index)

716 return page

717

718 @property

719 def open_destination(

720 self,

721 ) -> Union[None, Destination, TextStringObject, ByteStringObject]:

722 return super().open_destination

723

724 @open_destination.setter

725 def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:

726 if dest is None:

727 try:

728 del self._root_object["/OpenAction"]

729 except KeyError:

730 pass

731 elif isinstance(dest, str):

732 self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)

733 elif isinstance(dest, Destination):

734 self._root_object[NameObject("/OpenAction")] = dest.dest_array

735 elif isinstance(dest, PageObject):

736 self._root_object[NameObject("/OpenAction")] = Destination(

737 "Opening",

738 dest.indirect_reference

739 if dest.indirect_reference is not None

740 else NullObject(),

741 PAGE_FIT,

742 ).dest_array

743

744 def add_js(self, javascript: str) -> None:

745 """

746 Add JavaScript which will launch upon opening this PDF.

747

748 Args:

749 javascript: Your JavaScript.

750

751 Example:

752 This will launch the print window when the PDF is opened.

753

754 >>> from pypdf import PdfWriter

755 >>> output = PdfWriter()

756 >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

757

758 """

759 # Names / JavaScript preferred to be able to add multiple scripts

760 if "/Names" not in self._root_object:

761 self._root_object[NameObject(CA.NAMES)] = DictionaryObject()

762 names = cast(DictionaryObject, self._root_object[CA.NAMES])

763 if "/JavaScript" not in names:

764 names[NameObject("/JavaScript")] = DictionaryObject(

765 {NameObject("/Names"): ArrayObject()}

766 )

767 js_list = cast(

768 ArrayObject, cast(DictionaryObject, names["/JavaScript"])["/Names"]

769 )

770 # We need a name for parameterized JavaScript in the PDF file,

771 # but it can be anything.

772 js_list.append(create_string_object(str(uuid.uuid4())))

773

774 js = DictionaryObject(

775 {

776 NameObject(PagesAttributes.TYPE): NameObject("/Action"),

777 NameObject("/S"): NameObject("/JavaScript"),

778 NameObject("/JS"): TextStringObject(f"{javascript}"),

779 }

780 )

781 js_list.append(self._add_object(js))

782

    def add_attachment(self, filename: str, data: Union[str, bytes]) -> "EmbeddedFile":
        """
        Embed a file inside the PDF.

        The work is delegated to ``EmbeddedFile._create_new``, called with
        this writer, ``filename`` and ``data``.

        Reference:
        https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
        Section 7.11.3

        Args:
            filename: The filename to display.
            data: The data in the file.

        Returns:
            EmbeddedFile instance for the newly created embedded file.

        """
        return EmbeddedFile._create_new(self, filename, data)

800

801 def append_pages_from_reader(

802 self,

803 reader: PdfReader,

804 after_page_append: Optional[Callable[[PageObject], None]] = None,

805 ) -> None:

806 """

807 Copy pages from reader to writer. Includes an optional callback

808 parameter which is invoked after pages are appended to the writer.

809

810 ``append`` should be preferred.

811

812 Args:

813 reader: a PdfReader object from which to copy page

814 annotations to this writer object. The writer's annots

815 will then be updated.

816 after_page_append:

817 Callback function that is invoked after each page is appended to

818 the writer. Signature includes a reference to the appended page

819 (delegates to append_pages_from_reader). The single parameter of

820 the callback is a reference to the page just appended to the

821 document.

822

823 """

824 reader_num_pages = len(reader.pages)

825 # Copy pages from reader to writer

826 for reader_page_number in range(reader_num_pages):

827 reader_page = reader.pages[reader_page_number]

828 writer_page = self.add_page(reader_page)

829 # Trigger callback, pass writer page as parameter

830 if callable(after_page_append):

831 after_page_append(writer_page)

832

833 def _merge_content_stream_to_page(

834 self,

835 page: PageObject,

836 new_content_data: bytes,

837 ) -> None:

838 """

839 Combines existing content stream(s) with new content (as bytes).

840

841 Args:

842 page: The page to which the new content data will be added.

843 new_content_data: A binary-encoded new content stream, for

844 instance the commands to draw an XObject.

845 """

846 # First resolve the existing page content. This always is an IndirectObject:

847 # PDF Explained by John Whitington

848 # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html

849 if NameObject("/Contents") in page:

850 existing_content_ref = page[NameObject("/Contents")]

851 existing_content = existing_content_ref.get_object()

852

853 if isinstance(existing_content, ArrayObject):

854 # Create a new StreamObject for the new_content_data

855 new_stream_obj = StreamObject()

856 new_stream_obj.set_data(new_content_data)

857 existing_content.append(self._add_object(new_stream_obj))

858 page[NameObject("/Contents")] = self._add_object(existing_content)

859 if isinstance(existing_content, StreamObject):

860 # Merge new content to existing StreamObject

861 merged_data = existing_content.get_data() + b"\n" + new_content_data

862 new_stream = StreamObject()

863 new_stream.set_data(merged_data)

864 page[NameObject("/Contents")] = self._add_object(new_stream)

865 else:

866 # If no existing content, then we have an empty page.

867 # Create a new StreamObject in a new /Contents entry.

868 new_stream = StreamObject()

869 new_stream.set_data(new_content_data)

870 page[NameObject("/Contents")] = self._add_object(new_stream)

871

872 def _add_apstream_object(

873 self,

874 page: PageObject,

875 appearance_stream_obj: StreamObject,

876 object_name: str,

877 x_offset: float,

878 y_offset: float,

879 ) -> None:

880 """

881 Adds an appearance stream to the page content in the form of

882 an XObject.

883

884 Args:

885 page: The page to which to add the appearance stream.

886 appearance_stream_obj: The appearance stream.

887 object_name: The name of the appearance stream.

888 x_offset: The horizontal offset for the appearance stream.

889 y_offset: The vertical offset for the appearance stream.

890 """

891 # Prepare XObject resource dictionary on the page. This currently

892 # only deals with font resources, but can easily be adapted to also

893 # include other resources.

894 pg_res = cast(DictionaryObject, page[PG.RESOURCES])

895 if "/Resources" in appearance_stream_obj:

896 ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"])

897 # No need to check "if "/Font" in ap_stream_res", because the only reason this

898 # code runs would be if we are flattening form fields, and the associated code

899 # either adds a Font resource or no resource at all. This probably needs to

900 # change if we want to use this method to flatten markup annotations.

901 ap_stream_font_dict = cast(DictionaryObject, ap_stream_res["/Font"])

902 if "/Font" not in pg_res:

903 pg_res[NameObject("/Font")] = DictionaryObject()

904 pg_font_res = cast(DictionaryObject, pg_res["/Font"])

905 # Merge fonts from the appearance stream into the page's font resources

906 for font_name, font_ref in ap_stream_font_dict.items():

907 if font_name not in pg_font_res:

908 pg_font_res[font_name] = font_ref

909 # Always add the resolved stream object to the writer to get a new IndirectObject.

910 # This ensures we have a valid IndirectObject managed by *this* writer.

911 xobject_ref = self._add_object(appearance_stream_obj)

912 xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()

913 if "/XObject" not in pg_res:

914 pg_res[NameObject("/XObject")] = DictionaryObject()

915 pg_xo_res = cast(DictionaryObject, pg_res["/XObject"])

916 if xobject_name not in pg_xo_res:

917 pg_xo_res[xobject_name] = xobject_ref

918 else:

919 logger_warning(

920 f"XObject {xobject_name!r} already added to page resources. This might be an issue.",

921 __name__

922 )

923 xobject_cm = Transformation().translate(x_offset, y_offset)

924 xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()

925 self._merge_content_stream_to_page(page, xobject_drawing_commands)

926

# Empty flag value used as the default for ``flags`` below; the method only
# writes /Ff when ``flags`` is truthy.
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, list[PageObject], None],
    fields: Mapping[str, Union[str, list[str], tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Update the form field values for a given page from a fields dictionary.

    Copy field texts and values from fields to page.
    If the field links to a parent object, add the information to the parent.

    Args:
        page: `PageObject` - references **PDF writer's page** where the
            annotations and field data will be updated.
            `List[Pageobject]` - provides list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: If the document has no /AcroForm dictionary, or the
            /AcroForm has no /Fields entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)
    # Iterate through pages, update field values
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for p in page:
            if PG.ANNOTS in p:  # just to prevent warnings
                # auto_regenerate was already handled above, hence None here.
                self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
        return
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return
    for annotation in page[PG.ANNOTS]:  # type: ignore
        annotation = cast(DictionaryObject, annotation.get_object())
        if annotation.get("/Subtype", "") != "/Widget":
            continue
        # A widget carrying both /FT and /T is its own field dictionary;
        # otherwise the field data lives on the /Parent.
        if "/FT" in annotation and "/T" in annotation:
            parent_annotation = annotation
        else:
            parent_annotation = annotation.get(
                PG.PARENT, DictionaryObject()
            ).get_object()

        for field, value in fields.items():
            rectangle = cast(RectangleObject, annotation[AA.Rect])
            if not (
                self._get_qualified_field_name(parent_annotation) == field
                or parent_annotation.get("/T", None) == field
            ):
                continue
            # Reset for every matched field. Without this, a field type that
            # produces no appearance stream (e.g. /Sig) would either hit an
            # unbound local or reuse the previous field's stream when
            # flattening.
            appearance_stream_obj = None
            if (
                parent_annotation.get("/FT", None) == "/Ch"
                and "/I" in parent_annotation
            ):
                del parent_annotation["/I"]
            if flags:
                annotation[NameObject(FA.Ff)] = NumberObject(flags)
            # Set the field value
            if not (value is None and flatten):  # Only change values if given by user and not flattening.
                if isinstance(value, list):
                    lst = ArrayObject(TextStringObject(v) for v in value)
                    parent_annotation[NameObject(FA.V)] = lst
                elif isinstance(value, tuple):
                    annotation[NameObject(FA.V)] = TextStringObject(
                        value[0],
                    )
                else:
                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
            # Get or create the field's appearance stream object
            if parent_annotation.get(FA.FT) == "/Btn":
                # Checkbox button (no /FT found in Radio widgets);
                # We can find the associated appearance stream object
                # within the annotation.
                v = NameObject(value)
                ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if v not in normal_ap:
                    v = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(v)
                # Other cases will be updated through the for loop
                annotation[NameObject(AA.AS)] = v
                annotation[NameObject(FA.V)] = v
            elif (
                parent_annotation.get(FA.FT) == "/Tx"
                or parent_annotation.get(FA.FT) == "/Ch"
            ):
                # Textbox; we need to generate the appearance stream object
                if isinstance(value, tuple):
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation, value[1], value[2]
                    )
                else:
                    appearance_stream_obj = TextStreamAppearance.from_text_annotation(
                        acro_form, parent_annotation, annotation
                    )
                # Add the appearance stream object
                if AA.AP not in annotation:
                    annotation[NameObject(AA.AP)] = DictionaryObject(
                        {NameObject("/N"): self._add_object(appearance_stream_obj)}
                    )
                elif "/N" not in (ap := cast(DictionaryObject, annotation[AA.AP])):
                    ap[NameObject("/N")] = self._add_object(appearance_stream_obj)
                else:  # [/AP][/N] exists
                    # Overwrite the existing object slot so that references
                    # to the old appearance stream now resolve to the new one.
                    n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
                    self._objects[n - 1] = appearance_stream_obj
                    appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self)
            elif (
                annotation.get(FA.FT) == "/Sig"
            ):  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)
            if flatten and appearance_stream_obj is not None:
                self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])

1067

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> list[DictionaryObject]:
    """
    Scan page annotations for widget fields missing from the AcroForm
    ``/Fields`` array and register them there.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created on
    the document root when they do not exist yet.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached = []
    if page is None:
        for single_page in self.pages:
            reattached += self.reattach_fields(single_page)
        return reattached

    # Fetch /AcroForm, creating and registering an empty one if absent.
    try:
        acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    # Same for the /Fields array inside it.
    try:
        form_fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        form_fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = form_fields

    if "/Annots" not in page:
        return reattached
    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        already_listed = (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in form_fields
        )
        if already_listed:
            continue
        if not isinstance(entry, IndirectObject):
            # Direct annotation: register it so it gets an indirect reference.
            annotations[position] = self._add_object(widget)
        form_fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1117

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the reader document root to the writer and all sub-elements,
    including pages, threads, outlines,... For partial insertion, ``append``
    should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    Raises:
        PdfReadError: If flattening the page tree fails with an index error,
            or (in strict mode only) if more objects were collected than the
            reader's trailer ``/Size`` announces.

    """
    self._info_obj = None
    if self.incremental:
        # Mirror the reader's object table slot by slot (idnum = index + 1).
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for i in range(len(self._objects)):
            o = reader.get_object(i + 1)
            if o is not None:
                self._objects[i] = o.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    if len(self._objects) > cast(int, reader.trailer["/Size"]):
        size_message = (
            f"Object count {len(self._objects)} exceeds defined trailer size {reader.trailer['/Size']}"
        )
        if self.strict:
            raise PdfReadError(size_message)
        logger_warning(size_message, __name__)

    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            (obj.hash_bin() if obj is not None else 0) for obj in self._objects
        ]

    try:
        self._flatten()
    except IndexError as exc:
        # Keep the original failure attached as __cause__ for debugging.
        raise PdfReadError("Got index error while flattening.") from exc

    assert self.flattened_pages is not None
    for p in self.flattened_pages:
        self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
        if not self.incremental:
            p[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # Rebuild /Kids as a flat list of all pages.
        cast(DictionaryObject, self._pages.get_object())[
            NameObject("/Kids")
        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])

1170

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a whole document from a reader: its '/Root', '/Info' and '/ID'.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once per entry of the cloned page tree's
            ``/Kids`` array, after cloning is done. Its single parameter is
            the resolved page object.

    """
    self.clone_reader_document_root(reader)
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info as the baseline used to
            # detect later modifications.
            self._original_hash[
                self._info_obj.indirect_reference.idnum - 1
            ] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(cast(DictionaryObject, source_info.get_object()))
            self._info_obj = self._add_object(info_copy)
    # otherwise _info_obj stays None, as set by clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_node = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_node["/Kids"]):
            after_page_append(kid.get_object())

1218

def _compute_document_identifier(self) -> ByteStringObject:
    """Serialize the PDF body into memory and return its rolling checksum as a byte string."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1224

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    Uniqueness is the only goal; reproducibility is not required.
    On first write both identifiers are set to the same freshly computed
    value. If an ID already exists, its first element is kept and only the
    second one is recomputed. When resolving a file reference, two matching
    identifiers indicate the correct, unchanged file; a match on the first
    identifier only indicates a different version of the correct file.
    See §14.4 "File Identifiers".
    """
    existing = self._ID
    computed = self._compute_document_identifier()
    first = existing[0] if existing else computed
    self._ID = ArrayObject((first, computed))

1244

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: If ``algorithm`` is given but does not name a member of
            ``EncryptAlgorithm`` (after replacing ``-`` with ``_``).

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is not None:
        try:
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError as exc:
            # Chain the lookup failure so the traceback shows its origin.
            raise ValueError(f"Algorithm '{algorithm}' NOT supported") from exc
    else:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    # in case call `encrypt` again
    entry = self._encryption.write_entry(user_password, owner_password)
    if self._encrypt_entry:
        # replace old encrypt_entry, reusing its object slot
        assert self._encrypt_entry.indirect_reference is not None
        entry.indirect_reference = self._encrypt_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry

1303

def _resolve_links(self) -> None:
    """
    Re-point links recorded in ``_unresolved_links`` at their merged pages.

    A link is skipped when its old target page cannot be found or when that
    page has no entry in ``_merged_in_pages``.
    """
    for new_link, old_link in self._unresolved_links:
        old_page = old_link.find_referenced_page()
        new_page = self._merged_in_pages.get(old_page) if old_page else None
        if new_page is not None:
            new_link.patch_reference(self, new_page)

1316

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only if some object is new or modified. Otherwise
    the full body, cross-reference table and trailer are written.

    Args:
        stream: Destination stream; a warning is logged when it exposes a
            ``mode`` attribute that does not contain ``"b"``.

    """
    opened_as_text = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_as_text:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    self._resolve_links()

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    self._reader.stream.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1337

def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened (and closed) by this
        method, and the stream that was written to.

    Raises:
        ValueError: If ``stream`` is an empty string.

    """
    my_file = False

    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    if isinstance(stream, (str, Path)):
        stream = FileIO(stream, "wb")
        my_file = True

    try:
        self.write_stream(stream)
    finally:
        # A file opened here is always closed, even if writing fails, so the
        # handle does not leak. Caller-provided streams are left open.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream

1369

def list_objects_in_increment(self) -> list[IndirectObject]:
    """
    For analysis or debugging.
    List the objects that an incremental write would emit: those stored past
    the end of the original hash table, or whose binary hash differs from
    the recorded one.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline = self._original_hash
    baseline_size = len(baseline)
    changed = []
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        if index >= baseline_size or obj.hash_bin() != baseline[index]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1393

def _write_increment(self, stream: StreamType) -> None:
    """
    Append new/modified objects, a cross-reference stream and the
    ``startxref``/``%%EOF`` footer to ``stream``.

    An object is written when it lies beyond the original hash table or its
    binary hash differs from the recorded one. Consecutive object numbers
    are grouped into ``[first, count]`` runs for the ``/Index`` entry.
    """
    object_positions = {}
    object_blocks = []
    run_start = -1
    run_end = -2  # one past the last idnum of the current run
    baseline_size = len(self._original_hash)
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        if not (
            index >= baseline_size
            or obj.hash_bin() != self._original_hash[index]
        ):
            continue  # unchanged since the baseline was recorded
        idnum = index + 1
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        object_positions[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # Encryption is not operational here; the intended code was:
        #   if self._encryption and obj != self._encrypt_entry:
        #       obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: close the current run when idnum is not contiguous
        if idnum != run_end:
            if run_start > 0:
                object_blocks.append([run_start, run_end - run_start])
            run_start = idnum
        run_end = idnum + 1
    assert run_start > 0, "for pytest only"
    object_blocks.append([run_start, run_end - run_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    index_entries = [
        NumberObject(number) for block in object_blocks for number in block
    ]
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(index_entries),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    # Reference /Info only when it is new or changed versus the baseline.
    info = self._info
    if info is not None:
        info_index = info.indirect_reference.idnum - 1  # type: ignore
        if (
            info_index >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_index]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One row per written object, packed to match /W [1 4 1].
    rows = [struct.pack(b">BIB", 1, position, 0) for position in object_positions.values()]
    xr.set_data(b"".join(rows))
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1464

def _write_pdf_structure(self, stream: StreamType) -> tuple[list[int], list[int]]:
    """
    Write the header and every object of the document body to ``stream``.

    Returns:
        A tuple ``(object_positions, free_objects)``. ``object_positions``
        has one entry per object slot: the byte offset where it was written,
        or -1 for an empty slot. ``free_objects`` lists the idnums of empty
        slots, terminated by 0.

    """
    object_positions = []
    free_objects = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            object_positions.append(-1)
            free_objects.append(idnum)
            continue
        object_positions.append(stream.tell())
        stream.write(f"{idnum} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and obj != self._encrypt_entry:
            obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")
    free_objects.append(0)  # add 0 to loop in accordance with specification
    return object_positions, free_objects

1484

def _write_xref_table(
    self, stream: StreamType, object_positions: list[int], free_objects: list[int]
) -> int:
    """
    Write the classic cross-reference table to ``stream``.

    Args:
        stream: Destination stream.
        object_positions: Byte offset per object slot; values that are not
            positive mark free slots.
        free_objects: idnums of the free slots, consumed in order to chain
            each free entry to the next one.

    Returns:
        The stream position at which the table starts.

    """
    xref_location = stream.tell()
    stream.write(b"xref\n")
    stream.write(f"0 {len(self._objects) + 1}\n".encode())
    # Entry for object 0: head of the free list.
    stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
    next_free = 1
    for offset in object_positions:
        if offset <= 0:
            stream.write(f"{free_objects[next_free]:0>10} {1:0>5} f \n".encode())
            next_free += 1
            continue
        stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
    return xref_location

1500

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    ``/Size`` and ``/Root`` are always written; ``/Info``, ``/ID`` and
    ``/Encrypt`` only when the corresponding writer attribute is set.
    """
    stream.write(b"trailer\n")
    trailer = DictionaryObject()
    trailer[NameObject(TK.SIZE)] = NumberObject(len(self._objects) + 1)
    trailer[NameObject(TK.ROOT)] = self.root_object.indirect_reference
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1524

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, dict[Any, Any]]],
) -> None:
    # None removes /Info altogether.
    if value is None:
        self._info = None
        return
    # Otherwise replace the content: empty any existing dictionary first,
    # then add the new entries.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1552

def add_metadata(self, infos: dict[str, Any]) -> None:
    """
    Merge custom metadata entries into the document information dictionary.

    Every value is resolved (if it is a PDF object), converted with ``str``
    and stored as a PDF string object. The information dictionary is created
    if it does not exist yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    converted = {}
    for key, value in list(infos.items()):
        resolved = value.get_object() if isinstance(value, PdfObject) else value
        converted[NameObject(key)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1572

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
    ) -> None:
        """Mark every reference found in ``obj`` as used and redirect merged ones."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # map every duplicate reference to the first object with the same hash
    cnv_rev: dict[IndirectObject, IndirectObject] = {
        duplicate: kept
        for kept, duplicates in self._idnum_hash.values()
        for duplicate in duplicates
    }

    # replace reference to merged objects (this also records which objects
    # are referenced at all)
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # Honour the documented flag: previously orphans were always removed.
    if not remove_orphans:
        return

    # Objects referenced only from the trailer must never count as orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    info = self._info
    if info is not None:  # the document may have no /Info at all
        orphans[info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass

    # The encryption dictionary is only reachable through the trailer's
    # /Encrypt entry (see _write_trailer), so protect it explicitly.
    encrypt_ref = getattr(
        getattr(self, "_encrypt_entry", None), "indirect_reference", None
    )
    if encrypt_ref is not None:
        orphans[encrypt_ref.idnum - 1] = False

    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1647

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return an indirect reference to ``obj``, located by searching the
    writer's object list.

    Raises:
        ValueError: If ``obj`` is not in the object list (from ``list.index``).

    """
    position = self._objects.index(obj)
    ref = IndirectObject(position + 1, 0, self)
    assert ref.get_object() == obj
    return ref

1653

def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the catalog as a ``TreeObject``.

    An empty tree is created and registered when the catalog has no outline
    entry. An existing entry that is not a ``TreeObject`` is converted and
    replaces the original object.
    """
    if CO.OUTLINES not in self._root_object:
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    # Sanity check: the outline must be stored in the writer's object list.
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1672

def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    If the catalog has no threads entry yet, an empty array is created and
    stored there.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = threads
        return threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1691

@property
def threads(self) -> ArrayObject:
    """
    Read-only view of the document's thread list (see :meth:`get_threads_root`).

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    threads_root = self.get_threads_root()
    return threads_root

1703

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item (or a page, wrapped in a fit destination) into
    the outline tree.

    Args:
        page_destination: The outline item to insert. A ``PageObject`` is
            first wrapped in a ``Destination`` titled ``page #<number>``.
        parent: Outline item under which to insert; defaults to the
            outline root.
        before: Passed to ``insert_child`` as the reference sibling.
        is_open: Stored on the item; when True the parent counters are
            incremented on insertion.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward the placement arguments; they used to be dropped here, so
        # a page was always appended, open, to the outline root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1741

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Copy ``outline_item`` into a new ``TreeObject`` and insert it via
    :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference of the inserted outline item.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)

    # Disabled upstream as "code currently unreachable":
    #   if "/A" in outline_item:
    #       action = DictionaryObject()
    #       a_dict = cast(DictionaryObject, outline_item["/A"])
    #       for k, v in list(a_dict.items()):
    #           action[NameObject(str(k))] = v
    #       action_ref = self._add_object(action)
    #       outline_item_object[NameObject("/A")] = action_ref
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1764

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page number this outline item will point to.
            With ``None`` the item is created without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items.
        before: Forwarded to :meth:`add_outline_item_destination` as the
            reference sibling for insertion.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        # Re-dispatch with the positional arguments shifted by one.
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )

    action_ref = None
    if page_number is not None:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown page index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )
        goto_action = DictionaryObject(
            {
                NameObject(GoToActionArguments.D): dest.dest_array,
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
            }
        )
        action_ref = self._add_object(goto_action)

    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1842

def add_outline(self) -> None:
    """Placeholder that always raises :class:`NotImplementedError`."""
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1847

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destination array.

    The array returned by ``get_named_dest_root()`` is walked two entries
    at a time (name, value). The new pair is placed in front of the first
    existing name that compares greater than ``title``; when there is no
    such name it is appended at the end.

    Args:
        title: Name of the destination.
        destination: Destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Value first, then the name at the same index, so that the
            # name ends up directly in front of its value.
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

1861

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    The object's ``dest_array`` is added to the writer and filed in the
    named destination array under the object's ``/Title`` entry.

    Args:
        page_destination: Object providing ``dest_array`` and ``/Title``.

    Returns:
        The indirect reference of the added destination array.

    """
    dest_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_ref)
    return dest_ref

1872

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a ``/GoTo`` action for a page and register it under ``title``.

    The page is looked up by index in the ``/Kids`` entry of the writer's
    root pages object. The action's destination array is
    ``[page, FIT_H, 826]``.

    Args:
        title: Name of the destination; converted to a text string object
            when it is not one already.
        page_number: Index into the root ``/Kids`` array.

    Returns:
        The indirect reference of the added action dictionary.

    """
    pages_root = self.get_object(self._pages)
    target_page = pages_root[PagesAttributes.KIDS][page_number]  # type: ignore
    goto_action = DictionaryObject()
    goto_action.update(
        {
            NameObject(GoToActionArguments.D): ArrayObject(
                [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
            ),
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(goto_action)

    name_key = title if isinstance(title, TextStringObject) else TextStringObject(str(title))
    self.add_named_destination_array(name_key, action_ref)
    return action_ref

1895

def remove_links(self) -> None:
    """Remove links and annotations from every page of this output."""
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, everything)

1900

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    strip_page = self._remove_annots_from_page
    for current_page in self.pages:
        strip_page(current_page, subtypes)

1916

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete matching annotations from the ``/Annots`` array of one page.

    Args:
        page: Page (or reference to it) to process.
        subtypes: Annotation subtypes to delete; ``None`` deletes all.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    index = 0
    while index < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[index]
        annotation = cast(DictionaryObject, entry.get_object())
        keep = subtypes is not None and cast(str, annotation["/Subtype"]) not in subtypes
        if keep:
            index += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        # Do not advance: the next entry has moved into this position.
        del page[PG.ANNOTS][index]  # type:ignore

1934

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation flags (links, attachments, 3D objects, all annotations) are
    handled by ``_remove_annots_from_page``; only the first matching one is
    processed. All other flags are handled by filtering the operations of
    the page content stream and of the form XObjects found in its resources.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters so that a list containing the TEXT flag
            # honours ``font_ids`` just like a single flag does.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content stream operators that are dropped unconditionally.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        # Replaces (does not extend) the drawing operators selected above.
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: list[str],
        forms: list[str],
        text_filters: Optional[dict[str, Any]] = None
    ) -> None:
        # Delete the selected operations from ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None  # resource name set by the most recent ``Tf`` operator
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                # With text filters, text is only removed while one of the
                # listed fonts is the current font.
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: list[DictionaryObject]
    ) -> tuple[list[str], list[str]]:
        # Walk the XObjects of ``elt`` and return the names of the images
        # that were removed and of the forms that were found.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)
    # NOTE(review): the annotation says ``None`` but a pair of empty lists is
    # returned here; kept as is in case a caller relies on it.
    return [], []  # type: ignore[return-value]

2108

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` is treated as
            ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate each selected image type into the deletion flag of the
    # same name.
    selected = [
        ObjectDeletionFlag[kind]
        for kind in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES")
        if to_delete & ImageType[kind]
    ]
    flags = ObjectDeletionFlag.NONE
    for flag in selected:
        flags |= flag

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, flags)

2132

def remove_text(self, font_names: Optional[list[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    # Recursively walk an object tree and record, per normalized font name,
    # the dictionary keys under which a ``/Font`` object was found.
    def get_font_info(
        obj: Any,
        font_info: Optional[dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> dict[str, Any]:
        if font_info is None:
            font_info = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = font_info.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                get_font_info(obj[child_key], font_info, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                get_font_info(child, font_info)
        return font_info

    wanted = font_names if font_names else []

    for page in self.pages:
        resource_ids: list[Any] = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted:
            fonts_on_page = get_font_info(page.get("/Resources"))
            for name in wanted:
                if name in fonts_on_page:
                    resource_ids.extend(fonts_on_page[name]["resource_ids"])

        text_filters = {}
        if wanted:
            text_filters["font_ids"] = resource_ids
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2189

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
            The page is looked up in the ``/Kids`` entry of the writer's
            root pages object.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. The first three
            entries are used as numbers; an optional fourth entry is used
            as dash pattern. ``[2, 2, 2]`` is used if this argument is
            omitted.

    Raises:
        ValueError: ``rect`` is a string containing a non-numeric value.

    """
    page_link = self.get_object(self._pages)[PagesAttributes.KIDS][page_number]  # type: ignore
    page_ref = cast(dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # "[ xLL yLL xUR yUR ]": drop the brackets and read the four numbers.
        # A single number object cannot hold a rectangle.
        rect = RectangleObject(
            [float(value) for value in rect.strip().strip("[]").split()]
        )
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    uri_action = DictionaryObject()
    uri_action.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    link_annot = DictionaryObject()
    link_annot.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): uri_action,
        }
    )
    link_ref = self._add_object(link_annot)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(link_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([link_ref])

2256

# Layout names that _set_page_layout accepts without logging a warning.
_valid_layouts = (
    "/NoLayout", "/SinglePage", "/OneColumn",
    "/TwoColumnLeft", "/TwoColumnRight",
    "/TwoPageLeft", "/TwoPageRight",
)

2266

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the ``/PageLayout`` entry of the root object, or ``None`` if absent."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast("LayoutType", layout)

2272

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; a
    warning is logged (the value is still stored) when it is not one of
    the valid layouts.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Join with ", " so the message lists the names, as the page
            # mode setter does (the former expression produced a tuple repr).
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2307

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout by delegating to ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)

2335

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property, read through ``_get_page_layout`` and written
    through ``_set_page_layout``.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Same code path as set_page_layout().
    self._set_page_layout(layout)

2364

# Mode names that the page_mode setter accepts without logging a warning.
_valid_modes = (
    "/UseNone", "/UseOutlines", "/UseThumbs",
    "/FullScreen", "/UseOC", "/UseAttachments",
)

2373

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the ``/PageMode`` entry of the root object, or ``None`` if absent."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast("PagemodeType", mode)

2379

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property, read through ``_get_page_mode``.

    Assigning a value that is not a ``NameObject`` converts it to one; a
    warning is logged (the value is still stored) when it is not one of
    the valid modes.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2414

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    new_annot = cast(DictionaryObject, _pdf_objectify(annotation))
    new_annot[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # Internal link annotations need the correct object type for the
    # destination
    is_internal_link = new_annot.get("/Subtype") == "/Link" and "/Dest" in new_annot
    if is_internal_link:
        spec = cast(dict[Any, Any], new_annot[NameObject("/Dest")])
        link_name = NameObject("/LinkName")
        target_index = spec["target_page_index"]
        # I have no clue why this dict-hack is necessary
        link_fit = Fit(fit_type=spec["fit"], fit_args=dict(spec)["fit_args"])
        link_dest = Destination(link_name, target_index, link_fit)
        new_annot[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(new_annot))

    # A popup is also registered on its parent annotation.
    if new_annot.get("/Subtype") == "/Popup" and NameObject("/Parent") in new_annot:
        popup_parent = cast(DictionaryObject, new_annot["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = new_annot.indirect_reference

    return new_annot

2468

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` that is a ``NameObject`` is converted;
    otherwise the ``/D`` entry of its ``/A`` action, if any, is converted.

    Args:
        page: Page, or reference to the page, to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2495

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content of ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader`` or any
            object implementing ``seek`` and ``read``.

    Returns:
        The ``BytesIO`` copy and the reader's ``_encryption`` object (only
        for a ``PdfReader`` with a truthy ``_encryption``, otherwise
        ``None``).

    Raises:
        NotImplementedError: ``fileobj`` is none of the supported kinds.

    """
    # A string or Path is taken as the location of a file to read.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption = fileobj._encryption if fileobj._encryption else None
        source = fileobj.stream
        saved_position = source.tell()
        source.seek(0)
        copy = BytesIO(source.read())
        # reset the stream to its original location
        source.seek(saved_position)
        return copy, encryption

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2531

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, tuple[int, int], tuple[int, int, int], list[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        tuple[int, int],
        tuple[int, int, int],
        list[int],
        list[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    When ``outline_item`` is a tuple, list or ``PageRange`` it is taken as
    the ``pages`` argument and the following arguments are shifted by one
    position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    outline_title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # The page selection was passed in the outline_item slot.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2599

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, list[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    After the pages are copied, the named destinations, the outline, the
    annotations, the ``/AcroForm`` fields and the articles of the source
    are processed for the copied pages.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number. With ``None`` the pages
            are added with ``add_page`` instead of ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: The pages attribute is not configured properly

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, _encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        # NOTE(review): a tuple with more than three items reaches neither
        # branch and is iterated as is below.
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the idnum of each source page to its copy in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identifies that the exclusion is
            # only applicable to 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            position += 1
        # Remember the source page; its /Annots are read from it below.
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    # Copy one named destination of the reader into this writer when its
    # page is among the merged pages and its title is not yet registered.
    def _process_named_dests(dest: Any) -> None:
        arr = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            list[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            pass
        elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
            pass
        elif isinstance(dest["/Page"], int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[dest["/Page"]]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                # The page was not merged: skip the destination.
                pass
        elif dest["/Page"].indirect_reference.idnum in srcpages:
            arr[NumberObject(0)] = srcpages[
                dest["/Page"].indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Outline node that receives the imported outline: a new item pointing
    # to the first merged page, or the outline root.
    outline_item_typ: TreeObject
    if outline_item is not None:
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # First form: clone the reader's /AcroForm without its /Fields.
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # Object numbers in the reader mapped to object numbers in this writer.
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which mean the field has not be copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2789

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone an article thread, keeping only the beads whose page was copied.

    The source beads are walked through their ``/N`` links, starting at
    ``thread["/F"]``. For each bead whose ``/P`` page is resolved by
    ``_get_cloned_page`` a new bead is added to the writer, chained to the
    previous one through ``/N`` and ``/V``, and appended to the ``/B`` array
    of the cloned page. When the walk is back on the first source bead the
    new chain is closed into a ring.

    Args:
        thread: Thread dictionary coming from the reader.
        pages: Cloned pages, passed on to ``_get_cloned_page``.
        reader: Reader the thread belongs to.

    Returns:
        The added thread as an indirect reference.

    """
    # clone() is used to keep the link between reader and writer; "/F" is
    # left out because the bead chain is rebuilt below.
    writer_thread = thread.clone(self, force_duplicate=True, ignore_fields=("/F",))
    self.threads.append(writer_thread.indirect_reference)
    source_head = cast("DictionaryObject", thread["/F"])
    source_bead: Optional[DictionaryObject] = source_head
    last_bead: Optional[DictionaryObject] = None
    while source_bead is not None:
        source_page = cast("PageObject", source_bead["/P"])
        cloned_page = self._get_cloned_page(source_page, pages, reader)
        if cloned_page is not None:
            if last_bead is None:
                # First bead that is kept: it heads the new chain.
                registered = self._add_object(DictionaryObject())
                last_bead = cast("DictionaryObject", registered.get_object())
                head_bead = last_bead
                writer_thread[NameObject("/F")] = last_bead.indirect_reference
            else:
                # Later beads point back ("/V") to the bead kept before them.
                registered = self._add_object(
                    DictionaryObject(
                        {NameObject("/V"): last_bead.indirect_reference}
                    )
                )
                following = cast("DictionaryObject", registered.get_object())
                last_bead[NameObject("/N")] = following.indirect_reference
                last_bead = following
            last_bead[NameObject("/P")] = cloned_page
            last_bead[NameObject("/T")] = writer_thread.indirect_reference
            last_bead[NameObject("/R")] = source_bead["/R"]
            page_dict = cast("PageObject", cloned_page.get_object())
            if "/B" not in page_dict:
                page_dict[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", page_dict["/B"]).append(
                last_bead.indirect_reference
            )
        source_bead = cast("DictionaryObject", source_bead["/N"])
        if source_bead == source_head:
            # Back on the first source bead: close the ring and stop.
            last_bead[NameObject("/N")] = head_bead.indirect_reference  # type: ignore
            head_bead[NameObject("/V")] = last_bead.indirect_reference  # type: ignore
            source_bead = None
    assert writer_thread.indirect_reference is not None
    return writer_thread.indirect_reference

2854

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern searched in the thread title ("/I" -> "/Title")
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add the article threads whose title matches the given pattern.

    The ``/B`` entries of the original page of every value of ``pages``
    are inspected; the thread (``/T``) of each of them is added through
    ``_add_articles_thread`` unless its object number is already present
    in the translation table kept for ``reader``.

    Args:
        fltr: Compiled pattern, or a string that gets compiled; any other
            value is replaced by the empty pattern.
        pages: Cloned pages whose original pages are scanned.
        reader: Reader the pages come from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")
    for cloned in pages.values():
        for bead_ref in cloned.original_page.get("/B", ()):
            bead = bead_ref.get_object()
            if is_null_or_none(bead):
                continue
            thread = bead.get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # this thread has already been brought into the writer
                continue
            if fltr.search((thread.get("/I", {})).get("/Title", "")):
                self._add_articles_thread(thread, pages, reader)

2890

2891 def _get_cloned_page(

2892 self,

2893 page: Union[None, IndirectObject, PageObject, NullObject],

2894 pages: dict[int, PageObject],

2895 reader: PdfReader,

2896 ) -> Optional[IndirectObject]:

2897 if isinstance(page, NullObject):

2898 return None

2899 if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":

2900 _i = page.indirect_reference

2901 elif isinstance(page, IndirectObject):

2902 _i = page

2903 try:

2904 return pages[_i.idnum].indirect_reference # type: ignore

2905 except Exception:

2906 return None

2907

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, list[DictionaryObject], None],
    page: PageObject,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Clone the annotations that remain valid in the writer.

    An annotation with a destination (its ``/Dest`` entry, or the ``/D``
    entry of the ``/GoTo`` action of a link) is kept only if the
    destination is a name found in ``get_named_dest_root()`` or an array
    whose first element is resolved by ``_get_cloned_page``; in the array
    case the first element is replaced by the cloned page. Annotations
    without ``/Dest`` that are not ``/GoTo`` links are cloned as they are.

    Args:
        annots: The annotations, or an indirect reference to them.
        page: Not used by this method.
        pages: Cloned pages, passed on to ``_get_cloned_page``.
        reader: Reader the annotations come from.

    Returns:
        An ``ArrayObject`` with the annotations that were kept; it is
        empty when ``annots`` is ``None`` or not a list.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("list[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(
            f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.",
            __name__,
        )
        return kept
    for entry in annots:
        annot = cast("DictionaryObject", entry.get_object())
        is_goto_link = (
            annot["/Subtype"] == "/Link"
            and "/A" in annot
            and cast("DictionaryObject", annot["/A"])["/S"] == "/GoTo"
            and "/Dest" not in annot
        )
        if is_goto_link:
            # The destination lives in the action dictionary ("/A" -> "/D").
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            cloned_page = self._get_cloned_page(target[0], pages, reader)
            if cloned_page is not None:
                copy = annot.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", copy["/A"])[
                    NameObject("/D")
                ] = ArrayObject([cloned_page, *target[1:]])
                kept.append(self._add_object(copy))
            continue
        if "/Dest" not in annot:
            # Nothing to retarget: a plain clone is enough.
            kept.append(self._add_object(annot.clone(self)))
            continue
        target = annot["/Dest"]
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        cloned_page = self._get_cloned_page(target[0], pages, reader)
        if cloned_page is not None:
            copy = annot.clone(self, ignore_fields=("/Dest",))
            copy[NameObject("/Dest")] = ArrayObject([cloned_page, *target[1:]])
            kept.append(self._add_object(copy))
    return kept

2964

def _get_filtered_outline(
    self,
    node: Any,
    pages: dict[int, PageObject],
    reader: PdfReader,
) -> list[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node of type ``/Outlines``, or without ``/Title``, is treated as a
    container and the search goes on with its ``/First`` entry. Otherwise
    the node and its ``/Next`` siblings are converted with
    ``reader._build_outline_item``; an item is kept when its page is
    resolved by ``_get_cloned_page`` or when at least one of its children
    is kept. The kept children are stored in ``_filtered_children``.

    Args:
        node: Outline node to start from; ``None`` and null objects are
            treated as an empty dictionary.
        pages: Cloned pages, passed on to ``_get_cloned_page``.
        reader: Reader the outline comes from.

    Returns:
        A list of destination objects.

    """
    kept = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()
    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # Container level: descend into the first child, if there is one.
        node = node.get("/First", None)
        if node is not None:
            node = node.get_object()
            kept.extend(self._get_filtered_outline(node, pages, reader))
        return kept

    target: Union[None, IndirectObject, NullObject]
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        target = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        if target is None:
            # the page is not part of the copied set
            target = NullObject()
        item[NameObject("/Page")] = target
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item._filtered_children) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept

3016

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create the outline item of the writer matching ``dest``.

    The title is always copied. When the page of ``dest`` is not a
    ``NullObject`` the target is copied too: the cloned ``/A`` action of
    the source node if it has one, the destination array otherwise.
    ``/F`` and ``/C`` are taken from the source node, defaulting to ``0``
    and to three ``0.0`` components.

    Args:
        dest: Destination describing the outline item to reproduce.

    Returns:
        The new outline item, already registered in the writer.

    """
    item = TreeObject()
    self._add_object(item)
    item[NameObject("/Title")] = TextStringObject(dest["/Title"])
    source_node = dest.node
    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            item[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        item[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        fallback_color = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        item[NameObject("/C")] = ArrayObject(source_node.get("/C", fallback_color))
    return item

3035

3036 def _insert_filtered_outline(

3037 self,

3038 outlines: list[Destination],

3039 parent: Union[TreeObject, IndirectObject],

3040 before: Union[None, TreeObject, IndirectObject] = None,

3041 ) -> None:

3042 for dest in outlines:

3043 # TODO: can be improved to keep A and SE entries (ignored for the moment)

3044 # with np=self.add_outline_item_destination(dest,parent,before)

3045 if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest:

3046 np = parent

3047 else:

3048 np = self._clone_outline(dest)

3049 cast(TreeObject, parent.get_object()).insert_child(np, before, self)

3050 self._insert_filtered_outline(dest._filtered_children, np, None)

3051

def close(self) -> None:
    """Do nothing; this method only exists for API harmonization."""
    return None

3055

def find_outline_item(
    self,
    outline_item: dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[list[int]]:
    """
    Search the outline for an item and return its position.

    The nodes reachable from ``root`` through ``/Next`` are visited in
    order and every ``/First`` sub-tree is searched recursively. A node
    matches when its indirect reference or its ``/Title`` equals
    ``outline_item``.

    Args:
        outline_item: Value compared with the reference and the title of
            each node.
        root: Node to start from; the outline root of the writer when
            ``None``.

    Returns:
        The sibling index of the match, preceded by the sibling index of
        each enclosing node that has a ``/Title``, or ``None`` if nothing
        matches.

    Raises:
        PyPdfError: If the walk reaches a ``None`` node, which is not
            expected to happen.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    position = 0
    while node is not None:
        same_reference = node.indirect_reference == outline_item
        if same_reference or node.get("/Title", None) == outline_item:
            return [position]
        if "/First" in node:
            below = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if below:
                # Nodes without a title (such as the root) add no index.
                path = [position] if "/Title" in node else []
                path.extend(below)
                return path
        if "/Next" not in node:
            return None
        position += 1
        node = cast(TreeObject, node["/Next"])
    raise PyPdfError("This line is theoretically unreachable.")  # pragma: no cover

3085

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: If ``reader`` is neither ``None``, a ``PdfReader`` nor
            an ``IndirectObject``.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # A reader without a table is silently accepted.
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # f-string so the message really shows the rejected value
        raise Exception(f"invalid parameter {reader}")

3113

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) is accepted as well.

    Raises:
        ValueError: If neither ``style`` nor ``prefix`` is given, if the
            page range is invalid, or if ``start`` is below 1 and not 0.

    """
    # The checks are ordered; only the first failing one is reported.
    problem: Optional[str] = None
    if style is None and prefix is None:
        problem = "At least one of style and prefix must be given"
    elif page_index_from < 0:
        problem = "page_index_from must be greater or equal than 0"
    elif page_index_to < page_index_from:
        problem = "page_index_to must be greater or equal than page_index_from"
    elif page_index_to >= len(self.pages):
        problem = "page_index_to exceeds number of pages"
    elif start is not None and start != 0 and start < 1:
        problem = "If given, start must be greater or equal than one"
    if problem is not None:
        raise ValueError(problem)

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3164

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` (the default) or ``None`` no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is accepted by the public wrapper and must be handled like
    # ``0``: NumberObject(None) cannot be built.
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # No label tree yet: create one starting with the default label.
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        # The pages after the range have no label of their own: give them
        # the default one so the new label stops at page_index_to.
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3229

3230 def _repr_mimebundle_(

3231 self,

3232 include: Union[None, Iterable[str]] = None,

3233 exclude: Union[None, Iterable[str]] = None,

3234 ) -> dict[str, Any]:

3235 """

3236 Integration into Jupyter Notebooks.

3237

3238 This method returns a dictionary that maps a mime-type to its

3239 representation.

3240

3241 .. seealso::

3242

3243 https://ipython.readthedocs.io/en/stable/config/integrating.html

3244 """

3245 pdf_data = BytesIO()

3246 self.write(pdf_data)

3247 data = {

3248 "application/pdf": pdf_data,

3249 }

3250

3251 if include is not None:

3252 # Filter representations based on include list

3253 data = {k: v for k, v in data.items() if k in include}

3254

3255 if exclude is not None:

3256 # Remove representations based on exclude list

3257 data = {k: v for k, v in data.items() if k not in exclude}

3258

3259 return data

3260

3261

def _pdf_objectify(obj: Union[dict[str, Any], str, float, list[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    Args:
        obj: Value to convert. PDF objects are returned unchanged;
            dictionaries and lists are converted recursively; a string
            starting with ``/`` becomes a name, any other string a text
            string; floats and ints become ``FloatObject``.

    Returns:
        The PDF object built from ``obj``.

    Raises:
        NotImplementedError: If the type of ``obj`` is not supported.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for key, value in obj.items():
            child = _pdf_objectify(value)
            converted[NameObject(key)] = child
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3281

3282

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item dictionary.

    Args:
        action_ref: Stored under ``/A`` unless it is ``None``.
        title: Stored under ``/Title``.
        color: Stored under ``/C`` when truthy; a string is first
            converted with ``hex_to_rgb``.
        italic: Adds ``OutlineFontFlag.italic`` to the ``/F`` entry.
        bold: Adds ``OutlineFontFlag.bold`` to the ``/F`` entry.

    Returns:
        The new outline item; ``/F`` is only present when ``italic`` or
        ``bold`` is set.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})
    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(c) for c in rgb])
        item.update({NameObject("/C"): components})
    if italic or bold:
        format_flag = 0
        for enabled, flag in (
            (italic, OutlineFontFlag.italic),
            (bold, OutlineFontFlag.bold),
        ):
            if enabled:
                format_flag += flag
        item.update({NameObject("/F"): NumberObject(format_flag)})
    return item

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 21%

1411 statements