Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import uuid

36from io import BytesIO, FileIO, IOBase

37from itertools import compress

38from pathlib import Path

39from types import TracebackType

40from typing import (

41 IO,

42 Any,

43 Callable,

44 Dict,

45 Iterable,

46 List,

47 Optional,

48 Pattern,

49 Tuple,

50 Type,

51 Union,

52 cast,

53)

55from ._cmap import _default_fonts_space_width, build_char_map_from_dict

56from ._doc_common import DocumentInformation, PdfDocCommon

57from ._encryption import EncryptAlgorithm, Encryption

58from ._page import PageObject, Transformation

59from ._page_labels import nums_clear_range, nums_insert, nums_next

60from ._reader import PdfReader

61from ._utils import (

62 StrByteType,

63 StreamType,

64 _get_max_pdf_version_header,

65 deprecate,

66 deprecate_no_replacement,

67 deprecation_with_replacement,

68 logger_warning,

69)

70from .constants import AnnotationDictionaryAttributes as AA

71from .constants import CatalogAttributes as CA

72from .constants import (

73 CatalogDictionary,

74 FileSpecificationDictionaryEntries,

75 GoToActionArguments,

76 ImageType,

77 InteractiveFormDictEntries,

78 OutlineFontFlag,

79 PageLabelStyle,

80 TypFitArguments,

81 UserAccessPermissions,

82)

83from .constants import Core as CO

84from .constants import FieldDictionaryAttributes as FA

85from .constants import PageAttributes as PG

86from .constants import PagesAttributes as PA

87from .constants import TrailerKeys as TK

88from .errors import PyPdfError

89from .generic import (

90 PAGE_FIT,

91 ArrayObject,

92 BooleanObject,

93 ByteStringObject,

94 ContentStream,

95 DecodedStreamObject,

96 Destination,

97 DictionaryObject,

98 Fit,

99 FloatObject,

100 IndirectObject,

101 NameObject,

102 NullObject,

103 NumberObject,

104 PdfObject,

105 RectangleObject,

106 StreamObject,

107 TextStringObject,

108 TreeObject,

109 ViewerPreferences,

110 create_string_object,

111 hex_to_rgb,

112 is_null_or_none,

113)

114from .pagerange import PageRange, PageRangeSpec

115from .types import (

116 AnnotationSubtype,

117 BorderArrayType,

118 LayoutType,

119 OutlineItemType,

120 OutlineType,

121 PagemodeType,

122)

123from .xmp import XmpInformation

124

125ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()

126DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12

127

128

class ObjectDeletionFlag(enum.IntFlag):
    """Bit flags naming categories of objects; members can be OR-ed together."""

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Composite mask: every image-related flag at once.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

140

141

142def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:

143 hash = hashlib.md5()

144 for block in iter(lambda: stream.read(blocksize), b""):

145 hash.update(block)

146 return hash.hexdigest()

147

148

149class PdfWriter(PdfDocCommon):

150 """

151 Write a PDF file out, given pages produced by another class or through

152 cloning a PDF file during initialization.

153

154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

155

156 Args:

157 clone_from: identical to fileobj (for compatibility)

158

159 incremental: If true, loads the document and set the PdfWriter in incremental mode.

160

161 When writing incrementally, the original document is written first and new/modified

162 content is appended. To be used for signed document/forms to keep signature valid.

163

164 full: If true, loads all the objects (always full if incremental = True).

165 This parameter may allow loading large PDFs.

166

167 """

168

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Initialise the writer state and optionally clone an existing document.

    Args:
        fileobj: A path, a stream or a ``PdfReader``. It is remembered in
            ``temp_fileobj`` and, when it designates non-empty content,
            used as the document to clone.
        clone_from: Document to clone. It is used when ``fileobj`` is
            ``""`` or ``None``, and takes precedence over a path/stream
            ``fileobj``.
        incremental: Start the writer in incremental mode.
        full: Take the same loading path as ``incremental``; the
            ``incremental`` attribute is reset to ``False`` afterwards
            when only ``full`` was requested.

    Raises:
        PyPdfError: In incremental/full mode, when ``fileobj`` is neither
            a path, a ``BytesIO`` nor a ``PdfReader``.
    """
    # ``full`` shares the incremental loading path; see the reset near the
    # end of this constructor.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Only declared here: assigned below in the non-incremental branch.
    # NOTE(review): in incremental mode it is presumably set while cloning
    # the reader - confirm.
    self._info_obj: Optional[PdfObject]
    """The PDF file's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if self.incremental:
        # Normalise path -> BytesIO -> PdfReader so that the previous
        # content stays reachable through the reader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        """Pick the document to clone, or None when there is nothing to clone."""
        if fileobj is None:
            # An explicit ``fileobj=None`` must not discard ``clone_from``
            # (previously None overwrote it and nothing was cloned).
            return clone_from
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A missing or empty file cannot be cloned; it is only a target.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        if isinstance(fileobj, (IOBase, BytesIO)):
            # Probe the stream size without disturbing its position.
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # Fresh document: an empty page tree root referenced by a catalog.
        pages = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject("/Pages"),
                NameObject(PA.COUNT): NumberObject(0),
                NameObject(PA.KIDS): ArrayObject(),
            }
        )
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    if full and not incremental:
        self.incremental = False
    if isinstance(self._ID, list):
        # Store both halves of the identifier as byte strings.
        for position in (0, 1):
            if isinstance(self._ID[position], TextStringObject):
                self._ID[position] = ByteStringObject(
                    self._ID[position].get_original_bytes()
                )

293

294 # for commonality

@property
def is_encrypted(self) -> bool:
    """
    Read-only flag kept for interface parity with the reader.

    This implementation always reports ``False``.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
    """
    encrypted = False
    return encrypted

304

@property
def root_object(self) -> DictionaryObject:
    """
    Expose the dictionary stored in ``_root_object``.

    Note:
        Recommended only for read access.

    """
    catalog = self._root_object
    return catalog

315

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfReader.

    Returns:
        The resolved /Info dictionary, or None when no info object is set.

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace the content of "/Info", or drop it when ``value`` is None.

    A non-None value is copied into the writer's own info dictionary
    (created on demand) instead of being referenced directly.
    """
    if value is not None:
        if self._info_obj is None:
            self._info_obj = self._add_object(DictionaryObject())
        info_dict = cast(DictionaryObject, self._info_obj.get_object())
        info_dict.clear()
        info_dict.update(cast(DictionaryObject, value.get_object()))
        return

    # Removal: blank the slot of the current info object (if any), then
    # forget the reference. Failures to locate the slot are ignored.
    try:
        self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
    except (KeyError, AttributeError):
        pass
    self._info_obj = None

345

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, read from the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Store ``value`` under "/Metadata" in the root object.

    Passing None removes an existing "/Metadata" entry.
    """
    if value is not None:
        self.root_object[NameObject("/Metadata")] = value
    elif "/Metadata" in self.root_object:
        del self.root_object["/Metadata"]

    # The read-back is kept from the original implementation.
    # NOTE(review): presumably it normalises the stored entry - confirm.
    return self.root_object.xmp_metadata  # type: ignore

361

@property
def with_as_usage(self) -> bool:
    """Deprecated accessor for the ``_with_as_usage`` flag."""
    deprecate_no_replacement("with_as_usage", "6.0")
    flag = self._with_as_usage
    return flag

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated mutator for the ``_with_as_usage`` flag."""
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value

371

def __enter__(self) -> "PdfWriter":
    """
    Re-initialise the writer for ``with`` usage and return it.

    The cloned flag and the file object given at construction survive the
    re-initialisation; the latter becomes the target used on exit.
    """
    was_cloned: bool = self._cloned
    target = self.temp_fileobj
    # Reset to a pristine state before restoring the remembered values.
    self.__init__()  # type: ignore
    self._cloned = was_cloned
    self._with_as_usage = True
    self.fileobj = target  # type: ignore
    return self

381

def __exit__(
    self,
    exc_type: Optional[Type[BaseException]],
    exc: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """
    Write the document to ``fileobj`` when leaving the ``with`` block.

    Nothing is written when no file object is set or when the writer was
    created by cloning. The exception arguments are not inspected.
    """
    if not self.fileobj or self._cloned:
        return
    self.write(self.fileobj)

391

@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    This should be something like ``'%PDF-1.5'``. It is recommended to set
    the lowest version that supports all features which are used within the
    PDF file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header as bytes, encoding it first when given as str."""
    self._header = (
        new_header.encode() if isinstance(new_header, str) else new_header
    )

410

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    Returns:
        The reference to ``obj``. An object that already belongs to this
        writer is not registered twice; its existing reference is returned.
    """
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf == self:
        return current_ref  # type: ignore

    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        raw_contents = obj.get(PG.CONTENTS, None)
        if isinstance(raw_contents, (ArrayObject, DictionaryObject)):
            # Direct contents are registered on their own first.
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])

    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the list length.
    new_ref = IndirectObject(len(self._objects), 0, self)
    obj.indirect_reference = new_ref
    return obj.indirect_reference

425

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored under the given object number or reference.

    Raises:
        ValueError: If the reference belongs to another document.
    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    # Object numbers are 1-based, the backing list is 0-based.
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj

438

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` into the slot designated by ``indirect_reference``.

    The generation number of the object previously stored there is kept.
    An ``obj`` that belongs to another document is cloned into this writer
    first.

    Returns:
        The object now stored in the slot (the clone, if one was made).

    Raises:
        ValueError: If the reference belongs to another document.
    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore

    incoming_ref = getattr(obj, "indirect_reference", None)
    if incoming_ref is not None and incoming_ref.pdf != self:  # type: ignore
        obj = obj.clone(self)

    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

459

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link the clone into the page tree.

    Args:
        page: The page to add; must be a ``PageObject`` typed as a page.
        index: Position handed to ``_get_page_in_node`` to locate the
            parent node and the insertion slot.
        excluded_keys: Keys not to copy; the parent key and
            "/StructParents" are always excluded in addition.

    Returns:
        The clone that was added (not the object passed in).

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If walking up the page tree exceeds 1000 levels.
    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    skip_keys = [*excluded_keys, PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        del self._id_translated[id(source_page.indirect_reference.pdf)][  # type: ignore
            source_page.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        pass
    page = cast(
        "PageObject", source_page.clone(self, False, skip_keys).get_object()
    )
    if source_page.pdf is not None:
        # The output header must be at least the source document's version.
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, source_header)
    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx >= 0:
        kids.insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)
    else:
        # A negative slot means "append at the end".
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)

    # Bump the page count of the parent node and of every ancestor.
    levels = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        levels += 1
        if levels > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page

506

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Sets the "NeedAppearances" flag in the PDF writer.

    The "NeedAppearances" flag indicates whether the appearance dictionary
    for form fields should be automatically generated by the PDF viewer or
    if the embedded appearance should be used.

    The AcroForm dictionary is created in the catalog when missing. Any
    error is reported as a warning instead of being raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in self._root_object:
            self._root_object[
                NameObject(CatalogDictionary.ACRO_FORM)
            ] = self._add_object(DictionaryObject())

        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
        flag_key = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form[flag_key] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

539

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ``ViewerPreferences`` object and register it in the catalog.

    An existing viewer-preferences entry of the root object is replaced.

    Returns:
        The newly created object.
    """
    preferences = ViewerPreferences()
    catalog_key = NameObject(CatalogDictionary.VIEWER_PREFERENCES)
    self._root_object[catalog_key] = self._add_object(preferences)
    return preferences

546

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys forwarded to the internal page-adding routine.

    Returns:
        The added PageObject.

    """
    assert self.flattened_pages is not None, "mypy"
    # Appending means inserting at the current page count.
    end_position = len(self.flattened_pages)
    return self._add_page(page, end_position, excluded_keys)

571

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or beyond the page
            count appends the page.
        excluded_keys: Keys forwarded to the internal page-adding routine.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative ``index`` reaches before the first page.

    """
    assert self.flattened_pages is not None, "mypy"
    page_count = len(self.flattened_pages)
    if index < 0:
        index += page_count
    if index < 0:
        raise ValueError("Invalid index value")
    if index < page_count:
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)

599

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page a reference points to.

    Args:
        indirect_reference: An object number, a reference, or a
            null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a page.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    if isinstance(indirect_reference, int):
        # A bare object number is taken as generation 0 of this writer.
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    target = indirect_reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

623

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Append a blank page to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Size resolution is delegated to the page factory.
    blank = PageObject.create_blank_page(self, width, height)
    return self.add_page(blank)

648

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page already exists at ``index``,
    both dimensions are taken from that page. Otherwise the size
    resolution is left to the page factory (size of the last page).

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the page object that is part of
        this document.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # The size of the page at ``index`` may only be borrowed when such a
    # page exists. The previous grouping
    # ``width is None or (height is None and ...)`` indexed ``self.pages``
    # whenever ``width`` was missing, even when no page existed at
    # ``index``, so the page factory's handling of a missing size was never
    # reached.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # Return what ``insert_page`` hands back: that is the page stored in
    # the document (consistent with ``add_blank_page``), whereas ``page``
    # is only the template it was created from.
    return self.insert_page(page, index)

682

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the open destination as provided by the parent class."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or clear the "/OpenAction" entry of the root object.

    Args:
        dest: None removes the entry; a string is stored as text; a
            ``Destination`` contributes its destination array; a page
            yields a page-fit destination pointing at that page.
    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return
    if isinstance(dest, str):
        self._root_object[NameObject("/OpenAction")] = TextStringObject(dest)
        return
    if isinstance(dest, Destination):
        self._root_object[NameObject("/OpenAction")] = dest.dest_array
        return
    if isinstance(dest, PageObject):
        # A page without a reference is represented by a null object.
        if dest.indirect_reference is not None:
            page_ref = dest.indirect_reference
        else:
            page_ref = NullObject()
        self._root_object[NameObject("/OpenAction")] = Destination(
            "Opening",
            page_ref,
            PAGE_FIT,
        ).dest_array

708

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    Each call registers one more script under a freshly generated name,
    so several scripts can coexist.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    # Names / JavaScript preferred to be able to add multiple scripts
    catalog = self._root_object
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_tree = cast(DictionaryObject, names["/JavaScript"])
    script_entries = cast(ArrayObject, js_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    script_entries.append(script_name)

    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    script_entries.append(self._add_object(action))

743

def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Args:
        filename: The filename to display.
        data: The data in the file. A string is encoded as latin-1.

    """
    # Three pieces are needed:
    # * the embedded file stream holding the data,
    # * the /Filespec dictionary pointing at that stream,
    # * the name entry in the Catalog pointing at the /Filespec.

    # 1) Embedded file stream, e.g.
    #    8 0 obj
    #    <<
    #    /Length 12
    #    /Type /EmbeddedFile
    #    >>
    #    stream
    #    Hello world!
    #    endstream
    #    endobj
    payload = data.encode("latin-1") if isinstance(data, str) else data
    file_entry = DecodedStreamObject()
    file_entry.set_data(payload)
    file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})

    # 2) File specification, e.g.
    #    7 0 obj
    #    <<
    #    /Type /Filespec
    #    /F (hello.txt)
    #    /EF << /F 8 0 R >>
    #    >>
    #    endobj
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): self._add_object(file_entry)})

    filespec = DictionaryObject()
    filespec.update(
        {
            NameObject(PA.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
                filename
            ),  # Perhaps also try TextStringObject
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # 3) Catalog entry referencing the file specification, e.g.
    #    1 0 obj
    #    <<
    #    /Type /Catalog
    #    /Outlines 2 0 R
    #    /Pages 3 0 R
    #    /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #    >>
    #    endobj
    if CA.NAMES not in self._root_object:
        self._root_object[NameObject(CA.NAMES)] = self._add_object(
            DictionaryObject()
        )
    names_dict = cast(DictionaryObject, self._root_object[CA.NAMES])
    if "/EmbeddedFiles" in names_dict:
        embedded_files = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
    else:
        embedded_files = DictionaryObject({NameObject(CA.NAMES): ArrayObject()})
        names_dict[NameObject("/EmbeddedFiles")] = self._add_object(embedded_files)
    # The name array holds (name, filespec) pairs laid out flat.
    name_entries = cast(ArrayObject, embedded_files[CA.NAMES])
    name_entries.extend([create_string_object(filename), filespec])

835

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Copy every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader object whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been added.
            Its single argument is the page as added to this writer.

    """
    source_pages = reader.pages
    notify = callable(after_page_append)
    for position in range(len(source_pages)):
        added_page = self.add_page(source_pages[position])
        if notify:
            after_page_append(added_page)

867

def _merge_content_stream_to_page(
    self,
    page: PageObject,
    new_content_data: bytes,
) -> None:
    """
    Add new content (as bytes) after the existing content of a page.

    The page's "/Contents" entry is updated in place; nothing is returned.

    Args:
        page: The page to which the new content data will be added.
        new_content_data: A binary-encoded new content stream, for
            instance the commands to draw an XObject.
    """
    contents_key = NameObject("/Contents")

    if contents_key not in page:
        # If no existing content, then we have an empty page.
        # Create a new StreamObject in a new /Contents entry.
        only_stream = StreamObject()
        only_stream.set_data(new_content_data)
        page[contents_key] = self._add_object(only_stream)
        return

    # Resolve the existing page content:
    # PDF Explained by John Whitington
    # https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
    current = page[contents_key].get_object()

    if isinstance(current, ArrayObject):
        # Array of streams: register the new data as one more element.
        extra_stream = StreamObject()
        extra_stream.set_data(new_content_data)
        current.append(self._add_object(extra_stream))
        page[contents_key] = self._add_object(current)
    elif isinstance(current, StreamObject):
        # Single stream: replace it by a stream holding old + new data.
        combined = StreamObject()
        combined.set_data(current.get_data() + b"\n" + new_content_data)
        page[contents_key] = self._add_object(combined)

907

def _add_apstream_object(
    self,
    page: PageObject,
    appearance_stream_obj: StreamObject,
    object_name: str,
    x_offset: float,
    y_offset: float,
    font_res: Optional[DictionaryObject] = None
) -> None:
    """
    Draw an appearance stream on a page by registering it as an XObject.

    The stream is added to the page resources under the name
    ``/Fm_<object_name>`` (sanitized) and a drawing command translated
    by the given offsets is merged into the page content.

    Args:
        page: The page to which to add the appearance stream.
        appearance_stream_obj: The appearance stream.
        object_name: The name of the appearance stream.
        x_offset: The horizontal offset for the appearance stream.
        y_offset: The vertical offset for the appearance stream.
        font_res: The appearance stream's font resource (if given).
    """
    resources = cast(DictionaryObject, page[PG.RESOURCES])

    if font_res is not None:
        # Make the font available on the page unless one of that name
        # is already registered.
        font_name = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
        if "/Font" not in resources:
            resources[NameObject("/Font")] = DictionaryObject()
        page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
        if font_name not in page_fonts:
            page_fonts[NameObject(font_name)] = font_res

    # Always add the stream object to the writer so that the reference
    # used below is managed by *this* writer.
    xobject_ref = self._add_object(appearance_stream_obj)
    xobject_name = NameObject(f"/Fm_{object_name}")._sanitize()
    if "/XObject" not in resources:
        resources[NameObject("/XObject")] = DictionaryObject()
    page_xobjects = cast(DictionaryObject, resources["/XObject"])
    if xobject_name in page_xobjects:
        # An existing entry of the same name is kept, not overwritten.
        logger_warning(
            f"XObject {xobject_name!r} already added to page resources. This might be an issue.",
            __name__
        )
    else:
        page_xobjects[xobject_name] = xobject_ref

    placement = Transformation().translate(x_offset, y_offset)
    draw_commands = f"q\n{placement._to_cm()}\n{xobject_name} Do\nQ".encode()
    self._merge_content_stream_to_page(page, draw_commands)

955

def _update_field_annotation(
    self,
    page: PageObject,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
    flatten: bool = False,
) -> None:
    """
    Regenerate the normal appearance (``/AP /N``) of a text or choice widget.

    Args:
        page: Page holding the widget; only used when ``flatten`` is set.
        field: Dictionary from which field flags, type and value are read.
        annotation: Widget annotation whose appearance stream is rebuilt.
        font_name: Font resource name to use. When empty, the name found in
            the default appearance string is used.
        font_size: Font size to use. A negative value means the size is read
            from the default appearance string; a resulting size of 0 is
            replaced by an automatically chosen one.
        flatten: When true, the new appearance stream is also handed to
            ``_add_apstream_object`` together with the widget position.

    """
    # Widget rectangle as stored, and a box of the same size at the origin
    widget_rect = cast(RectangleObject, annotation[AA.Rect])
    box = RectangleObject(
        (
            0,
            0,
            abs(widget_rect[2] - widget_rect[0]),
            abs(widget_rect[3] - widget_rect[1]),
        )
    )

    acro_form = cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM])

    # Default appearance string: inherited by the widget, else the AcroForm
    # one, else a built-in fallback
    da = annotation.get_inherited(AA.DA, acro_form.get(AA.DA, None))
    da = TextStringObject("/Helv 0 Tf 0 g") if da is None else da.get_object()
    tokens = [
        token
        for token in da.replace("\n", " ").replace("\r", " ").split(" ")
        if token != ""
    ]
    # The two tokens in front of the "Tf" operator are font name and size
    if font_name:
        tokens[tokens.index("Tf") - 2] = font_name
    else:
        font_name = tokens[tokens.index("Tf") - 2]
    if font_size >= 0:
        font_height = font_size
    else:
        font_height = float(tokens[tokens.index("Tf") - 1])
    if font_height == 0:
        # Size 0 requests automatic sizing
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = box.height - 2
    tokens[tokens.index("Tf") - 1] = str(font_height)
    da = " ".join(tokens)
    y_offset = box.height - 1 - font_height

    # Look the font up in the (inherited) /DR of the widget ...
    resources: Any = cast(
        DictionaryObject,
        annotation.get_inherited("/DR", acro_form.get("/DR", DictionaryObject())),
    ).get_object()
    fonts = resources.get("/Font", DictionaryObject()).get_object()
    # ... and, unless it is one of the standard fonts (the keys of
    # _default_fonts_space_width), fall back to the AcroForm /DR
    if font_name not in fonts and font_name not in _default_fonts_space_width:
        form_resources = cast(Dict[Any, Any], acro_form.get("/DR", {}))
        fonts = form_resources.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = fonts.get(font_name, None)

    # Reverse map: character -> bytes to emit in the content stream
    font_full_rev: Dict[str, bytes]
    if is_null_or_none(font_res):
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}
    else:
        font_res = cast(DictionaryObject, font_res.get_object())
        _, _, font_encoding, font_map = build_char_map_from_dict(200, font_res)
        font_map.pop(-1, None)  # remove width stored in -1 key
        if isinstance(font_encoding, str):
            font_full_rev = {
                char: code.encode(font_encoding) for code, char in font_map.items()
            }
        else:
            font_encoding_rev = {
                char: bytes((code,)) for code, char in font_encoding.items()
            }
            font_full_rev = dict(font_encoding_rev)
            for mapped_from, mapped_to in font_map.items():
                font_full_rev[mapped_to] = font_encoding_rev.get(
                    mapped_from, mapped_from
                )

    # Field text and selected values
    field_flags = field.get(FA.Ff, 0)
    if field.get(FA.FT, "/Tx") == "/Ch" and (field_flags & FA.FfBits.Combo) == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape backslash and parentheses (PDF 1.7 reference, table 3.2,
    # Literal Strings); the backslash goes first so later escapes stay intact
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, box, font_height, y_offset
    )

    # Form XObject carrying the generated appearance
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): box,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    if AA.AP in annotation:
        # Carry over the entries of the previous normal appearance, except
        # the ones that describe the stream itself
        stream_keys = {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}
        previous = cast(DictionaryObject, annotation[AA.AP]).get("/N", {})
        for entry_key, entry_value in previous.items():
            if entry_key not in stream_keys:
                dct[entry_key] = entry_value

    # Update Resources with font information if necessary
    if font_res is not None:
        font_ref = getattr(font_res, "indirect_reference", font_res)
        dct[NameObject("/Resources")] = DictionaryObject(
            {NameObject("/Font"): DictionaryObject({NameObject(font_name): font_ref})}
        )

    # Attach the stream, or swap it into the slot of the existing /N object
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        idnum = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[idnum - 1] = dct
        dct.indirect_reference = IndirectObject(idnum, 0, self)

    if flatten:
        field_name = self._get_qualified_field_name(annotation)
        self._add_apstream_object(
            page, dct, field_name, widget_rect[0], widget_rect[1], font_res
        )

1109

# "No flag" default for update_page_form_field_values
FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
    flatten: bool = False,
) -> None:
    """
    Write the values in ``fields`` into the matching form widgets of a page.

    A widget that does not carry both ``/FT`` and ``/T`` is treated as a kid
    of its ``/Parent``; the parent is then used for matching.

    Args:
        page: `PageObject` - the **PDF writer's page** whose annotations
            and field data are updated.
            `List[PageObject]` - list of pages to be processed.
            `None` - all pages.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

        flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
            appearance stream to the page contents. Note that this option does not remove the
            annotation itself.

    Raises:
        PyPdfError: The document has no ``/AcroForm`` dictionary, or the
            ``/AcroForm`` has no ``/Fields`` entry.

    """
    if CatalogDictionary.ACRO_FORM not in self._root_object:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)

    # Several pages: recurse page by page, leaving need_appearances alone
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for single_page in page:
            if PG.ANNOTS in single_page:  # just to prevent warnings
                self.update_page_form_field_values(
                    single_page, fields, flags, None, flatten=flatten
                )
        return

    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for annot_ref in page[PG.ANNOTS]:  # type: ignore
        widget = cast(DictionaryObject, annot_ref.get_object())
        if widget.get("/Subtype", "") != "/Widget":
            continue
        if "/FT" in widget and "/T" in widget:
            owner = widget
        else:
            owner = widget.get(PG.PARENT, DictionaryObject()).get_object()

        for field, value in fields.items():
            # Match on the qualified name or on the partial name (/T)
            if (
                self._get_qualified_field_name(owner) != field
                and owner.get("/T", None) != field
            ):
                continue
            if owner.get("/FT", None) == "/Ch" and "/I" in owner:
                del owner["/I"]
            if flags:
                widget[NameObject(FA.Ff)] = NumberObject(flags)
            # Only change values if given by user and not flattening.
            if value is not None or not flatten:
                if isinstance(value, list):
                    owner[NameObject(FA.V)] = ArrayObject(
                        TextStringObject(item) for item in value
                    )
                elif isinstance(value, tuple):
                    widget[NameObject(FA.V)] = TextStringObject(value[0])
                else:
                    owner[NameObject(FA.V)] = TextStringObject(value)

            field_type = owner.get(FA.FT)
            if field_type == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                state = NameObject(value)
                ap = cast(DictionaryObject, widget[NameObject(AA.AP)])
                normal_ap = cast(DictionaryObject, ap["/N"])
                if state not in normal_ap:
                    state = NameObject("/Off")
                appearance_stream_obj = normal_ap.get(state)
                # other cases will be updated through the for loop
                widget[NameObject(AA.AS)] = state
                widget[NameObject(FA.V)] = state
                if flatten and appearance_stream_obj is not None:
                    # The existing appearance stream is reused as it is, so
                    # no font resources are passed along.
                    rct = cast(RectangleObject, widget[AA.Rect])
                    self._add_apstream_object(
                        page, appearance_stream_obj, field, rct[0], rct[1]
                    )
            elif field_type == "/Tx" or field_type == "/Ch":
                # textbox
                if isinstance(value, tuple):
                    self._update_field_annotation(
                        page, owner, widget, value[1], value[2], flatten=flatten
                    )
                else:
                    self._update_field_annotation(
                        page, owner, widget, flatten=flatten
                    )
            elif widget.get(FA.FT) == "/Sig":  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1233

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Look for orphan field widgets among the page annotations and
    reattach them to the ``/Fields`` array of the AcroForm.

    The ``/AcroForm`` dictionary and its ``/Fields`` array are created when
    they do not exist yet.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    reattached: List[DictionaryObject] = []
    if page is None:
        for single_page in self.pages:
            reattached.extend(self.reattach_fields(single_page))
        return reattached

    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached
    annotations = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annotations):
        was_indirect = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        if (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in fields
        ):
            continue  # already part of the fields array
        if not was_indirect:
            # A direct annotation is registered as an object of this writer
            # so that the fields array can reference it.
            annotations[position] = self._add_object(widget)
        fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1283

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Clone the document root of ``reader`` and everything below it
    (pages, threads, outlines, ...) into this writer. Consider ``append``
    for partial insertion.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None
    if self.incremental:
        # One slot per object number announced by the reader's trailer
        self._objects = [None] * (cast(int, reader.trailer["/Size"]) - 1)
        for idnum in range(1, len(self._objects) + 1):
            source_obj = reader.get_object(idnum)
            if source_obj is not None:
                self._objects[idnum - 1] = source_obj.replicate(self)
    else:
        self._objects.clear()
    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if obj is None else obj.hash_bin() for obj in self._objects
        ]
    self._flatten()
    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        self._replace_object(
            cast(IndirectObject, flat_page.indirect_reference).idnum, flat_page
        )
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages
    if not self.incremental:
        # All pages become direct kids of the root pages node
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )

1322

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF file reader: its '/Root', '/Info' and
    '/ID' sections.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once per page of the cloned document; its
            single parameter is the page object.

    """
    self.clone_reader_document_root(reader)

    # _info_obj was reset to None by clone_reader_document_root()
    source_info = reader._info
    if source_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, source_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned info dictionary as "original"
            info_index = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_index] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, source_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())

1370

def _compute_document_identifier(self) -> ByteStringObject:
    """Return a checksum over the serialized object structure as a byte string."""
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1376

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same
    value. If both identifiers match when a file reference is resolved, it
    is very likely that the correct and unchanged file has been found. If
    only the first identifier matches, a different version of the correct
    file has been found.
    see §14.4 "File Identifiers".
    """
    # An existing first identifier is kept; everything else is recomputed.
    current = self._compute_document_identifier()
    permanent = self._ID[0] if self._ID else current
    self._ID = ArrayObject((permanent, current))

1396

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: ``algorithm`` does not name a member of
            ``EncryptAlgorithm``.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256" is looked up as the member AES_256, and so on
        member_name = algorithm.replace("-", "_")
        try:
            alg = getattr(EncryptAlgorithm, member_name)
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
    entry = self._encryption.write_entry(user_password, owner_password)
    previous_entry = self._encrypt_entry
    if previous_entry:
        # `encrypt` was called before: the new entry takes over the object
        # slot of the old one
        assert previous_entry.indirect_reference is not None
        entry.indirect_reference = previous_entry.indirect_reference
        self._objects[entry.indirect_reference.idnum - 1] = entry
    else:
        self._add_object(entry)
    self._encrypt_entry = entry

1455

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document to ``stream``.

    In incremental mode the reader's stream is copied first and an increment
    is appended only when there are new or modified objects. Otherwise the
    object structure, the cross-reference table and the trailer are written.

    Args:
        stream: Destination; a warning is logged when it exposes a ``mode``
            that does not contain ``"b"``.

    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    # deprecated to be removed in pypdf 6.0.0 :
    # if not self._root:
    #   self._root = self._add_object(self._root_object)
    # self._sweep_indirect_references(self._root)

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    # Incremental update: original bytes first, then the increment (objects,
    # xref stream and startxref) when something changed
    self._reader.stream.seek(0)
    stream.write(self._reader.stream.read(-1))
    if self.list_objects_in_increment():
        self._write_increment(stream)

1479

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether the file was opened (and closed) by this
        method, and the stream that was written to.

    Raises:
        ValueError: ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    my_file = isinstance(stream, (str, Path))
    if my_file:
        stream = FileIO(stream, "wb")

    try:
        self.write_stream(stream)
    finally:
        # A file opened here is always closed here, including when writing
        # fails; otherwise the handle would leak.
        if my_file:
            stream.close()

    if not my_file:
        # Caller-owned streams stay open and are only flushed.
        stream.flush()

    return my_file, stream

1511

def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.
    List the new or modified objects that will be written in the increment.
    Deleted objects will not be freed but will become orphans.

    Returns:
        List of new or modified IndirectObjects

    """
    known_count = len(self._original_hash)
    changed: List[IndirectObject] = []
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        # New: no original hash recorded. Modified: hash differs.
        if index >= known_count or obj.hash_bin() != self._original_hash[index]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1535

def _write_increment(self, stream: StreamType) -> None:
    """
    Append the new or modified objects to ``stream``, followed by a
    cross-reference stream that chains to the reader's ``startxref``.

    Args:
        stream: Destination positioned at the end of the original file data.

    """
    object_positions = {}
    object_blocks = []  # [first idnum, count] runs for the /Index array
    current_start = -1
    current_stop = -2
    original_hash_count = len(self._original_hash)
    for index, obj in enumerate(self._objects):
        if obj is None:
            continue
        if (
            index < original_hash_count
            and obj.hash_bin() == self._original_hash[index]
        ):
            continue  # unchanged since loading
        idnum = index + 1
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        object_positions[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # encryption is not operational:
        #   if self._encryption and obj != self._encrypt_entry:
        #       obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: close the current run when idnum is not contiguous
        if idnum != current_stop:
            if current_start > 0:
                object_blocks.append([current_start, current_stop - current_start])
            current_start = idnum
        current_stop = idnum + 1
    assert current_start > 0, "for pytest only"
    object_blocks.append([current_start, current_stop - current_start])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())
    index_entries = [
        NumberObject(number) for block in object_blocks for number in block
    ]
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(index_entries),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }
    info = self._info
    if info is not None:
        info_index = info.indirect_reference.idnum - 1  # type: ignore
        # /Info is only listed when the info dictionary is new or modified
        if (
            info_index >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_index]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID
    xr = StreamObject.initialize_from_dictionary(init_data)
    # One record per written object, laid out as declared in /W (1, 4, 1)
    xr.set_data(
        b"".join(
            struct.pack(b">BIB", 1, position, 0)
            for position in object_positions.values()
        )
    )
    xr.write_to_stream(stream)
    stream.write(f"\nendobj\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1606

1607 def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:

1608 object_positions = []

1609 free_objects = []

1610 stream.write(self.pdf_header.encode() + b"\n")

1611 stream.write(b"%\xE2\xE3\xCF\xD3\n")

1612

1613 for idnum, obj in enumerate(self._objects, start=1):

1614 if obj is not None:

1615 object_positions.append(stream.tell())

1616 stream.write(f"{idnum} 0 obj\n".encode())

1617 if self._encryption and obj != self._encrypt_entry:

1618 obj = self._encryption.encrypt_object(obj, idnum, 0)

1619 obj.write_to_stream(stream)

1620 stream.write(b"\nendobj\n")

1621 else:

1622 object_positions.append(-1)

1623 free_objects.append(idnum)

1624 free_objects.append(0) # add 0 to loop in accordance with specification

1625 return object_positions, free_objects

1626

1627 def _write_xref_table(

1628 self, stream: StreamType, object_positions: List[int], free_objects: List[int]

1629 ) -> int:

1630 xref_location = stream.tell()

1631 stream.write(b"xref\n")

1632 stream.write(f"0 {len(self._objects) + 1}\n".encode())

1633 stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())

1634 free_idx = 1

1635 for offset in object_positions:

1636 if offset > 0:

1637 stream.write(f"{offset:0>10} {0:0>5} n \n".encode())

1638 else:

1639 stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())

1640 free_idx += 1

1641 return xref_location

1642

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Destination stream.
        xref_location: Offset written after ``startxref``.

    """
    stream.write(b"trailer\n")
    object_count = NumberObject(len(self._objects) + 1)
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): object_count,
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    # Optional entries are only added when the document has them
    if self._info is not None:
        trailer[NameObject(TK.INFO)] = self._info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1666

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: dict with the entries to be set. if None : remove the /Info entry from the pdf.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Replace, do not merge: existing entries are dropped first
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1694

def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every value is converted with ``str`` before it is stored; the info
    dictionary is created when the document has none yet.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())
    entries = {}
    for name, raw_value in list(infos.items()):
        resolved = raw_value.get_object() if isinstance(raw_value, PdfObject) else raw_value
        entries[NameObject(name)] = create_string_object(str(resolved))
    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(entries)

1714

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects. When false, objects
            that nothing references are left in place.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        """Mark every reference below ``obj`` as used and redirect merged ones."""
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    obj[k] = crossref[v]
            else:
                # Anything that is not a dictionary or an array is filtered
                # out at the top of replace_in_obj.
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            # Duplicate: remember its reference and drop the object itself
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        return

    # Root, info and ID are referenced from the trailer only, so they are
    # kept explicitly.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # A document may have no info dictionary at all.
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass
    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1789

def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated stub kept for compatibility.

    The method no longer sweeps anything: it only calls ``deprecate`` with
    a message stating that the method has been removed.

    Args:
        root: The root of the PDF object tree; not used.

    """
    deprecate(
        "_sweep_indirect_references has been removed, please report to dev team if this warning is observed",
    )

1824

def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated stub kept for compatibility.

    The method no longer resolves anything: it calls ``deprecate`` with a
    message stating that the method has been removed.

    Args:
        data: The `IndirectObject` to resolve; not used.

    Returns:
        A placeholder ``IndirectObject(0, 0, self)``.

    """
    deprecate(
        "_resolve_indirect_object has been removed, please report to dev team if this warning is observed",
    )
    placeholder = IndirectObject(0, 0, self)
    return placeholder

1857

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Build the indirect reference of an object already stored in this writer.

    Args:
        obj: An object present in ``self._objects``.

    Returns:
        An ``IndirectObject`` (generation 0) whose number is the 1-based
        position of ``obj`` in ``self._objects``.

    Raises:
        ValueError: If ``obj`` is not found in ``self._objects``
            (raised by ``index``).

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to the object we looked up.
    assert reference.get_object() == obj
    return reference

1863

def get_outline_root(self) -> TreeObject:
    """
    Return the outline tree of the catalog, creating it when missing.

    When the catalog has no ``CO.OUTLINES`` entry, an empty ``TreeObject``
    is registered in the writer and referenced from the catalog. When the
    existing entry is not a ``TreeObject``, it is wrapped in one which
    replaces the stored object.

    Returns:
        The outline root as a ``TreeObject``.

    """
    if CO.OUTLINES not in self._root_object:
        fresh_root = TreeObject()
        fresh_root.update({})
        fresh_ref = self._add_object(fresh_root)
        self._root_object[NameObject(CO.OUTLINES)] = fresh_ref
        return fresh_root

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        wrapped = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, wrapped)
        outline = wrapped
    object_number = self._objects.index(outline) + 1
    outline_ref = IndirectObject(object_number, 0, self)
    # Sanity check: the rebuilt reference must point at the outline root.
    assert outline_ref.get_object() == outline
    return outline

1882

def get_threads_root(self) -> ArrayObject:
    """
    Return the catalog's array of threads, creating an empty one if absent.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        new_threads = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = new_threads
        return new_threads
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1901

@property
def threads(self) -> ArrayObject:
    """
    Read-only access to the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Every element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    The value is obtained from :meth:`get_threads_root`.
    """
    threads_root = self.get_threads_root()
    return threads_root

1913

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the outline tree.

    When a page is given, it is first wrapped in a ``Destination`` titled
    ``"page #<n>"`` with the default fit, and that destination is inserted
    using the same ``parent``, ``before`` and ``is_open`` values.

    Args:
        page_destination: The outline item (or page) to insert; it is
            resolved with ``get_object()`` first.
        parent: Outline item under which to insert; the outline root
            (see :meth:`get_outline_root`) is used when ``None``.
        before: Sibling before which to insert; passed to ``insert_child``
            as its indirect reference, or ``None``.
        is_open: Stored on the item under ``/%is_open%``; when false, the
            parent counters are not incremented on insertion.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Wrap the page in a destination and retry, keeping the caller's
        # placement options (they were previously dropped here).
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        # A closed item must not change its ancestors' counters.
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1951

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then inserted with :meth:`add_outline_item_destination`.

    Args:
        outline_item: Mapping whose entries populate the new outline item.
        parent: Forwarded to :meth:`add_outline_item_destination`.
        before: Forwarded to :meth:`add_outline_item_destination`.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The indirect reference returned by
        :meth:`add_outline_item_destination`.

    """
    tree_item = TreeObject()
    tree_item.update(outline_item)
    # NOTE: special handling of an "/A" (action) entry - copying it into its
    # own indirect DictionaryObject - is currently disabled.
    return self.add_outline_item_destination(tree_item, parent, before, is_open)

1974

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a page object or an indirect reference. With ``None`` the
            item gets no ``/GoTo`` action.
        parent: A reference to a parent outline item to create nested
            outline items. The outline root is used when ``None``.
        before: Sibling outline item before which the new item is
            inserted (forwarded to :meth:`add_outline_item_destination`).
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Unknown index: keep the raw number as the target.
                page_ref = NumberObject(page_number)
        else:
            # Unsupported type: previously left ``page_ref`` unbound and
            # crashed with UnboundLocalError; treat it as "no reference".
            page_ref = None
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

2052

def add_outline(self) -> None:
    """
    Placeholder that always fails.

    Raises:
        NotImplementedError: Always; the message points to
            :meth:`add_outline_item`.

    """
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

2057

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Register a named destination in the flat name/destination list.

    The list returned by ``get_named_dest_root()`` alternates names and
    destinations. The new pair is inserted in front of the first name
    that compares greater than ``title``, or appended when there is none.

    Args:
        title: Name of the destination.
        destination: The destination array or a reference to it.

    """
    names = self.get_named_dest_root()
    # Names sit at even positions, their destinations right after them.
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            # Insert the value first, then the name at the same position,
            # so that the pair ends up as [name, destination].
            names.insert(pos, destination)
            names.insert(pos, TextStringObject(title))
            return
    names.extend([TextStringObject(title), destination])

2071

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register an existing destination object as a named destination.

    The object's ``dest_array`` is stored in the writer and registered
    under the object's ``/Title`` entry.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``/Title`` entry.

    Returns:
        The indirect reference of the stored destination array.

    """
    dest_array_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_array_ref)
    return dest_array_ref

2082

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination pointing at a page.

    A ``/GoTo`` action dictionary targeting the page with a ``FIT_H``
    view at coordinate 826 is stored in the writer and registered under
    ``title``.

    Args:
        title: Name of the destination.
        page_number: Index into the ``PA.KIDS`` array of the pages object.

    Returns:
        The indirect reference of the stored action dictionary.

    """
    kids = self.get_object(self._pages)[PA.KIDS]  # type: ignore
    target_page = kids[page_number]
    view = ArrayObject(
        [target_page, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    action = DictionaryObject()
    action.update(
        {
            NameObject(GoToActionArguments.D): view,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    action_ref = self._add_object(action)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, action_ref)
    return action_ref

2105

def remove_links(self) -> None:
    """Remove links and annotations from this output (every page, all annotations)."""
    everything = ObjectDeletionFlag.ALL_ANNOTATIONS
    for current_page in self.pages:
        self.remove_objects_from_page(current_page, everything)

2110

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    strip_annotations = self._remove_annots_from_page
    for current_page in self.pages:
        strip_annotations(current_page, subtypes)

2126

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of one page that match ``subtypes``.

    Args:
        page: The page (or a reference to it) to process.
        subtypes: Annotation subtypes to delete; ``None`` deletes every
            annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    pos = 0
    while pos < len(cast(ArrayObject, page[PG.ANNOTS])):
        entry = cast(ArrayObject, page[PG.ANNOTS])[pos]
        annotation = cast(DictionaryObject, entry.get_object())
        keep = subtypes is not None and cast(str, annotation["/Subtype"]) not in subtypes
        if keep:
            pos += 1
            continue
        if isinstance(entry, IndirectObject):
            self._objects[entry.idnum - 1] = NullObject()  # to reduce PDF size
        # The following entries shift down, so ``pos`` is not advanced.
        del page[PG.ANNOTS][pos]  # type:ignore

2144

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation-related flags (links, attachments, 3D objects, all
    annotations) are handled by deleting annotations and return
    immediately. The other flags are handled by rewriting the content
    streams of the page and of its form XObjects.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters: they were previously dropped here, which
            # made a filtered text removal delete all text.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators whose operations are to be dropped.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        """Drop the matching operations of ``content`` in place."""
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # Remember the font in effect for the following text operators.
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    # Following operations shift down; do not advance ``i``.
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        """Clean the XObjects of ``elt``; return the image and form names found."""
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2317

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` value is treated
            as ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate the ImageType bits into the same-named ObjectDeletionFlag bits.
    deletion_flags = ObjectDeletionFlag.NONE
    for flag_name in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[flag_name]:
            deletion_flags |= ObjectDeletionFlag[flag_name]

    for current_page in self.pages:
        self.remove_objects_from_page(current_page, deletion_flags)

2341

def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """
    wanted_fonts = font_names if font_names else []

    # Recursively walk page objects to gather, per normalized font name,
    # the resource keys under which a font dictionary was found.
    def collect_fonts(
        obj: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted_fonts:
            page_fonts = collect_fonts(page.get("/Resources"))
            for wanted in wanted_fonts:
                if wanted in page_fonts:
                    resource_ids_to_remove.extend(page_fonts[wanted]["resource_ids"])

        text_filters = {}
        if wanted_fonts:
            text_filters["font_ids"] = resource_ids_to_remove
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2398

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. When omitted, the
            border array ``[2, 2, 2]`` is used.

    Raises:
        ValueError: If ``rect`` is a string containing a non-numeric value.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse "[ xLL yLL xUR yUR ]" into its numbers; the previous
        # ``NumberObject(rect)`` could not represent a rectangle.
        coordinates = [float(v) for v in rect.strip().strip("[]").split()]
        rect = RectangleObject(coordinates)  # type: ignore
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2465

# Layout names accepted without a warning by ``_set_page_layout``.
_valid_layouts = (
    "/NoLayout",
    "/SinglePage",
    "/OneColumn",
    "/TwoColumnLeft",
    "/TwoColumnRight",
    "/TwoPageLeft",
    "/TwoPageRight",
)

2475

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the catalog's ``/PageLayout`` entry, or ``None`` when it is absent."""
    try:
        stored_layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, stored_layout)

2481

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is converted to one; a
    warning listing the valid names is logged when it is not one of
    ``_valid_layouts``, but the value is stored regardless.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # ``', '.join`` - the former ``'', ''.join`` rendered a tuple
            # containing the names glued together without separator.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2516

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout (public wrapper around ``_set_page_layout``).

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    apply_layout = self._set_page_layout
    apply_layout(layout)

2544

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout of the document (``None`` when the catalog has no entry).

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same routine as set_page_layout().
    self._set_page_layout(layout)

2573

# Mode names accepted without a warning by the ``page_mode`` setter.
_valid_modes = (
    "/UseNone",
    "/UseOutlines",
    "/UseThumbs",
    "/FullScreen",
    "/UseOC",
    "/UseAttachments",
)

2582

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the catalog's ``/PageMode`` entry, or ``None`` when it is absent."""
    try:
        stored_mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, stored_mode)

2588

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode of the document (``None`` when the catalog has no entry).

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # Plain values are converted to NameObject; unknown names only trigger
    # a warning and are stored anyway.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode})

2623

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Add a single annotation to the page.
    The added annotation must be a new annotation.
    It cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: If ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    is_link_with_dest = to_add.get("/Subtype") == "/Link" and "/Dest" in to_add
    if is_link_with_dest:
        # Internal link annotations need the correct object type for the
        # destination
        raw_dest = cast(Dict[Any, Any], to_add[NameObject("/Dest")])
        # dict(...) is needed to read "fit_args" (reason not known)
        fit = Fit(fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"])
        link_dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            fit,
        )
        to_add[NameObject("/Dest")] = link_dest.dest_array

    page.annotations.append(self._add_object(to_add))

    is_popup_with_parent = (
        to_add.get("/Subtype") == "/Popup" and NameObject("/Parent") in to_add
    )
    if is_popup_with_parent:
        # Make the parent annotation point back at its popup.
        popup_parent = cast(DictionaryObject, to_add["/Parent"].get_object())
        popup_parent[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2677

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.
    Currently: convert NameObject named destination to TextStringObject
    (required for names/dests list)

    For each annotation, a ``/Dest`` given as a name is converted;
    otherwise the ``/D`` entry of its ``/A`` action, if any, is converted.

    Args:
        page: The page, or a reference to it, to clean.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        direct_dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(direct_dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(direct_dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2704

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the content behind ``fileobj`` into an in-memory stream.

    Args:
        fileobj: A path (``str`` or ``Path``), a ``PdfReader``, or an
            object providing ``seek`` and ``read``.

    Returns:
        A ``(stream, encryption)`` tuple: a ``BytesIO`` holding the whole
        content, and the reader's ``_encryption`` object when ``fileobj``
        is a ``PdfReader`` having one (``None`` otherwise).

    Raises:
        NotImplementedError: If ``fileobj`` is of none of the supported kinds.

    """
    # Paths: read the whole file.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    # Readers: copy their stream, then restore its position.
    if isinstance(fileobj, PdfReader):
        encryption = fileobj._encryption if fileobj._encryption else None
        source = fileobj.stream
        saved_position = source.tell()
        source.seek(0)
        copied = BytesIO(source.read())
        # reset the stream to its original location
        source.seek(saved_position)
        return copied, encryption

    # File-like objects: read from the start (position is not restored).
    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2740

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    When ``outline_item`` is a tuple, list or ``PageRange``, it is taken
    as the ``pages`` argument and no outline item is created; the
    following positional arguments are then shifted accordingly.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    title = outline_item
    if isinstance(outline_item, (tuple, list, PageRange)):
        # Positional call without a title: every argument is shifted by one.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        title = None

    self.merge(
        None,
        fileobj,
        title,
        pages,
        import_outline,
        excluded_fields,
    )

2808

def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    The selected pages are cloned first; named destinations, the outline,
    annotations, the ``/AcroForm`` fields and the article threads of the
    source document are then processed against the set of cloned pages.

    Args:
        position: The *page number* to insert this file. File will
            be inserted after the given number.
            With ``None`` the pages are added with ``add_page`` instead of
            ``insert_page``.
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
            A ``PdfDocCommon`` instance is used directly as the reader.
        outline_item: Optionally, you may specify a string to build an outline
            (aka 'bookmark') to identify the
            beginning of the included file.
        pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
            ``None`` selects every page of the source document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    Raises:
        TypeError: ``pages`` is neither ``None``, a ``PageRange``, a list
            nor a tuple.

    """
    if isinstance(fileobj, PdfDocCommon):
        reader = fileobj
    else:
        stream, encryption_obj = self._create_stream(fileobj)
        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()
    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        # NOTE: a tuple with more than three items reaches neither branch
        # and is iterated below as it is.
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the object number of each source page to its clone in this writer.
    srcpages = {}
    for page in pages:
        if isinstance(page, PageObject):
            pg = page
        else:
            pg = reader.pages[page]
        assert pg.indirect_reference is not None
        if position is None:
            # numbers in the exclude list identify that the exclusion is
            # only applicable to the 1st level of cloning
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
        else:
            srcpages[pg.indirect_reference.idnum] = self.insert_page(
                pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            # the next page goes right after the one just inserted
            position += 1
        # remember the source page; it is read again below for /Annots
        # and by add_filtered_articles for /B
        srcpages[pg.indirect_reference.idnum].original_page = pg

    reader._named_destinations = (
        reader.named_destinations
    )  # need for the outline processing below

    arr: Any

    def _process_named_dests(dest: Any) -> None:
        # Re-target one named destination of the source to the cloned page
        # and register it in this writer, unless it already exists or its
        # page is not part of the merged pages.
        arr = dest.dest_array
        if "/Names" in self._root_object and dest["/Title"] in cast(
            List[Any],
            cast(
                DictionaryObject,
                cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
            ).get("/Names", DictionaryObject()),
        ):
            # already exists: should not duplicate it
            pass
        elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
            pass
        elif isinstance(dest["/Page"], int):
            # the page reference is a page number normally not a PDF Reference
            # page numbers as int are normally accepted only in external goto
            try:
                p = reader.pages[dest["/Page"]]
            except IndexError:
                return
            assert p.indirect_reference is not None
            try:
                arr[NumberObject(0)] = NumberObject(
                    srcpages[p.indirect_reference.idnum].page_number
                )
                self.add_named_destination_array(dest["/Title"], arr)
            except KeyError:
                # the target page has not been merged
                pass
        elif dest["/Page"].indirect_reference.idnum in srcpages:
            arr[NumberObject(0)] = srcpages[
                dest["/Page"].indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    for dest in reader._named_destinations.values():
        _process_named_dests(dest)

    # Parent under which the imported outline is inserted: either a new
    # item titled ``outline_item`` pointing to the first merged page, or
    # the outline root of this writer.
    outline_item_typ: TreeObject
    if outline_item is not None:
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                next(iter(srcpages.values())).indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = reader.root_object
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO: use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", []), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
        if "/AcroForm" not in self._root_object:
            # clone the form dictionary without its fields; the fields
            # are collected separately below
            self._root_object[NameObject("/AcroForm")] = self._add_object(
                cast(
                    DictionaryObject,
                    reader.root_object["/AcroForm"],
                ).clone(self, False, ("/Fields",))
            )
            arr = ArrayObject()
        else:
            arr = cast(
                ArrayObject,
                cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
            )
        # source object number -> object number in this writer
        trslat = self._id_translated[id(reader)]
        try:
            for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                try:
                    ind = IndirectObject(trslat[f.idnum], 0, self)
                    if ind not in arr:
                        arr.append(ind)
                except KeyError:
                    # for trslat[] which means the field has not been copied
                    # through the page
                    pass
        except KeyError:  # for /Acroform or /Fields are not existing
            arr = self._add_object(ArrayObject())
        cast(DictionaryObject, self._root_object["/AcroForm"])[
            NameObject("/Fields")
        ] = arr

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)

2998

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The articles of the source thread are walked through their ``/N``
    links starting at ``/F``. For every article whose page (``/P``) has a
    clone in ``pages``, a new article dictionary is created in this writer
    and chained to the previous one with ``/N`` and ``/V``. The walk stops
    when it comes back to the first article.

    Args:
        thread: The thread dictionary of the source document.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The source document; passed on to ``_get_cloned_page``.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first kept article: it becomes /F of the new thread
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # following article: /V points back to the previous one,
                # and the previous one gets /N pointing forward
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            # register the new article in the /B array of the cloned page
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # back at the start: close the ring of new articles and stop
            # NOTE(review): assumes at least one article was kept;
            # otherwise new_article is None and new_first is unbound here
            new_article[NameObject("/N")] = new_first.indirect_reference  # type: ignore
            new_first[NameObject("/V")] = new_article.indirect_reference  # type: ignore
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

3063

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # regular expression (or its source) matched against thread titles
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every entry of the ``/B`` array of the original of each cloned page is
    inspected. Its thread (``/T``) is cloned when it has not been
    translated for ``reader`` yet and ``fltr`` finds a match in the
    ``/Title`` of the thread's ``/I`` dictionary.

    Args:
        fltr: Compiled pattern or pattern string; any other value is
            replaced by the empty pattern.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The source document the pages were cloned from.

    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for article in source_page.get("/B", ()):
            thread = article.get_object().get("/T")
            if thread is None:
                continue
            thread = thread.get_object()
            # skip threads that were already cloned for this reader
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if fltr.search(title):
                self._add_articles_thread(thread, pages, reader)

3096

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the clone of a source page, if there is one.

    Args:
        page: A page dictionary (``/Type`` equal to ``/Page``), an indirect
            reference, a ``NullObject`` or ``None``.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The source document (not used by this method).

    Returns:
        The indirect reference of the cloned page, or ``None`` when ``page``
        is of an unsupported kind, has no indirect reference, or is not
        part of ``pages``.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # None or any other object: nothing to look up
        return None
    if source_ref is None:
        return None
    cloned = pages.get(source_ref.idnum)
    if cloned is None:
        return None
    return cloned.indirect_reference  # type: ignore

3113

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations of a source page that remain valid in this writer.

    Annotations that are not ``/GoTo`` links without ``/Dest`` are cloned
    as they are. Annotations with a destination (``/Dest``, or ``/D`` of a
    ``/GoTo`` action) are kept only when the destination is a name known
    to this writer or an array whose first item is a page with a clone in
    ``pages``; in the latter case the page is replaced by its clone.

    Args:
        annots: The ``/Annots`` value of the source page, possibly indirect.
        page: The cloned page (not used by this method).
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The source document; passed on to ``_get_cloned_page``.

    Returns:
        An ``ArrayObject`` with the references of the cloned annotations;
        empty when ``annots`` is ``None`` or not a list.

    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return outlist
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return outlist
    for an in annots:
        ano = cast("DictionaryObject", an.get_object())
        if (
            ano["/Subtype"] != "/Link"
            or "/A" not in ano
            or cast("DictionaryObject", ano["/A"])["/S"] != "/GoTo"
            or "/Dest" in ano
        ):
            # anything but a /GoTo link action without /Dest
            if "/Dest" not in ano:
                outlist.append(self._add_object(ano.clone(self)))
            else:
                d = ano["/Dest"]
                if isinstance(d, str):
                    # it is a named dest
                    if str(d) in self.get_named_dest_root():
                        outlist.append(ano.clone(self).indirect_reference)
                else:
                    # explicit destination: the first item is the page
                    d = cast("ArrayObject", d)
                    p = self._get_cloned_page(d[0], pages, reader)
                    if p is not None:
                        anc = ano.clone(self, ignore_fields=("/Dest",))
                        anc[NameObject("/Dest")] = ArrayObject([p, *d[1:]])
                        outlist.append(self._add_object(anc))
        else:
            # /GoTo link action: the destination is stored in /A -> /D
            d = cast("DictionaryObject", ano["/A"]).get("/D", NullObject())
            if d is None or isinstance(d, NullObject):
                continue
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
            else:
                d = cast("ArrayObject", d)
                p = self._get_cloned_page(d[0], pages, reader)
                if p is not None:
                    anc = ano.clone(self, ignore_fields=("/D",))
                    cast("DictionaryObject", anc["/A"])[
                        NameObject("/D")
                    ] = ArrayObject([p, *d[1:]])
                    outlist.append(self._add_object(anc))
    return outlist

3170

def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    An item is kept when its page has a clone in ``pages`` or when at
    least one of its descendants is kept. The ``/Page`` of each item built
    is replaced by the reference of the cloned page (or a ``NullObject``)
    and the kept children are stored in its ``_filtered_children``.

    Args:
        node: The outline root or an outline item of the source document;
            ``None`` and null objects are treated as an empty dictionary.
        pages: Cloned pages, keyed by the object number of the source page.
        reader: The source document; used to build the outline items.

    Returns:
        A list of destination objects.

    """
    new_outline = []
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()
    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # not an item itself (e.g. the outline root): go down to /First
        node = node.get("/First", None)
        if node is not None:
            node = node.get_object()
            new_outline += self._get_filtered_outline(node, pages, reader)
    else:
        v: Union[None, IndirectObject, NullObject]
        # walk through the siblings using /Next
        while node is not None:
            node = node.get_object()
            o = cast("Destination", reader._build_outline_item(node))
            v = self._get_cloned_page(cast("PageObject", o["/Page"]), pages, reader)
            if v is None:
                v = NullObject()
            o[NameObject("/Page")] = v
            if "/First" in node:
                o._filtered_children = self._get_filtered_outline(
                    node["/First"], pages, reader
                )
            else:
                o._filtered_children = []
            if (
                not isinstance(o["/Page"], NullObject)
                or len(o._filtered_children) > 0
            ):
                new_outline.append(o)
            node = node.get("/Next", None)
    return new_outline

3222

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create an outline item in this writer from a filtered source item.

    The title is always copied. When the item still points to a page, the
    ``/A`` action of the source node is cloned if there is one; otherwise
    the destination array is used as ``/Dest``. ``/F`` and ``/C`` are taken
    from the source node (defaulting to ``0`` and black) when the item has
    a node.

    Args:
        dest: The outline item to clone.

    Returns:
        The new outline item, already registered in this writer.

    """
    cloned_item = TreeObject()
    self._add_object(cloned_item)
    cloned_item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    points_to_page = not isinstance(dest["/Page"], NullObject)
    if points_to_page:
        has_action = dest.node is not None and "/A" in dest.node
        if has_action:
            cloned_item[NameObject("/A")] = dest.node["/A"].clone(self)
        else:
            cloned_item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if dest.node is None:
        return cloned_item

    cloned_item[NameObject("/F")] = NumberObject(dest.node.get("/F", 0))
    black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
    cloned_item[NameObject("/C")] = ArrayObject(dest.node.get("/C", black))
    return cloned_item

3241

3242 def _insert_filtered_outline(

3243 self,

3244 outlines: List[Destination],

3245 parent: Union[TreeObject, IndirectObject],

3246 before: Union[None, TreeObject, IndirectObject] = None,

3247 ) -> None:

3248 for dest in outlines:

3249 # TODO: can be improved to keep A and SE entries (ignored for the moment)

3250 # with np=self.add_outline_item_destination(dest,parent,before)

3251 if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest:

3252 np = parent

3253 else:

3254 np = self._clone_outline(dest)

3255 cast(TreeObject, parent.get_object()).insert_child(np, before, self)

3256 self._insert_filtered_outline(dest._filtered_children, np, None)

3257

def close(self) -> None:
    """
    Do nothing.

    Implemented for API harmonization.
    """
    return None

3261

def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search the outline for an item and return the path leading to it.

    An item matches when its indirect reference or its ``/Title`` equals
    ``outline_item``. Siblings are followed through ``/Next`` and children
    through ``/First``.

    Args:
        outline_item: The reference or title to look for.
        root: The item to start from; the outline root of this writer is
            used when omitted.

    Returns:
        The list of sibling positions from the start item down to the
        match, or ``None`` when nothing matches. Ancestors without
        ``/Title`` do not contribute an index.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    position = 0
    while node is not None:
        matches = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if matches:
            return [position]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                own_index = [position] if "/Title" in node else []
                return own_index + sub_path
        if "/Next" not in node:
            return None
        position += 1
        node = cast(TreeObject, node["/Next"])
    return None

3290

def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    Report the deprecation of this method; no search is performed.

    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    deprecation_with_replacement("find_bookmark", "find_outline_item", "5.0.0")

3301

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: ``reader`` is of none of the supported types.

    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # nothing to do when this reader has no table
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        try:
            del self._id_translated[id(reader.pdf)]
        except Exception:
            # deliberately best effort: a missing table, or a document
            # that cannot be resolved from the reference, is ignored
            pass
    else:
        # the message was missing its f-string prefix and printed the
        # placeholder literally
        raise Exception(f"invalid parameter {reader}")

3329

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               ``0`` (the default) and ``None`` are accepted as
               "not given".

    Raises:
        ValueError: Neither style nor prefix is given, the page range is
            invalid, or ``start`` is lower than one.

    """
    if prefix is None and style is None:
        raise ValueError("At least one of style and prefix must be given")
    if page_index_from < 0:
        raise ValueError("page_index_from must be greater or equal than 0")
    if page_index_from > page_index_to:
        raise ValueError(
            "page_index_to must be greater or equal than page_index_from"
        )
    if len(self.pages) <= page_index_to:
        raise ValueError("page_index_to exceeds number of pages")
    start_given = not (start is None or start == 0)
    if start_given and start < 1:
        raise ValueError("If given, start must be greater or equal than one")

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3380

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done here; see :meth:`set_page_label`.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               With ``0`` (the default) or ``None`` no ``/St`` entry is
               written.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # ``None`` is accepted by set_page_label and means "not given", like 0;
    # it must not be turned into a number
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # no labels yet: create the number tree with a decimal label
        # starting at the first page
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    # unless a label already starts right after the range, or the range
    # reaches the last page, the pages behind it get the default label
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3445

3446 def _repr_mimebundle_(

3447 self,

3448 include: Union[None, Iterable[str]] = None,

3449 exclude: Union[None, Iterable[str]] = None,

3450 ) -> Dict[str, Any]:

3451 """

3452 Integration into Jupyter Notebooks.

3453

3454 This method returns a dictionary that maps a mime-type to its

3455 representation.

3456

3457 .. seealso::

3458

3459 https://ipython.readthedocs.io/en/stable/config/integrating.html

3460 """

3461 pdf_data = BytesIO()

3462 self.write(pdf_data)

3463 data = {

3464 "application/pdf": pdf_data,

3465 }

3466

3467 if include is not None:

3468 # Filter representations based on include list

3469 data = {k: v for k, v in data.items() if k in include}

3470

3471 if exclude is not None:

3472 # Remove representations based on exclude list

3473 data = {k: v for k, v in data.items() if k not in exclude}

3474

3475 return data

3476

3477

def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    ``PdfObject`` instances are returned unchanged. Dictionaries and lists
    are converted recursively, strings starting with ``/`` become names and
    other strings text strings, and numbers become ``FloatObject``.

    Raises:
        NotImplementedError: The type of ``obj`` is not supported.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(map(_pdf_objectify, obj))
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3497

3498

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the dictionary of a new outline item.

    Args:
        action_ref: Stored as ``/A`` when not ``None``.
        title: Stored as ``/Title``.
        color: Stored as ``/C``; a string is converted with ``hex_to_rgb``.
            Falsy values add no ``/C`` entry.
        italic: Adds the italic flag to ``/F``.
        bold: Adds the bold flag to ``/F``.

    Returns:
        The new outline item; it is not registered in any document here.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        components = ArrayObject([FloatObject(component) for component in rgb])
        item.update({NameObject("/C"): components})

    if italic or bold:
        # /F is only written when at least one flag is set
        font_flags = 0
        if italic:
            font_flags = font_flags + OutlineFontFlag.italic
        if bold:
            font_flags = font_flags + OutlineFontFlag.bold
        item.update({NameObject("/F"): NumberObject(font_flags)})
    return item

3528

3529

def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: RectangleObject,
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content stream showing the text of a form field.

    Args:
        txt: The text to show; ``\\n`` and ``\\r`` both separate lines.
        sel: Lines listed here get a box drawn around them.
        da: Operators emitted before the text and again after each box.
        font_full_rev: Maps a character to its encoded bytes; characters
            that are missing are encoded as UTF-16BE.
        rct: Rectangle of the field; only its width and height are used.
        font_height: Height of the font; the line pitch is 1.4 times it.
        y_offset: Vertical position of the first line.

    Returns:
        The content stream as bytes.

    """
    parts: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            parts.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            parts.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            parts.append(f"0 {- font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        encoded = b"".join(enc_line)
        if any(len(c) >= 2 for c in enc_line):
            # multi-byte codes: written as a hexadecimal string
            parts.append(b"<" + encoded.hex().encode() + b"> Tj\n")
        else:
            # Backslash and parentheses are special inside a literal string
            # and must be escaped, otherwise the operand is cut short or
            # left open. The backslash goes first so that the escapes
            # added for the parentheses are not escaped again.
            escaped = (
                encoded.replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            parts.append(b"(" + escaped + b") Tj\n")
    parts.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(parts)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%

1466 statements