Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

6# Redistribution and use in source and binary forms, with or without

7# modification, are permitted provided that the following conditions are

8# met:

10# * Redistributions of source code must retain the above copyright notice,

11# this list of conditions and the following disclaimer.

12# * Redistributions in binary form must reproduce the above copyright notice,

13# this list of conditions and the following disclaimer in the documentation

14# and/or other materials provided with the distribution.

15# * The name of the author may not be used to endorse or promote products

16# derived from this software without specific prior written permission.

17#

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

28# POSSIBILITY OF SUCH DAMAGE.

30import decimal

31import enum

32import hashlib

33import re

34import struct

35import uuid

36from io import BytesIO, FileIO, IOBase

37from itertools import compress

38from pathlib import Path

39from types import TracebackType

40from typing import (

41 IO,

42 Any,

43 Callable,

44 Dict,

45 Iterable,

46 List,

47 Optional,

48 Pattern,

49 Tuple,

50 Type,

51 Union,

52 cast,

53)

55from ._cmap import _default_fonts_space_width, build_char_map_from_dict

56from ._doc_common import DocumentInformation, PdfDocCommon

57from ._encryption import EncryptAlgorithm, Encryption

58from ._page import PageObject

59from ._page_labels import nums_clear_range, nums_insert, nums_next

60from ._reader import PdfReader

61from ._utils import (

62 StrByteType,

63 StreamType,

64 _get_max_pdf_version_header,

65 deprecate,

66 deprecate_no_replacement,

67 deprecation_with_replacement,

68 logger_warning,

69)

70from .constants import AnnotationDictionaryAttributes as AA

71from .constants import CatalogAttributes as CA

72from .constants import (

73 CatalogDictionary,

74 FileSpecificationDictionaryEntries,

75 GoToActionArguments,

76 ImageType,

77 InteractiveFormDictEntries,

78 OutlineFontFlag,

79 PageLabelStyle,

80 TypFitArguments,

81 UserAccessPermissions,

82)

83from .constants import Core as CO

84from .constants import FieldDictionaryAttributes as FA

85from .constants import PageAttributes as PG

86from .constants import PagesAttributes as PA

87from .constants import TrailerKeys as TK

88from .errors import PyPdfError

89from .generic import (

90 PAGE_FIT,

91 ArrayObject,

92 BooleanObject,

93 ByteStringObject,

94 ContentStream,

95 DecodedStreamObject,

96 Destination,

97 DictionaryObject,

98 Fit,

99 FloatObject,

100 IndirectObject,

101 NameObject,

102 NullObject,

103 NumberObject,

104 PdfObject,

105 RectangleObject,

106 StreamObject,

107 TextStringObject,

108 TreeObject,

109 ViewerPreferences,

110 create_string_object,

111 hex_to_rgb,

112 is_null_or_none,

113)

114from .pagerange import PageRange, PageRangeSpec

115from .types import (

116 AnnotationSubtype,

117 BorderArrayType,

118 LayoutType,

119 OutlineItemType,

120 OutlineType,

121 PagemodeType,

122)

123from .xmp import XmpInformation

124

# Value returned by UserAccessPermissions.all(); presumably every user access
# permission flag combined -- TODO confirm against the constants module.
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all()
# Font height used by _update_field_annotation for multiline text fields whose
# default appearance string specifies a font size of 0.
DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12

127

128

class ObjectDeletionFlag(enum.IntFlag):
    """
    Combinable bit flags, one per category of PDF content.

    Judging by the name, the flags select which kinds of objects are to be
    deleted; members can be OR-ed together.
    """

    NONE = 0
    TEXT = 1 << 0
    LINKS = 1 << 1
    ATTACHMENTS = 1 << 2
    OBJECTS_3D = 1 << 3
    ALL_ANNOTATIONS = 1 << 4
    XOBJECT_IMAGES = 1 << 5
    INLINE_IMAGES = 1 << 6
    DRAWING_IMAGES = 1 << 7
    # Convenience alias covering the three image-related flags.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES

140

141

142def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:

143 hash = hashlib.md5()

144 for block in iter(lambda: stream.read(blocksize), b""):

145 hash.update(block)

146 return hash.hexdigest()

147

148

149class PdfWriter(PdfDocCommon):

150 """

151 Write a PDF file out, given pages produced by another class or through

152 cloning a PDF file during initialization.

153

154 Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.

155

156 Args:

157 clone_from: identical to fileobj (for compatibility)

158

159 incremental: If true, loads the document and sets the PdfWriter in incremental mode.

160

161 When writing incrementally, the original document is written first and new/modified

162 content is appended. To be used for signed document/forms to keep signature valid.

163

164 full: If true, loads all the objects (always full if incremental = True).

165 This parameter may allow loading large PDFs.

166

167 """

168

def __init__(
    self,
    fileobj: Union[None, PdfReader, StrByteType, Path] = "",
    clone_from: Union[None, PdfReader, StrByteType, Path] = None,
    incremental: bool = False,
    full: bool = False,
) -> None:
    """
    Set up an empty writer or one cloned from an existing document.

    Args:
        fileobj: Path, stream or PdfReader. When ``incremental`` or ``full``
            is true it must resolve to a PdfReader (a str/Path is read
            completely into memory, a BytesIO is wrapped in a PdfReader).
            Otherwise it may be selected as the document to clone (see the
            nested ``_get_clone_from``). It is always remembered in
            ``temp_fileobj``.
        clone_from: Document to clone into this writer.
        incremental: Load the document in incremental mode.
        full: Goes through the same loading path as ``incremental``; when
            given without ``incremental`` the ``incremental`` attribute is
            reset to False once loading is done.

    Raises:
        PyPdfError: If ``incremental`` or ``full`` is true and ``fileobj`` is
            not a str, Path, BytesIO or PdfReader.

    """
    # `full` shares the incremental loading path below; the attribute is
    # switched back to False near the end when only `full` was requested.
    self.incremental = incremental or full
    """
    Returns if the PdfWriter object has been started in incremental mode.
    """

    self._objects: List[Optional[PdfObject]] = []
    """
    The indirect objects in the PDF.
    For the incremental case, it will be filled with None
    in clone_reader_document_root.
    """

    self._original_hash: List[int] = []
    """
    List of hashes after import; used to identify changes.
    """

    self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
    """
    Maps hash values of indirect objects to the list of IndirectObjects.
    This is used for compression.
    """

    self._id_translated: Dict[int, Dict[int, int]] = {}
    """List of already translated IDs.
    dict[id(pdf)][(idnum, generation)]
    """

    # Only declared here; assigned in the non-incremental branch below.
    # NOTE(review): in the incremental branch it is presumably set while
    # cloning the reader -- confirm in clone_document_from_reader.
    self._info_obj: Optional[PdfObject]
    """The PDF files's document information dictionary,
    the Info entry in the PDF file's trailer dictionary."""

    self._ID: Union[ArrayObject, None] = None
    """The PDF file identifier,
    defined by the ID in the PDF file's trailer dictionary."""

    if self.incremental:
        # Normalise the input step by step: path -> BytesIO -> PdfReader.
        if isinstance(fileobj, (str, Path)):
            with open(fileobj, "rb") as f:
                fileobj = BytesIO(f.read(-1))
        if isinstance(fileobj, BytesIO):
            fileobj = PdfReader(fileobj)
        if not isinstance(fileobj, PdfReader):
            raise PyPdfError("Invalid type for incremental mode")
        self._reader = fileobj  # prev content is in _reader.stream
        self._header = fileobj.pdf_header.encode()
        self._readonly = True  # TODO: to be analysed
    else:
        # Fresh document: default header plus an /Info dictionary naming
        # pypdf as the producer.
        self._header = b"%PDF-1.3"
        self._info_obj = self._add_object(
            DictionaryObject(
                {NameObject("/Producer"): create_string_object("pypdf")}
            )
        )

    def _get_clone_from(
        fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
        clone_from: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
    ) -> Union[None, PdfReader, str, Path, IO[Any], BytesIO]:
        # Decide which object (if any) the writer is cloned from.
        # A path/stream `fileobj` is ignored when it is the empty string or
        # when an explicit `clone_from` was supplied.
        if isinstance(fileobj, (str, Path, IO, BytesIO)) and (
            fileobj == "" or clone_from is not None
        ):
            return clone_from
        cloning = True
        # A path that does not exist or points to an empty file is treated
        # as an output target only.
        if isinstance(fileobj, (str, Path)) and (
            not Path(str(fileobj)).exists()
            or Path(str(fileobj)).stat().st_size == 0
        ):
            cloning = False
        # Same for an empty stream; the size is probed by seeking to the end
        # and the original position is restored afterwards.
        if isinstance(fileobj, (IOBase, BytesIO)):
            t = fileobj.tell()
            if fileobj.seek(0, 2) == 0:
                cloning = False
            fileobj.seek(t, 0)
        if cloning:
            clone_from = fileobj
        return clone_from

    clone_from = _get_clone_from(fileobj, clone_from)
    # To prevent overwriting
    self.temp_fileobj = fileobj
    self.fileobj = ""
    self._with_as_usage = False
    self._cloned = False
    # The root of our page tree node
    pages = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self.flattened_pages = []
    self._encryption: Optional[Encryption] = None
    self._encrypt_entry: Optional[DictionaryObject] = None

    if clone_from is not None:
        if not isinstance(clone_from, PdfReader):
            clone_from = PdfReader(clone_from)
        self.clone_document_from_reader(clone_from)
        self._cloned = True
    else:
        # Nothing to clone: register the empty page tree and a catalog
        # pointing to it.
        self._pages = self._add_object(pages)
        self._root_object = DictionaryObject(
            {
                NameObject(PA.TYPE): NameObject(CO.CATALOG),
                NameObject(CO.PAGES): self._pages,
            }
        )
        self._add_object(self._root_object)
    # `full` alone only wanted the loading path, not the incremental mode.
    if full and not incremental:
        self.incremental = False
    # Make sure both parts of the file identifier are byte strings.
    if isinstance(self._ID, list):
        if isinstance(self._ID[0], TextStringObject):
            self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
        if isinstance(self._ID[1], TextStringObject):
            self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())

293

294 # for commonality

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property telling whether this PDF file is encrypted.

    This implementation unconditionally reports ``False``.
    """
    return False

304

@property
def root_object(self) -> DictionaryObject:
    """
    Give direct access to the PDF structure through the stored root object.

    Note:
        Recommended only for read access.

    """
    catalog = self._root_object
    return catalog

315

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfReader.

    Returns:
        /Info Dictionary; None if the entry does not exist

    """
    if self._info_obj is None:
        return None
    return cast(DictionaryObject, self._info_obj.get_object())

@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace the content of "/Info", or remove the entry.

    Args:
        value: ``None`` drops the information dictionary (its slot in
            ``_objects`` is cleared on a best-effort basis). Otherwise the
            entries of the given dictionary (or of the dictionary an
            indirect reference resolves to) are copied into the writer's
            own information dictionary, which is created if missing.

    """
    if value is None:
        try:
            self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
        except (KeyError, AttributeError, IndexError):
            # Best effort: there may be no info object, no reference on it,
            # or a reference that no longer matches a slot in the list.
            pass
        self._info_obj = None
        return

    if self._info_obj is None:
        self._info_obj = self._add_object(DictionaryObject())
    obj = cast(DictionaryObject, self._info_obj.get_object())
    new_content = cast(DictionaryObject, value.get_object())
    if new_content is obj:
        # Assigning the current dictionary to itself: clearing it first
        # would wipe the very entries that are about to be copied.
        return
    obj.clear()
    obj.update(new_content)

345

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data, as exposed by the root object."""
    metadata = self.root_object.xmp_metadata
    return cast(XmpInformation, metadata)

@xmp_metadata.setter
def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
    """
    Store XMP data under "/Metadata" of the root object.

    Passing ``None`` removes an existing "/Metadata" entry.
    """
    catalog = self.root_object
    if value is not None:
        catalog[NameObject("/Metadata")] = value
    elif "/Metadata" in catalog:
        del catalog["/Metadata"]

    # The root object's property is still read after the update, exactly as
    # before; the value itself is discarded by the property protocol.
    return catalog.xmp_metadata  # type: ignore

361

@property
def with_as_usage(self) -> bool:
    """Deprecated accessor for ``_with_as_usage``; reports the deprecation first."""
    deprecate_no_replacement("with_as_usage", "6.0")
    current = self._with_as_usage
    return current

@with_as_usage.setter
def with_as_usage(self, value: bool) -> None:
    """Deprecated mutator for ``_with_as_usage``; reports the deprecation first."""
    deprecate_no_replacement("with_as_usage", "6.0")
    self._with_as_usage = value

371

372 def __enter__(self) -> "PdfWriter":

373 """Store how writer is initialized by 'with'."""

374 c: bool = self._cloned

375 t = self.temp_fileobj

376 self.__init__() # type: ignore

377 self._cloned = c

378 self._with_as_usage = True

379 self.fileobj = t # type: ignore

380 return self

381

382 def __exit__(

383 self,

384 exc_type: Optional[Type[BaseException]],

385 exc: Optional[BaseException],

386 traceback: Optional[TracebackType],

387 ) -> None:

388 """Write data to the fileobj."""

389 if self.fileobj and not self._cloned:

390 self.write(self.fileobj)

391

@property
def pdf_header(self) -> str:
    """
    Read/Write property of the PDF header that is written.

    The value should look like ``'%PDF-1.5'``. It is recommended to set the
    lowest version that supports all features which are used within the
    PDF file.

    Note: `pdf_header` returns a string but accepts bytes or str for writing
    """
    raw_header = self._header
    return raw_header.decode()

@pdf_header.setter
def pdf_header(self, new_header: Union[str, bytes]) -> None:
    """Store the header; a str is encoded to bytes, bytes are kept as given."""
    self._header = new_header.encode() if isinstance(new_header, str) else new_header

410

def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` as an indirect object of this writer.

    An object whose indirect reference already belongs to this writer is
    not added again. A directly embedded "/Contents" array or dictionary
    is registered first and replaced by its reference.

    Args:
        obj: The object to register.

    Returns:
        The indirect reference designating ``obj`` in this writer.

    """
    known_ref = getattr(obj, "indirect_reference", None)
    if known_ref is not None and known_ref.pdf == self:  # type: ignore
        return known_ref  # type: ignore
    # check for /Contents in Pages (/Contents in annotations are strings)
    if isinstance(obj, DictionaryObject):
        raw_contents = obj.get(PG.CONTENTS, None)
        if isinstance(raw_contents, (ArrayObject, DictionaryObject)):
            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    # Object numbers are 1-based: the new number equals the new list length.
    new_ref = IndirectObject(len(self._objects), 0, self)
    obj.indirect_reference = new_ref
    return new_ref

425

def get_object(
    self,
    indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
    """
    Return the object stored under an object number or indirect reference.

    Args:
        indirect_reference: A 1-based object number, or an IndirectObject
            that must belong to this writer.

    Returns:
        The stored object.

    Raises:
        ValueError: If the reference belongs to another document.

    """
    if isinstance(indirect_reference, int):
        idnum = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        idnum = indirect_reference.idnum
    obj = self._objects[idnum - 1]
    assert obj is not None, "mypy"
    return obj

438

def _replace_object(
    self,
    indirect_reference: Union[int, IndirectObject],
    obj: PdfObject,
) -> PdfObject:
    """
    Put ``obj`` in the slot designated by ``indirect_reference``.

    The generation number of the object previously stored in that slot is
    kept. An object referenced from another document is cloned into this
    writer first.

    Args:
        indirect_reference: A 1-based object number, or an IndirectObject
            that must belong to this writer.
        obj: The replacement object.

    Returns:
        The object now stored in the slot (the clone if one was made).

    Raises:
        ValueError: If the reference belongs to another document.

    """
    if isinstance(indirect_reference, IndirectObject):
        if indirect_reference.pdf != self:
            raise ValueError("PDF must be self")
        indirect_reference = indirect_reference.idnum
    slot = indirect_reference - 1
    gen = self._objects[slot].indirect_reference.generation  # type: ignore
    current_ref = getattr(obj, "indirect_reference", None)
    if current_ref is not None and current_ref.pdf != self:  # type: ignore
        obj = obj.clone(self)
    self._objects[slot] = obj
    obj.indirect_reference = IndirectObject(indirect_reference, gen, self)

    assert isinstance(obj, PdfObject), "mypy"
    return obj

459

def _add_page(
    self,
    page: PageObject,
    index: int,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone ``page`` into this writer and link the clone into the page tree.

    Args:
        page: The page to add; it must be a PageObject of type "/Page".
        index: Position handed to ``_get_page_in_node`` to locate the tree
            node and the slot receiving the page.
        excluded_keys: Keys not to copy while cloning; the parent key and
            "/StructParents" are always excluded.

    Returns:
        The cloned page that is now part of this writer.

    Raises:
        ValueError: If ``page`` is not a valid page object.
        PyPdfError: If more than 1000 ancestors are walked while updating
            the page counts.

    """
    if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
        raise ValueError("Invalid page object")
    assert self.flattened_pages is not None, "for mypy"
    source_page = page
    excluded_keys = [*excluded_keys, PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing on the same
    # page; therefore in order to add multiple copies of the same
    # page, we need to create a new dictionary for the page, however the
    # objects below (including content) are not duplicated:
    try:  # delete an already existing page
        source_ref = source_page.indirect_reference
        del self._id_translated[id(source_ref.pdf)][source_ref.idnum]  # type: ignore
    except Exception:
        pass
    page = cast(
        "PageObject", source_page.clone(self, False, excluded_keys).get_object()
    )
    if source_page.pdf is not None:
        # Keep the higher of the two PDF versions in the header.
        source_header = source_page.pdf.pdf_header
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, source_header)
    node, idx = self._get_page_in_node(index)
    page[NameObject(PA.PARENT)] = node.indirect_reference

    kids = cast(ArrayObject, node[PA.KIDS])
    if idx < 0:
        kids.append(page.indirect_reference)
        self.flattened_pages.append(page)
    else:
        kids.insert(idx, page.indirect_reference)
        self.flattened_pages.insert(index, page)

    # Increment the page count of the node and of each of its ancestors.
    levels = 0
    while not is_null_or_none(node):
        node = cast(DictionaryObject, node.get_object())
        node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
        node = node.get(PA.PARENT, None)  # type: ignore[assignment]  # TODO: Fix.
        levels += 1
        if levels > 1000:
            raise PyPdfError("Too many recursive calls!")
    return page

506

def set_need_appearances_writer(self, state: bool = True) -> None:
    """
    Set the "NeedAppearances" flag of the AcroForm dictionary.

    The flag tells a PDF viewer whether it should generate the appearance
    of form fields itself or use the embedded appearance. A missing
    AcroForm dictionary is created. Any failure is reported as a warning
    instead of being raised.

    Args:
        state: The actual value of the NeedAppearances flag.

    Returns:
        None

    """
    # See §12.7.2 and §7.7.2 for more information:
    # https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    try:
        catalog = self._root_object
        # get the AcroForm tree
        if CatalogDictionary.ACRO_FORM not in catalog:
            catalog[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(
                DictionaryObject()
            )

        acro_form = cast(DictionaryObject, catalog[CatalogDictionary.ACRO_FORM])
        flag_name = NameObject(InteractiveFormDictEntries.NeedAppearances)
        acro_form[flag_name] = BooleanObject(state)
    except Exception as exc:  # pragma: no cover
        logger_warning(
            f"set_need_appearances_writer({state}) catch : {exc}", __name__
        )

539

def create_viewer_preferences(self) -> ViewerPreferences:
    """
    Create a new ViewerPreferences object and register it in the catalog.

    Returns:
        The newly created ViewerPreferences object.

    """
    preferences = ViewerPreferences()
    preferences_ref = self._add_object(preferences)
    self._root_object[NameObject(CatalogDictionary.VIEWER_PREFERENCES)] = preferences_ref
    return preferences

546

def add_page(
    self,
    page: PageObject,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Append a page at the end of this PDF file.

    Recommended for advanced usage including the adequate excluded_keys.

    The page is usually acquired from a :class:`PdfReader<pypdf.PdfReader>`
    instance.

    Args:
        page: The page to add to the document. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`
        excluded_keys: Keys forwarded to the internal page-adding routine.

    Returns:
        The added PageObject.

    """
    known_pages = self.flattened_pages
    assert known_pages is not None, "mypy"
    end_position = len(known_pages)
    return self._add_page(page, end_position, excluded_keys)

571

def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<pypdf.PdfReader>` instance.

    Args:
        page: The page to add to the document.
        index: Position at which the page will be inserted. A negative
            value counts from the end; a value at or past the number of
            pages appends the page.
        excluded_keys: Keys forwarded to the internal page-adding routine.

    Returns:
        The added PageObject.

    Raises:
        ValueError: If a negative ``index`` reaches before the first page.

    """
    known_pages = self.flattened_pages
    assert known_pages is not None, "mypy"
    if index < 0:
        index += len(known_pages)
        if index < 0:
            raise ValueError("Invalid index value")
    if index < len(known_pages):
        return self._add_page(page, index, excluded_keys)
    return self.add_page(page, excluded_keys)

599

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number of the page a reference points to.

    Args:
        indirect_reference: An indirect reference, an object number of this
            writer (generation 0 is assumed), or a null/None value.

    Returns:
        The page number, or None when the reference is null/None or does
        not resolve to a PageObject.

    """
    # To provide same function as in PdfReader
    if is_null_or_none(indirect_reference):
        return None
    assert indirect_reference is not None, "mypy"
    reference = (
        IndirectObject(indirect_reference, 0, self)
        if isinstance(indirect_reference, int)
        else indirect_reference
    )
    target = reference.get_object()
    return target.page_number if isinstance(target, PageObject) else None

623

def add_blank_page(
    self, width: Optional[float] = None, height: Optional[float] = None
) -> PageObject:
    """
    Create a blank page, append it to this PDF file and return it.

    If no page size is specified, use the size of the last page.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.

    Returns:
        The newly appended page.

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    return self.add_page(PageObject.create_blank_page(self, width, height))

648

def insert_blank_page(
    self,
    width: Optional[Union[float, decimal.Decimal]] = None,
    height: Optional[Union[float, decimal.Decimal]] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and return it.

    If a dimension is missing and a page already exists at ``index``, the
    size of that page is used. Otherwise the choice of a size is left to
    ``PageObject.create_blank_page``.

    Args:
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default
            user space units.
        index: Position to add the page.

    Returns:
        The newly inserted page, i.e. the page object that is part of this
        document (as ``add_blank_page`` does).

    Raises:
        PageSizeNotDefinedError: if width and height are not defined
            and previous page does not exist.

    """
    # Only look at the page currently at `index` when it exists; without
    # this guard a missing width combined with an index at or past the end
    # indexed self.pages out of range instead of reaching the documented
    # fallback in create_blank_page.
    if (width is None or height is None) and index < self.get_num_pages():
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    # insert_page returns the page actually stored in the document.
    return self.insert_page(page, index)

682

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """Return the open destination as provided by the parent class."""
    return super().open_destination

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Set or remove the "/OpenAction" entry of the catalog.

    Args:
        dest: ``None`` removes the entry; a str is stored as a text string;
            a Destination contributes its destination array; a PageObject
            is wrapped in a page-fit destination named "Opening". Any other
            value leaves the catalog untouched.

    """
    if dest is None:
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        action = TextStringObject(dest)
    elif isinstance(dest, Destination):
        action = dest.dest_array
    elif isinstance(dest, PageObject):
        if dest.indirect_reference is not None:
            page_ref = dest.indirect_reference
        else:
            page_ref = NullObject()
        action = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        return
    self._root_object[NameObject("/OpenAction")] = action

708

def add_js(self, javascript: str) -> None:
    """
    Add JavaScript which will launch upon opening this PDF.

    The script is stored as a JavaScript action under the "/JavaScript"
    entry of the catalog's names dictionary, using a random UUID as name.

    Args:
        javascript: Your JavaScript.

    >>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.

    """
    catalog = self._root_object
    # Names / JavaScript preferred to be able to add multiple scripts
    if "/Names" not in catalog:
        catalog[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/JavaScript" not in names:
        names[NameObject("/JavaScript")] = DictionaryObject(
            {NameObject("/Names"): ArrayObject()}
        )
    js_tree = cast(DictionaryObject, names["/JavaScript"])
    js_list = cast(ArrayObject, js_tree["/Names"])

    # We need a name for parameterized JavaScript in the PDF file,
    # but it can be anything.
    script_name = create_string_object(str(uuid.uuid4()))
    action = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Action"),
            NameObject("/S"): NameObject("/JavaScript"),
            NameObject("/JS"): TextStringObject(f"{javascript}"),
        }
    )
    # The array holds (name, action) pairs.
    js_list.append(script_name)
    js_list.append(self._add_object(action))

743

def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
    """
    Embed a file inside the PDF.

    Reference:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
    Section 7.11.3

    Three pieces are created: the stream holding the data, the /Filespec
    dictionary pointing to it, and the (name, filespec) pair appended to the
    "/EmbeddedFiles" names array of the catalog.

    Args:
        filename: The filename to display.
        data: The data in the file. A str is encoded as Latin-1.

    """
    # 1. The embedded file stream, e.g.
    #    8 0 obj
    #    << /Length 12 /Type /EmbeddedFile >>
    #    stream
    #    Hello world!
    #    endstream
    #    endobj
    if isinstance(data, str):
        data = data.encode("latin-1")
    file_entry = DecodedStreamObject()
    file_entry.set_data(data)
    file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})

    # 2. The file specification, e.g.
    #    7 0 obj
    #    << /Type /Filespec /F (hello.txt) /EF << /F 8 0 R >> >>
    #    endobj
    ef_entry = DictionaryObject({NameObject("/F"): self._add_object(file_entry)})
    filespec = DictionaryObject(
        {
            NameObject(PA.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
                filename
            ),  # Perhaps also try TextStringObject
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # 3. The catalog entry referencing the file specification, e.g.
    #    1 0 obj
    #    << /Type /Catalog /Outlines 2 0 R /Pages 3 0 R
    #       /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> >>
    #    endobj
    catalog = self._root_object
    if CA.NAMES not in catalog:
        catalog[NameObject(CA.NAMES)] = self._add_object(DictionaryObject())
    names = cast(DictionaryObject, catalog[CA.NAMES])
    if "/EmbeddedFiles" in names:
        embedded_files = cast(DictionaryObject, names["/EmbeddedFiles"])
    else:
        embedded_files = DictionaryObject({NameObject(CA.NAMES): ArrayObject()})
        names[NameObject("/EmbeddedFiles")] = self._add_object(embedded_files)
    name_pairs = cast(ArrayObject, embedded_files[CA.NAMES])
    name_pairs.extend([create_string_object(filename), filespec])

835

def append_pages_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Append every page of ``reader`` to this writer, in order.

    ``append`` should be preferred.

    Args:
        reader: The PdfReader whose pages are copied.
        after_page_append:
            Optional callback invoked after each page has been appended.
            Its single argument is the page as added to this writer.
            A value that is not callable is ignored.

    """
    notify = callable(after_page_append)
    for page_index in range(len(reader.pages)):
        # Copy the page, then hand the writer's page to the callback.
        appended_page = self.add_page(reader.pages[page_index])
        if notify:
            after_page_append(appended_page)  # type: ignore[misc]

867

def _update_field_annotation(
    self,
    field: DictionaryObject,
    annotation: DictionaryObject,
    font_name: str = "",
    font_size: float = -1,
) -> None:
    """
    Build the appearance stream of a field annotation and store it as /AP /N.

    Args:
        field: The field dictionary; its flags, type and "/V" are read.
        annotation: The annotation dictionary; its rectangle, default
            appearance, resources and existing appearance are read, and
            its appearance entry is created or updated.
        font_name: When non-empty, replaces the font name found in the
            default appearance string.
        font_size: When >= 0, replaces the font size found in the default
            appearance string. A resulting size of 0 is turned into
            DEFAULT_FONT_HEIGHT_IN_MULTILINE for multiline fields and into
            the rectangle height minus 2 otherwise.

    """
    # Calculate rectangle dimensions
    # (the rectangle is moved to the origin; only width and height are kept)
    _rct = cast(RectangleObject, annotation[AA.Rect])
    rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1])))

    # Extract font information
    # The default appearance comes from the annotation (inherited lookup),
    # with the AcroForm entry as fallback value.
    da = annotation.get_inherited(
        AA.DA,
        cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get(
            AA.DA, None
        ),
    )
    if da is None:
        da = TextStringObject("/Helv 0 Tf 0 g")
    else:
        da = da.get_object()
    # Split the string into tokens; the two tokens preceding "Tf" are the
    # font name and the font size.
    font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
    font_properties = [x for x in font_properties if x != ""]
    if font_name:
        font_properties[font_properties.index("Tf") - 2] = font_name
    else:
        font_name = font_properties[font_properties.index("Tf") - 2]
    font_height = (
        font_size
        if font_size >= 0
        else float(font_properties[font_properties.index("Tf") - 1])
    )
    if font_height == 0:
        if field.get(FA.Ff, 0) & FA.FfBits.Multiline:
            font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE
        else:
            font_height = rct.height - 2
    # Write the effective size back and rebuild the string.
    font_properties[font_properties.index("Tf") - 1] = str(font_height)
    da = " ".join(font_properties)
    y_offset = rct.height - 1 - font_height

    # Retrieve font information from local DR ...
    dr: Any = cast(
        DictionaryObject,
        cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                cast(
                    DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
                ).get("/DR", DictionaryObject()),
            ),
        ).get_object(),
    )
    dr = dr.get("/Font", DictionaryObject()).get_object()
    # _default_fonts_space_width keys is the list of Standard fonts
    if font_name not in dr and font_name not in _default_fonts_space_width:
        # ...or AcroForm dictionary
        dr = cast(
            Dict[Any, Any],
            cast(
                DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]
            ).get("/DR", {}),
        )
        dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
    font_res = dr.get(font_name, None)
    if not is_null_or_none(font_res):
        font_res = cast(DictionaryObject, font_res.get_object())
        font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_res
        )
        try:  # remove width stored in -1 key
            del font_map[-1]
        except KeyError:
            pass
        # Reverse mapping handed to generate_appearance_stream; presumably
        # character -> bytes to emit -- TODO confirm against that function.
        font_full_rev: Dict[str, bytes]
        if isinstance(font_encoding, str):
            font_full_rev = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                font_full_rev[value] = font_encoding_rev.get(key, key)
    else:
        logger_warning(f"Font dictionary for {font_name} not found.", __name__)
        font_full_rev = {}

    # Retrieve field text and selected values
    field_flags = field.get(FA.Ff, 0)
    # A choice field without the Combo flag shows all its options, one per
    # line, together with the selected value(s).
    if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
        txt = "\n".join(annotation.get_inherited(FA.Opt, []))
        sel = field.get("/V", [])
        if not isinstance(sel, list):
            sel = [sel]
    else:  # /Tx
        txt = field.get("/V", "")
        sel = []
    # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
    # (backslashes first, so the ones added for parentheses are not doubled)
    txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
    # Generate appearance stream
    ap_stream = generate_appearance_stream(
        txt, sel, da, font_full_rev, rct, font_height, y_offset
    )

    # Create appearance dictionary
    dct = DecodedStreamObject.initialize_from_dictionary(
        {
            NameObject("/Type"): NameObject("/XObject"),
            NameObject("/Subtype"): NameObject("/Form"),
            NameObject("/BBox"): rct,
            "__streamdata__": ByteStringObject(ap_stream),
            "/Length": 0,
        }
    )
    # Carry over the entries of an existing normal appearance, except the
    # ones defined just above and the filter.
    if AA.AP in annotation:
        for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items():
            if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                dct[k] = v

    # Update Resources with font information if necessary
    # NOTE(review): this tests `is not None` whereas the lookup above used
    # is_null_or_none(); a null-object font entry would pass this test --
    # confirm whether that is intended.
    if font_res is not None:
        dct[NameObject("/Resources")] = DictionaryObject(
            {
                NameObject("/Font"): DictionaryObject(
                    {
                        NameObject(font_name): getattr(
                            font_res, "indirect_reference", font_res
                        )
                    }
                )
            }
        )
    if AA.AP not in annotation:
        annotation[NameObject(AA.AP)] = DictionaryObject(
            {NameObject("/N"): self._add_object(dct)}
        )
    elif "/N" not in cast(DictionaryObject, annotation[AA.AP]):
        cast(DictionaryObject, annotation[NameObject(AA.AP)])[
            NameObject("/N")
        ] = self._add_object(dct)
    else:  # [/AP][/N] exists
        # Reuse the object number of the existing appearance stream so that
        # references to it stay valid.
        n = annotation[AA.AP]["/N"].indirect_reference.idnum  # type: ignore
        self._objects[n - 1] = dct
        dct.indirect_reference = IndirectObject(n, 0, self)

1015

# Default for the ``flags`` argument of update_page_form_field_values:
# a field-flags value with no bit set, so the ``if flags:`` check there
# leaves the annotation's existing flags entry untouched.
FFBITS_NUL = FA.FfBits(0)

1017

def update_page_form_field_values(
    self,
    page: Union[PageObject, List[PageObject], None],
    fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
    flags: FA.FfBits = FFBITS_NUL,
    auto_regenerate: Optional[bool] = True,
) -> None:
    """
    Write new values into the form fields found on one or several pages.

    Every widget annotation of the page is matched against the entries of
    ``fields`` and the value is stored on the widget or, when the widget
    delegates to a parent field, on that parent.

    Args:
        page: `PageObject` - a page **owned by this writer** whose
            annotations and field data are updated.
            `List[PageObject]` - several pages, processed one by one.
            `None` - every page of the document.
        fields: a Python dictionary of:

            * field names (/T) as keys and text values (/V) as value
            * field names (/T) as keys and list of text values (/V) for
              multiple choice list
            * field names (/T) as keys and tuple of:
                * text values (/V)
                * font id (e.g. /F1, the font id must exist)
                * font size (0 for autosize)

        flags: A set of flags from
            :class:`~pypdf.constants.FieldDictionaryAttributes.FfBits`.

        auto_regenerate: Set/unset the need_appearances flag;
            the flag is unchanged if auto_regenerate is None.

    Raises:
        PyPdfError: the document has no /AcroForm dictionary, or the
            /AcroForm dictionary has no /Fields entry.

    """
    root = self._root_object
    if CatalogDictionary.ACRO_FORM not in root:
        raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
    acro_form = cast(DictionaryObject, root[CatalogDictionary.ACRO_FORM])
    if InteractiveFormDictEntries.Fields not in acro_form:
        raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object")
    if isinstance(auto_regenerate, bool):
        self.set_need_appearances_writer(auto_regenerate)

    # Several pages (or all of them): handle each page through a recursive
    # call; auto_regenerate=None keeps the flag set above untouched.
    if page is None:
        page = list(self.pages)
    if isinstance(page, list):
        for single_page in page:
            if PG.ANNOTS in single_page:  # skip silently, avoids the warning below
                self.update_page_form_field_values(single_page, fields, flags, None)
        return

    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for widget_ref in page[PG.ANNOTS]:  # type: ignore
        widget = cast(DictionaryObject, widget_ref.get_object())
        if widget.get("/Subtype", "") != "/Widget":
            continue

        # A widget carrying both /FT and /T is its own field; otherwise the
        # field data lives on the parent (an empty dictionary if there is none).
        if "/FT" in widget and "/T" in widget:
            owner = widget
        else:
            owner = widget.get(PG.PARENT, DictionaryObject()).get_object()

        for field_name, new_value in fields.items():
            if (
                self._get_qualified_field_name(owner) != field_name
                and owner.get("/T", None) != field_name
            ):
                continue

            if owner.get("/FT", None) == "/Ch" and "/I" in owner:
                del owner["/I"]
            if flags:
                widget[NameObject(FA.Ff)] = NumberObject(flags)

            # Store the value itself.
            if isinstance(new_value, list):
                owner[NameObject(FA.V)] = ArrayObject(
                    TextStringObject(item) for item in new_value
                )
            elif isinstance(new_value, tuple):
                widget[NameObject(FA.V)] = TextStringObject(new_value[0])
            else:
                owner[NameObject(FA.V)] = TextStringObject(new_value)

            # Refresh the visual state according to the field type.
            field_type = owner.get(FA.FT)
            if field_type == "/Btn":
                # Checkbox button (no /FT found in Radio widgets)
                state = NameObject(new_value)
                appearance = cast(DictionaryObject, widget[NameObject(AA.AP)])
                if state not in cast(ArrayObject, appearance[NameObject("/N")]):
                    state = NameObject("/Off")
                # other cases will be updated through the for loop
                widget[NameObject(AA.AS)] = state
                widget[NameObject(FA.V)] = state
            elif field_type == "/Tx" or field_type == "/Ch":
                # textbox or choice field
                if isinstance(new_value, tuple):
                    self._update_field_annotation(
                        owner, widget, new_value[1], new_value[2]
                    )
                else:
                    self._update_field_annotation(owner, widget)
            elif widget.get(FA.FT) == "/Sig":  # deprecated  # not implemented yet
                logger_warning("Signature forms not implemented yet", __name__)

1126

def reattach_fields(
    self, page: Optional[PageObject] = None
) -> List[DictionaryObject]:
    """
    Look for orphan form fields among the page annotations and attach
    them to the /Fields array of the /AcroForm dictionary.

    The /AcroForm dictionary and its /Fields array are created when they
    do not exist yet.

    Args:
        page: page to analyze.
            If none is provided, all pages will be analyzed.

    Returns:
        list of reattached fields.

    """
    if page is None:
        collected = []
        for each_page in self.pages:
            collected += self.reattach_fields(each_page)
        return collected

    reattached = []

    # Fetch (or create) the /AcroForm dictionary and its /Fields array.
    try:
        acro_form = cast(
            DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]
        )
    except KeyError:
        acro_form = DictionaryObject()
        self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = acro_form
    try:
        fields = cast(ArrayObject, acro_form[InteractiveFormDictEntries.Fields])
    except KeyError:
        fields = ArrayObject()
        acro_form[NameObject(InteractiveFormDictEntries.Fields)] = fields

    if "/Annots" not in page:
        return reattached

    annots = cast(ArrayObject, page["/Annots"])
    for position, entry in enumerate(annots):
        was_reference = isinstance(entry, IndirectObject)
        widget = cast(DictionaryObject, entry.get_object())
        if widget.get("/Subtype", "") != "/Widget" or "/FT" not in widget:
            continue
        if (
            "indirect_reference" in widget.__dict__
            and widget.indirect_reference in fields
        ):
            continue  # already listed in /Fields
        if not was_reference:
            # Register the inline widget and store its reference in /Annots.
            annots[position] = self._add_object(widget)
        fields.append(widget.indirect_reference)
        reattached.append(widget)
    return reattached

1176

def clone_reader_document_root(self, reader: PdfReader) -> None:
    """
    Copy the document root of ``reader``, with all its sub-elements
    (pages, threads, outlines,...), into this writer.

    For partial insertion, ``append`` should be considered.

    Args:
        reader: PdfReader from which the document root should be copied.

    """
    self._info_obj = None

    if self.incremental:
        # One slot per object number announced by the reader's trailer,
        # filled with a replica of each object the reader can resolve.
        slot_count = cast(int, reader.trailer["/Size"]) - 1
        self._objects = [None] * slot_count
        for idnum in range(1, len(self._objects) + 1):
            source_obj = reader.get_object(idnum)
            if source_obj is not None:
                self._objects[idnum - 1] = source_obj.replicate(self)
    else:
        self._objects.clear()

    self._root_object = reader.root_object.clone(self)
    self._pages = self._root_object.raw_get("/Pages")

    assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
    # must be done here before rewriting
    if self.incremental:
        self._original_hash = [
            0 if stored is None else stored.hash_bin() for stored in self._objects
        ]

    self._flatten()
    assert self.flattened_pages is not None
    for flat_page in self.flattened_pages:
        page_ref = cast(IndirectObject, flat_page.indirect_reference)
        self._replace_object(page_ref.idnum, flat_page)
        if not self.incremental:
            flat_page[NameObject("/Parent")] = self._pages

    if not self.incremental:
        # Point /Kids of the page tree root directly at the flattened pages.
        pages_root = cast(DictionaryObject, self._pages.get_object())
        pages_root[NameObject("/Kids")] = ArrayObject(
            [flat_page.indirect_reference for flat_page in self.flattened_pages]
        )

1215

def clone_document_from_reader(
    self,
    reader: PdfReader,
    after_page_append: Optional[Callable[[PageObject], None]] = None,
) -> None:
    """
    Clone a document from a PDF file reader: the '/Root', '/Info' and
    '/ID' sections of the pdf are copied into this writer.

    Args:
        reader: PDF file reader instance from which the clone
            should be created.
        after_page_append:
            Callback invoked once for every entry of the cloned page
            tree's /Kids array, after the cloning is finished. Its single
            parameter is the resolved page object.

    """
    self.clone_reader_document_root(reader)

    reader_info = reader._info
    if reader_info is not None:
        if self.incremental:
            self._info_obj = cast(
                IndirectObject, reader_info.clone(self).indirect_reference
            )
            assert isinstance(self._info, DictionaryObject), "for mypy"
            # Record the hash of the cloned /Info in the original-hash table.
            info_slot = self._info_obj.indirect_reference.idnum - 1
            self._original_hash[info_slot] = self._info.hash_bin()
        else:
            info_copy = DictionaryObject(
                cast(DictionaryObject, reader_info.get_object())
            )
            self._info_obj = self._add_object(info_copy)
    # else: _info_obj = None done in clone_reader_document_root()

    try:
        self._ID = cast(ArrayObject, reader._ID).clone(self)
    except AttributeError:
        pass

    if callable(after_page_append):
        pages_root = cast(DictionaryObject, self._pages.get_object())
        for kid in cast(ArrayObject, pages_root["/Kids"]):
            after_page_append(kid.get_object())

1263

def _compute_document_identifier(self) -> ByteStringObject:
    """
    Serialize the PDF structure into memory and return its rolling
    checksum, UTF-8 encoded, as a byte string object.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    buffer.seek(0)
    checksum = _rolling_checksum(buffer)
    return ByteStringObject(checksum.encode("utf8"))

1269

def generate_file_identifiers(self) -> None:
    """
    Generate an identifier for the PDF that will be written.

    The only point of this is ensuring uniqueness. Reproducibility is not
    required.
    When a file is first written, both identifiers shall be set to the same
    value. If both identifiers match when a file reference is resolved, it
    is very likely that the correct and unchanged file has been found. If
    only the first identifier matches, a different version of the correct
    file has been found.
    see §14.4 "File Identifiers".
    """
    if self._ID:
        # Keep the existing first identifier, refresh only the second one.
        first, second = self._ID[0], self._compute_document_identifier()
    else:
        first = second = self._compute_document_identifier()
    self._ID = ArrayObject((first, second))

1289

def encrypt(
    self,
    user_password: str,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    *,
    algorithm: Optional[str] = None,
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Args:
        user_password: The password which allows for opening
            and reading the PDF file with the restrictions provided.
        owner_password: The password which allows for
            opening the PDF files without any restrictions. By default,
            the owner password is the same as the user password.
        use_128bit: flag as to whether to use 128bit
            encryption. When false, 40bit encryption will be used.
            By default, this flag is on.
        permissions_flag: permissions as described in
            Table 3.20 of the PDF 1.7 specification. A bit value of 1 means
            the permission is granted.
            Hence an integer value of -1 will set all flags.
            Bit position 3 is for printing, 4 is for modifying content,
            5 and 6 control annotations, 9 for form fields,
            10 for extraction of text and graphics.
        algorithm: encrypt algorithm. Values may be one of "RC4-40", "RC4-128",
            "AES-128", "AES-256-R5", "AES-256". If it is valid,
            `use_128bit` will be ignored.

    Raises:
        ValueError: ``algorithm`` does not name a member of EncryptAlgorithm.

    """
    if owner_password is None:
        owner_password = user_password

    if algorithm is None:
        alg = EncryptAlgorithm.RC4_128 if use_128bit else EncryptAlgorithm.RC4_40
    else:
        # "AES-256-R5" -> attribute name "AES_256_R5"
        try:
            alg = getattr(EncryptAlgorithm, algorithm.replace("-", "_"))
        except AttributeError:
            raise ValueError(f"Algorithm '{algorithm}' NOT supported")

    self.generate_file_identifiers()
    assert self._ID
    self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])

    new_entry = self._encryption.write_entry(user_password, owner_password)
    if self._encrypt_entry:
        # `encrypt` was called before: reuse the object slot of the old entry
        previous_ref = self._encrypt_entry.indirect_reference
        assert previous_ref is not None
        new_entry.indirect_reference = previous_ref
        self._objects[new_entry.indirect_reference.idnum - 1] = new_entry
    else:
        self._add_object(new_entry)
    self._encrypt_entry = new_entry

1348

def write_stream(self, stream: StreamType) -> None:
    """
    Serialize the document into an already opened stream.

    In incremental mode the reader's original bytes are copied first and an
    increment is appended only when at least one object is new or modified;
    otherwise the whole structure, cross-reference table and trailer are
    written.

    Args:
        stream: Writable stream; a warning is logged when it exposes a
            ``mode`` attribute that does not contain ``"b"``.

    """
    if hasattr(stream, "mode") and "b" not in stream.mode:
        logger_warning(
            f"File <{stream.name}> to write to is not in binary mode. "
            "It may not be written to correctly.",
            __name__,
        )
    # deprecated to be removed in pypdf 6.0.0 :
    # the root object used to be registered and swept here.

    if not self.incremental:
        object_positions, free_objects = self._write_pdf_structure(stream)
        xref_location = self._write_xref_table(
            stream, object_positions, free_objects
        )
        self._write_trailer(stream, xref_location)
        return

    original = self._reader.stream
    original.seek(0)
    stream.write(self._reader.stream.read(-1))
    if len(self.list_objects_in_increment()) > 0:
        self._write_increment(stream)  # writes objs, xref stream and startxref

1372

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
    """
    Write the collection of pages added to this object out as a PDF file.

    Args:
        stream: An object to write the file to. The object can support
            the write method and the tell method, similar to a file object, or
            be a file path, just like the fileobj, just named it stream to keep
            existing workflow.

    Returns:
        A tuple (bool, IO): whether this method opened the file itself
        (in which case it has already been closed again), and the stream
        that was written to.

    Raises:
        ValueError: ``stream`` is an empty string.

    """
    if stream == "":
        raise ValueError(f"Output({stream=}) is empty.")

    my_file = isinstance(stream, (str, Path))
    if my_file:
        stream = FileIO(stream, "wb")

    try:
        self.write_stream(stream)
    finally:
        # A file opened here must not leak when serialization fails;
        # streams owned by the caller are never closed by this method.
        if my_file:
            stream.close()

    if not my_file:
        stream.flush()

    return my_file, stream

1404

def list_objects_in_increment(self) -> List[IndirectObject]:
    """
    For analysis or debugging.
    Provides the list of new or modified objects that will be written
    in the increment.
    Deleted objects will not be freed but will become orphans.

    An object counts as new when its slot lies beyond the recorded original
    hashes, and as modified when its current ``hash_bin()`` differs from
    the recorded one.

    Returns:
        List of new or modified IndirectObjects

    """
    baseline = self._original_hash
    baseline_len = len(baseline)
    changed = []
    for slot, obj in enumerate(self._objects):
        if obj is None:
            continue
        if slot >= baseline_len or obj.hash_bin() != baseline[slot]:
            changed.append(cast(IndirectObject, obj).indirect_reference)
    return changed

1428

def _write_increment(self, stream: StreamType) -> None:
    """
    Append an incremental update to ``stream``: every new or modified
    object, followed by a cross-reference stream and the ``startxref``
    footer.

    Args:
        stream: Stream positioned after the original file content.

    """
    offsets = {}
    index_runs = []
    # Current run of consecutive object numbers: [run_first, run_end)
    run_first = -1
    run_end = -2
    baseline_len = len(self._original_hash)

    for slot, obj in enumerate(self._objects):
        if obj is None:
            continue
        if slot < baseline_len and obj.hash_bin() == self._original_hash[slot]:
            continue  # unchanged since the original file was read

        idnum = slot + 1
        assert isinstance(obj, PdfObject), "mypy"
        # first write new/modified object
        offsets[idnum] = stream.tell()
        stream.write(f"{idnum} 0 obj\n".encode())
        # encryption is not operational here: objects are written as they are
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

        # prepare xref: close the running block when numbering is not contiguous
        if idnum != run_end:
            if run_first > 0:
                index_runs.append([run_first, run_end - run_first])
            run_first = idnum
        run_end = idnum + 1

    assert run_first > 0, "for pytest only"
    index_runs.append([run_first, run_end - run_first])

    # write incremented xref
    xref_location = stream.tell()
    xr_id = len(self._objects) + 1
    stream.write(f"{xr_id} 0 obj".encode())

    index_entries = []
    for run in index_runs:
        for number in run:
            index_entries.append(NumberObject(number))
    init_data = {
        NameObject("/Type"): NameObject("/XRef"),
        NameObject("/Size"): NumberObject(xr_id + 1),
        NameObject("/Root"): self.root_object.indirect_reference,
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/Index"): ArrayObject(index_entries),
        NameObject("/W"): ArrayObject(
            [NumberObject(1), NumberObject(4), NumberObject(1)]
        ),
        "__streamdata__": b"",
    }

    # /Info is referenced only when it is new or differs from the original.
    info = self._info
    if info is not None:
        info_slot = info.indirect_reference.idnum - 1  # type: ignore
        if (
            info_slot >= len(self._original_hash)
            or cast(IndirectObject, info).hash_bin()  # kept for future
            != self._original_hash[info_slot]
        ):
            init_data[NameObject(TK.INFO)] = info.indirect_reference
    init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
    if self._ID:
        init_data[NameObject(TK.ID)] = self._ID

    xr = StreamObject.initialize_from_dictionary(init_data)
    # One entry per written object, packed as 1 + 4 + 1 bytes to match /W.
    rows = [struct.pack(b">BIB", 1, position, 0) for position in offsets.values()]
    xr.set_data(b"".join(rows))
    xr.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1499

def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
    """
    Write the header and every stored object to ``stream``.

    Args:
        stream: Stream receiving the serialized objects.

    Returns:
        A tuple ``(object_positions, free_objects)``: the stream offset of
        each object slot (-1 for an empty slot), and the numbers of the
        empty slots followed by a terminating 0.

    """
    positions = []
    free_slots = []
    stream.write(self.pdf_header.encode() + b"\n")
    stream.write(b"%\xE2\xE3\xCF\xD3\n")

    for idnum, obj in enumerate(self._objects, start=1):
        if obj is None:
            positions.append(-1)
            free_slots.append(idnum)
            continue
        positions.append(stream.tell())
        stream.write(f"{idnum} 0 obj\n".encode())
        # The encryption dictionary itself is written unencrypted.
        if self._encryption and obj != self._encrypt_entry:
            obj = self._encryption.encrypt_object(obj, idnum, 0)
        obj.write_to_stream(stream)
        stream.write(b"\nendobj\n")

    free_slots.append(0)  # add 0 to loop in accordance with specification
    return positions, free_slots

1519

def _write_xref_table(
    self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
    """
    Write the cross-reference table and return the offset where it starts.

    Args:
        stream: Stream receiving the table.
        object_positions: One offset per object slot; values that are not
            positive mark free slots.
        free_objects: Numbers of the free slots, in order, followed by 0;
            each free entry points to the next one.

    Returns:
        The stream position of the ``xref`` keyword.

    """
    table_start = stream.tell()
    stream.write(b"xref\n")
    stream.write(f"0 {len(self._objects) + 1}\n".encode())
    # Entry for object 0: head of the free list.
    stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())

    next_free = 1
    for position in object_positions:
        if position <= 0:
            stream.write(
                f"{free_objects[next_free]:0>10} {1:0>5} f \n".encode()
            )
            next_free += 1
            continue
        stream.write(f"{position:0>10} {0:0>5} n \n".encode())
    return table_start

1535

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
    """
    Write the PDF trailer to the stream.

    To quote the PDF specification:
        [The] trailer [gives] the location of the cross-reference table and
        of certain special objects within the body of the file.

    Args:
        stream: Stream receiving the trailer.
        xref_location: Offset of the cross-reference table, written after
            the ``startxref`` keyword.

    """
    stream.write(b"trailer\n")
    object_count = len(self._objects) + 1
    trailer = DictionaryObject(
        {
            NameObject(TK.SIZE): NumberObject(object_count),
            NameObject(TK.ROOT): self.root_object.indirect_reference,
        }
    )
    # Optional entries, added only when the writer holds them.
    info = self._info
    if info is not None:
        trailer[NameObject(TK.INFO)] = info.indirect_reference
    if self._ID is not None:
        trailer[NameObject(TK.ID)] = self._ID
    if self._encrypt_entry:
        trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
    trailer.write_to_stream(stream)
    stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof

1559

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    When set to ``None`` the /Info entry is removed from the pdf; when set
    to a dictionary, the existing entries are cleared and replaced by the
    given ones.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function, but by :meth:`~xmp_metadata`.

    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        self._info = None
        return
    # Replace rather than merge: drop the current entries first.
    if self._info is not None:
        self._info.clear()
    self.add_metadata(value)

1587

def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Each value is converted with ``str()`` before being stored; the
    /Info dictionary is created when the writer has none.

    Args:
        infos: a Python dictionary where each key is a field
            and each value is your new metadata.

    """
    if isinstance(infos, PdfObject):
        infos = cast(DictionaryObject, infos.get_object())

    converted = {}
    for field, raw in list(infos.items()):
        resolved = raw.get_object() if isinstance(raw, PdfObject) else raw
        text = create_string_object(str(resolved))
        converted[NameObject(field)] = text

    if self._info is None:
        self._info = DictionaryObject()
    self._info.update(converted)

1607

def compress_identical_objects(
    self,
    remove_identicals: bool = True,
    remove_orphans: bool = True,
) -> None:
    """
    Parse the PDF file and merge objects that have the same hash.
    This will make objects common to multiple pages.
    Recommended to be used just before writing output.

    Args:
        remove_identicals: Remove identical objects.
        remove_orphans: Remove unreferenced objects.

    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        # Walk a dictionary/array recursively: flag every referenced object
        # as "not an orphan" and redirect references to merged duplicates.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject):
                orphans[v.idnum - 1] = False
                if v in crossref:
                    kept = crossref[v]
                    obj[k] = kept
                    # The surviving copy is referenced from now on; without
                    # this it could be purged as an orphan when it was only
                    # reachable through its duplicates.
                    orphans[kept.idnum - 1] = False
            else:
                # the filtering on DictionaryObject and ArrayObject only
                # will be performed within replace_in_obj
                replace_in_obj(v, crossref)

    # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
    self._idnum_hash = {}
    orphans = [True] * len(self._objects)
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if is_null_or_none(obj):
            continue
        assert obj is not None, "mypy"  # mypy: TypeGuard of `is_null_or_none` does not help here.
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if remove_identicals and h in self._idnum_hash:
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # generate the dict converting others to 1st
    cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
    cnv_rev: Dict[IndirectObject, IndirectObject] = {}
    for k, v in cnv.items():
        cnv_rev.update(zip(v, (k,) * len(v)))

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            replace_in_obj(obj, cnv_rev)

    # remove orphans (if applicable)
    if not remove_orphans:
        return

    # Objects reachable only from the trailer are never orphans.
    orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore

    # A document without /Info has nothing to protect here.
    if not is_null_or_none(self._info):
        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore

    try:
        orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
    except AttributeError:
        pass
    for i in compress(range(len(self._objects)), orphans):
        self._objects[i] = None

1682

def _sweep_indirect_references(
    self,
    root: Union[
        ArrayObject,
        BooleanObject,
        DictionaryObject,
        FloatObject,
        IndirectObject,
        NameObject,
        PdfObject,
        NumberObject,
        TextStringObject,
        NullObject,
    ],
) -> None:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    This method used to resolve circular references to Page objects while
    sweeping the object tree. The sweep itself has been removed: the only
    remaining action is the call to ``deprecate``.

    Args:
        root: The root of the PDF object tree to sweep (ignored).

    """
    deprecate(
        "_sweep_indirect_references has been removed, please report to dev team if this warning is observed",
    )

1717

def _resolve_indirect_object(
    self, data: IndirectObject
) -> IndirectObject:  # deprecated
    """
    Deprecated stub kept for backward compatibility.

    This method used to map an indirect object of another PDF file to an
    indirect object of this file. The resolution itself has been removed:
    ``deprecate`` is called and, if that call returns, a placeholder
    reference is handed back.

    Args:
        data: The `IndirectObject` to resolve (ignored).

    Returns:
        ``IndirectObject(0, 0, self)``, whatever ``data`` is.

    """
    deprecate(
        "_resolve_indirect_object has been removed, please report to dev team if this warning is observed",
    )
    placeholder = IndirectObject(0, 0, self)
    return placeholder

1750

def get_reference(self, obj: PdfObject) -> IndirectObject:
    """
    Return the indirect reference under which ``obj`` is stored in this
    writer.

    Args:
        obj: An object present in the writer's object list.

    Returns:
        The reference (generation 0) whose number is the 1-based position
        of ``obj`` in the object list.

    Raises:
        ValueError: ``obj`` is not in the object list.

    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    assert reference.get_object() == obj
    return reference

1756

def get_outline_root(self) -> TreeObject:
    """
    Return the root of the document outline, creating it when the catalog
    has no /Outlines entry.

    An existing outline that is not a ``TreeObject`` is replaced in the
    object list by a ``TreeObject`` built from it.

    Returns:
        The outline root.

    """
    if CO.OUTLINES not in self._root_object:
        # No outline yet: register an empty tree in the catalog.
        outline = TreeObject()
        outline.update({})
        outline_ref = self._add_object(outline)
        self._root_object[NameObject(CO.OUTLINES)] = outline_ref
        return outline

    # Entries in the catalog dictionary
    outline = cast(TreeObject, self._root_object[CO.OUTLINES])
    if not isinstance(outline, TreeObject):
        converted = TreeObject(outline)
        self._replace_object(outline.indirect_reference.idnum, converted)
        outline = converted
    # Consistency check: the outline must be one of the writer's objects.
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.get_object() == outline
    return outline

1775

def get_threads_root(self) -> ArrayObject:
    """
    The list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    When the catalog has no threads entry, an empty array is created and
    stored there.

    Returns:
        An array (possibly empty) of Dictionaries with an ``/F`` key,
        and optionally information about the thread in ``/I`` or ``/Metadata`` keys.

    """
    if CO.THREADS not in self._root_object:
        created = ArrayObject()
        self._root_object[NameObject(CO.THREADS)] = created
        return created
    # Entries in the catalog dictionary
    return cast(ArrayObject, self._root_object[CO.THREADS])

1794

@property
def threads(self) -> ArrayObject:
    """
    Read-only property for the list of threads.

    See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.

    Each element is a dictionary with an ``/F`` key, and optionally
    information about the thread in ``/I`` or ``/Metadata`` keys.
    """
    threads_root = self.get_threads_root()
    return threads_root

1806

def add_outline_item_destination(
    self,
    page_destination: Union[IndirectObject, PageObject, TreeObject],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item into the document outline.

    Args:
        page_destination: The outline item to insert. A `PageObject` is
            wrapped into a destination titled ``page #<page number>`` that
            fits the page.
        parent: Outline item under which the new item is inserted;
            the outline root when None.
        before: Sibling before which the new item is inserted.
        is_open: Stored on the item under ``/%is_open%``; a closed item
            does not increment the counters of its parents.

    Returns:
        The indirect reference of the inserted outline item.

    """
    page_destination = cast(PageObject, page_destination.get_object())
    if isinstance(page_destination, PageObject):
        # Forward the placement arguments: they used to be dropped here, so
        # a page was always added open at the end of the outline root.
        return self.add_outline_item_destination(
            Destination(
                f"page #{page_destination.page_number}",
                cast(IndirectObject, page_destination.indirect_reference),
                Fit.fit(),
            ),
            parent,
            before,
            is_open,
        )

    if parent is None:
        parent = self.get_outline_root()

    page_destination[NameObject("/%is_open%")] = BooleanObject(is_open)
    parent = cast(TreeObject, parent.get_object())
    page_destination_ref = self._add_object(page_destination)
    if before is not None:
        before = before.indirect_reference
    parent.insert_child(
        page_destination_ref,
        before,
        self,
        page_destination.inc_parent_counter_outline
        if is_open
        else (lambda x, y: 0),  # noqa: ARG005
    )
    if "/Count" not in page_destination:
        page_destination[NameObject("/Count")] = NumberObject(0)

    return page_destination_ref

1844

def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    is_open: bool = True,
) -> IndirectObject:
    """
    Insert an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``
    which is then inserted through ``add_outline_item_destination``.

    Args:
        outline_item: Entries of the outline item to create.
        parent: Passed on to ``add_outline_item_destination``.
        before: Passed on to ``add_outline_item_destination``.
        is_open: Passed on to ``add_outline_item_destination``.

    Returns:
        The value returned by ``add_outline_item_destination``.

    """
    item = TreeObject()
    item.update(outline_item)
    # No special handling of an "/A" (action) entry is performed here;
    # the entries are inserted exactly as copied above.
    return self.add_outline_item_destination(item, parent, before, is_open)

1867

def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    is_open: bool = True,
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to the PDF file.

    Args:
        title: Title to use for this outline item.
        page_number: Page this outline item will point to: a page index,
            a ``PageObject``, an ``IndirectObject``, or ``None`` for an
            outline item without a GoTo action.
        parent: A reference to a parent outline item to create nested
            outline items. Defaults to the outline root.
        before: Forwarded to :meth:`add_outline_item_destination`, which
            passes it to the parent's ``insert_child``.
        color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0 or as a Hex String (#RRGGBB)
        bold: Outline item font is bold
        italic: Outline item font is italic
        fit: The fit of the destination page.
        is_open: Forwarded to :meth:`add_outline_item_destination`.

    Returns:
        The added outline item as an indirect object.

    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Legacy positional call: the arguments are shifted by one slot, so
        # they are re-dispatched in the order the old signature used.
        if fit is not None and page_number is None:
            page_number = fit
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic, is_open=is_open
        )
    if page_number is None:
        action_ref = None
    else:
        # Start from "no reference found" so that an unsupported
        # ``page_number`` type ends up in the warning path below instead of
        # failing with UnboundLocalError.
        page_ref = None
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                # Index outside this writer's pages: keep the raw number.
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = self._add_object(
        _create_outline_item(action_ref, title, color, italic, bold)
    )

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before, is_open)

1945

def add_outline(self) -> None:
    """Placeholder that always raises ``NotImplementedError``."""
    message = "This method is not yet implemented. Use :meth:`add_outline_item` instead."
    raise NotImplementedError(message)

1950

def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title`` / ``destination`` pair into the named-destination list.

    The list returned by ``get_named_dest_root()`` is walked two entries at
    a time; the pair is inserted in front of the first visited entry that
    compares greater than ``title``, or appended when there is none.

    Args:
        title: Name of the destination.
        destination: Destination array, or a reference to one.

    """
    named_dest = self.get_named_dest_root()
    for pos in range(0, len(named_dest), 2):
        if title < named_dest[pos]:
            # Insert the value first, then the name at the same index, so the
            # name ends up directly in front of its value.
            named_dest.insert(pos, destination)
            named_dest.insert(pos, TextStringObject(title))
            return
    named_dest.extend([TextStringObject(title), destination])

1964

def add_named_destination_object(
    self,
    page_destination: PdfObject,
) -> IndirectObject:
    """
    Register a destination object as a named destination.

    The object's ``dest_array`` is added to the writer and recorded under
    the object's ``"/Title"`` entry.

    Args:
        page_destination: Object exposing ``dest_array`` and a ``"/Title"`` entry.

    Returns:
        The reference under which the destination array was added.

    """
    dest_ref = self._add_object(page_destination.dest_array)  # type: ignore
    dest_title = cast("TextStringObject", page_destination["/Title"])  # type: ignore
    self.add_named_destination_array(dest_title, dest_ref)
    return dest_ref

1975

def add_named_destination(
    self,
    title: str,
    page_number: int,
) -> IndirectObject:
    """
    Create a named destination pointing at the page with the given index.

    A GoTo action dictionary targeting the page is added to the writer and
    registered under ``title``.

    Args:
        title: Name of the destination; converted to ``TextStringObject``
            when it is not one already.
        page_number: Index into the ``/Kids`` array of the pages object.

    Returns:
        The reference to the added GoTo dictionary.

    """
    page_ref = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    target = ArrayObject(
        [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
    )
    dest = DictionaryObject()
    dest.update(
        {
            NameObject(GoToActionArguments.D): target,
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )
    dest_ref = self._add_object(dest)

    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    self.add_named_destination_array(title, dest_ref)
    return dest_ref

1998

def remove_links(self) -> None:
    """Remove links and annotations from this output."""
    flag = ObjectDeletionFlag.ALL_ANNOTATIONS
    for page in self.pages:
        self.remove_objects_from_page(page, flag)

2003

def remove_annotations(
    self, subtypes: Optional[Union[AnnotationSubtype, Iterable[AnnotationSubtype]]]
) -> None:
    """
    Remove annotations of the given subtype(s) from every page.

    Args:
        subtypes: subtype or list of subtypes to be removed.
            Examples are: "/Link", "/FileAttachment", "/Sound",
            "/Movie", "/Screen", ...
            If you want to remove all annotations, use subtypes=None.

    """
    strip_page = self._remove_annots_from_page
    for page in self.pages:
        strip_page(page, subtypes)

2019

def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete annotations from one page's annotation array.

    Args:
        page: The page (or a reference to it) to process.
        subtypes: Subtypes to remove; ``None`` removes every annotation.

    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    idx = 0
    while idx < len(cast(ArrayObject, page[PG.ANNOTS])):
        annot = cast(ArrayObject, page[PG.ANNOTS])[idx]
        annot_dict = cast(DictionaryObject, annot.get_object())
        if subtypes is not None and cast(str, annot_dict["/Subtype"]) not in subtypes:
            # Not selected: keep it and move on to the next entry.
            idx += 1
            continue
        if isinstance(annot, IndirectObject):
            self._objects[annot.idnum - 1] = NullObject()  # to reduce PDF size
        # Deleting shifts the following entries down, so ``idx`` stays put.
        del page[PG.ANNOTS][idx]  # type:ignore

2037

def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
    text_filters: Optional[Dict[str, Any]] = None
) -> None:
    """
    Remove objects specified by ``to_delete`` from the given page.

    Annotation-type flags (links, attachments, 3D objects, all annotations)
    are handled by :meth:`_remove_annots_from_page` and return immediately.
    The remaining flags are applied by rewriting the operations of the
    page's content stream and of the form XObjects it references.

    Args:
        page: Page object to clean up.
        to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
            or a list of ObjectDeletionFlag
        text_filters: Properties of text to be deleted, if applicable. Optional.
            This is a Python dictionary with the following properties:

            * font_ids: List of font resource IDs (such as /F1 or /T1_0) to be deleted.

    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            # Forward the filters as well; otherwise a list containing the
            # TEXT flag would remove all text instead of the selected fonts.
            self.remove_objects_from_page(page, to_d, text_filters)
        return None
    assert isinstance(to_delete, ObjectDeletionFlag)

    if to_delete & ObjectDeletionFlag.LINKS:
        return self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        return self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        return self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        return self._remove_annots_from_page(page, None)

    # Content-stream operators that are dropped for the requested flag.
    jump_operators = []
    if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
        jump_operators = (
            [
                b"w", b"J", b"j", b"M", b"d", b"i",
                b"W", b"W*",
                b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n",
                b"m", b"l", b"c", b"v", b"y", b"h", b"re",
                b"sh"
            ]
        )
    if to_delete & ObjectDeletionFlag.TEXT:
        jump_operators = [b"Tj", b"TJ", b"'", b'"']

    def clean(
        content: ContentStream,
        images: List[str],
        forms: List[str],
        text_filters: Optional[Dict[str, Any]] = None
    ) -> None:
        # Drop the selected operations from ``content`` in place.
        nonlocal jump_operators, to_delete

        font_id = None
        font_ids_to_delete = []
        if text_filters and to_delete & ObjectDeletionFlag.TEXT:
            font_ids_to_delete = text_filters.get("font_ids", [])

        i = 0
        while i < len(content.operations):
            operands, operator = content.operations[i]
            if operator == b"Tf":
                # Remember the font selected for the following text operators.
                font_id = operands[0]
            if (
                (
                    operator == b"INLINE IMAGE"
                    and (to_delete & ObjectDeletionFlag.INLINE_IMAGES)
                )
                or (operator in jump_operators)
                or (
                    operator == b"Do"
                    and (to_delete & ObjectDeletionFlag.XOBJECT_IMAGES)
                    and (operands[0] in images)
                )
            ):
                if (
                    not to_delete & ObjectDeletionFlag.TEXT
                    or (to_delete & ObjectDeletionFlag.TEXT and not text_filters)
                    or (to_delete & ObjectDeletionFlag.TEXT and font_id in font_ids_to_delete)
                ):
                    # Deleting shifts later operations down; keep ``i``.
                    del content.operations[i]
                else:
                    i += 1
            else:
                i += 1
        content.get_data()  # this ensures ._data is rebuilt from the .operations

    def clean_forms(
        elt: DictionaryObject, stack: List[DictionaryObject]
    ) -> Tuple[List[str], List[str]]:
        # Walk the XObject resources of ``elt``: collect image and form
        # names, clean nested forms recursively, and return both name lists.
        nonlocal to_delete
        # elt in recursive call is a new ContentStream object, so we have to check the indirect_reference
        if (elt in stack) or (
            hasattr(elt, "indirect_reference")
            and any(
                elt.indirect_reference == getattr(x, "indirect_reference", -1)
                for x in stack
            )
        ):
            # to prevent infinite looping
            return [], []  # pragma: no cover
        try:
            d = cast(
                Dict[Any, Any],
                cast(DictionaryObject, elt["/Resources"])["/XObject"],
            )
        except KeyError:
            d = {}
        images = []
        forms = []
        for k, v in d.items():
            o = v.get_object()
            try:
                content: Any = None
                if (
                    to_delete & ObjectDeletionFlag.XOBJECT_IMAGES
                    and o["/Subtype"] == "/Image"
                ):
                    content = NullObject()  # to delete the image keeping the entry
                    images.append(k)
                if o["/Subtype"] == "/Form":
                    forms.append(k)
                    if isinstance(o, ContentStream):
                        content = o
                    else:
                        content = ContentStream(o, self)
                        content.update(
                            {
                                k1: v1
                                for k1, v1 in o.items()
                                if k1 not in ["/Length", "/Filter", "/DecodeParms"]
                            }
                        )
                        try:
                            content.indirect_reference = o.indirect_reference
                        except AttributeError:  # pragma: no cover
                            pass
                    stack.append(elt)
                    clean_forms(content, stack)  # clean subforms
                if content is not None:
                    if isinstance(v, IndirectObject):
                        self._objects[v.idnum - 1] = content
                    else:
                        # should only occur in a PDF not respecting PDF spec
                        # where streams must be indirected.
                        d[k] = self._add_object(content)  # pragma: no cover
            except (TypeError, KeyError):
                # Entries lacking the expected keys or shape are left untouched.
                pass
        for im in images:
            del d[im]  # for clean-up
        if isinstance(elt, StreamObject):  # for /Form
            if not isinstance(elt, ContentStream):  # pragma: no cover
                e = ContentStream(elt, self)
                e.update(elt.items())
                elt = e
            clean(elt, images, forms, text_filters)  # clean the content
        return images, forms

    if not isinstance(page, PageObject):
        page = PageObject(self, page.indirect_reference)  # pragma: no cover
    if "/Contents" in page:
        content = cast(ContentStream, page.get_contents())

        images, forms = clean_forms(page, [])

        clean(content, images, forms, text_filters)
        page.replace_contents(content)

2210

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
) -> None:
    """
    Remove images from every page of this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types). A ``bool`` value is treated as
            ``ImageType.ALL``.

    """
    if isinstance(to_delete, bool):
        to_delete = ImageType.ALL

    # Translate each selected ImageType member to the ObjectDeletionFlag
    # member carrying the same name.
    flags = ObjectDeletionFlag.NONE
    for member in ("XOBJECT_IMAGES", "INLINE_IMAGES", "DRAWING_IMAGES"):
        if to_delete & ImageType[member]:
            flags |= ObjectDeletionFlag[member]

    for page in self.pages:
        self.remove_objects_from_page(page, flags)

2234

def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified, all text will be removed.
    """

    def collect_fonts(
        obj: Any,
        found: Optional[Dict[str, Any]] = None,
        key: Optional[str] = None
    ) -> Dict[str, Any]:
        # Recursively walk ``obj`` and record, per normalized font name, the
        # dictionary keys under which font dictionaries were found.
        if found is None:
            found = {}
        if isinstance(obj, IndirectObject):
            obj = obj.get_object()
        if isinstance(obj, dict):
            if obj.get("/Type") == "/Font":
                base_font = obj.get("/BaseFont", "")
                # Normalize font names like "/RRXFFV+Palatino-Bold" to "Palatino-Bold"
                normalized = base_font.lstrip("/").split("+")[-1]
                entry = found.setdefault(
                    normalized,
                    {
                        "normalized_font_name": normalized,
                        "resource_ids": [],
                    },
                )
                if key not in entry["resource_ids"]:
                    entry["resource_ids"].append(key)
            for child_key in obj:
                found = collect_fonts(obj[child_key], found, child_key)
        elif isinstance(obj, (list, ArrayObject)):
            for child in obj:
                found = collect_fonts(child, found)
        return found

    wanted = font_names if font_names else []

    for page in self.pages:
        resource_ids_to_remove = []

        # Content streams reference fonts and other resources with names like "/F1" or "/T1_0"
        # Font names need to be converted to resource names/IDs for easier removal
        if wanted:
            font_info = collect_fonts(page.get("/Resources"))
            for font_name in wanted:
                if font_name in font_info:
                    resource_ids_to_remove.extend(font_info[font_name]["resource_ids"])

        text_filters = {}
        if wanted:
            text_filters["font_ids"] = resource_ids_to_remove
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT, text_filters=text_filters)

2291

def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.

    Args:
        page_number: index of the page on which to place the URI action.
        uri: URI of resource to link to.
        rect: :class:`RectangleObject<pypdf.generic.RectangleObject>` or
            array of four integers specifying the clickable rectangular area
            ``[xLL, yLL, xUR, yUR]``, or string in the form
            ``"[ xLL yLL xUR yUR ]"``.
        border: if provided, an array describing border-drawing
            properties. See the PDF spec for details. If this argument is
            omitted, the border array ``[2, 2, 2]`` is written.

    """
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NumberObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element: the dash pattern array.
            dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2), NumberObject(2), NumberObject(2)]

    if isinstance(rect, str):
        # Parse the documented "[ xLL yLL xUR yUR ]" form into its four
        # coordinates; a single NumberObject cannot hold a rectangle.
        coords = rect.replace(",", " ").strip().strip("[]").split()
        rect = RectangleObject([float(value) for value in coords])
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            NameObject(AA.Type): NameObject("/Annot"),
            NameObject(AA.Subtype): NameObject("/Link"),
            NameObject(AA.P): page_link,
            NameObject(AA.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AA.Border): ArrayObject(border_arr),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    # Attach the link to the page, creating the annotation array if needed.
    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])

2358

2359 _valid_layouts = (

2360 "/NoLayout",

2361 "/SinglePage",

2362 "/OneColumn",

2363 "/TwoColumnLeft",

2364 "/TwoColumnRight",

2365 "/TwoPageLeft",

2366 "/TwoPageRight",

2367 )

2368

def _get_page_layout(self) -> Optional[LayoutType]:
    """Return the root object's ``/PageLayout`` entry, or ``None`` if it is missing."""
    try:
        layout = self._root_object["/PageLayout"]
    except KeyError:
        return None
    return cast(LayoutType, layout)

2374

def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A ``NameObject`` is stored as is. Any other value is checked against
    ``_valid_layouts``; an unknown value only triggers a warning and is
    still stored after conversion to ``NameObject``.

    Args:
        layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # ``', '.join`` (not ``'', ''.join``, which builds a tuple) so the
            # message lists the valid names separated by commas.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})

2409

def set_page_layout(self, layout: LayoutType) -> None:
    """
    Set the page layout; public wrapper around ``_set_page_layout``.

    Args:
        layout: The page layout to be used

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right

    """
    self._set_page_layout(layout)

2437

@property
def page_layout(self) -> Optional[LayoutType]:
    """
    Page layout property; ``None`` when no layout entry is present.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    current_layout = self._get_page_layout()
    return current_layout

2462

@page_layout.setter
def page_layout(self, layout: LayoutType) -> None:
    # Assignment goes through the same path as set_page_layout().
    self._set_page_layout(layout)

2466

2467 _valid_modes = (

2468 "/UseNone",

2469 "/UseOutlines",

2470 "/UseThumbs",

2471 "/FullScreen",

2472 "/UseOC",

2473 "/UseAttachments",

2474 )

2475

def _get_page_mode(self) -> Optional[PagemodeType]:
    """Return the root object's ``/PageMode`` entry, or ``None`` if it is missing."""
    try:
        mode = self._root_object["/PageMode"]
    except KeyError:
        return None
    return cast(PagemodeType, mode)

2481

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode property; ``None`` when no mode entry is present.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    current_mode = self._get_page_mode()
    return current_mode

2504

@page_mode.setter
def page_mode(self, mode: PagemodeType) -> None:
    # A NameObject is stored as is; any other value is checked against
    # ``_valid_modes`` (warning only) and converted to NameObject.
    if not isinstance(mode, NameObject):
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode_name: NameObject = NameObject(mode)
    else:
        mode_name = mode
    self._root_object.update({NameObject("/PageMode"): mode_name})

2516

def add_annotation(
    self,
    page_number: Union[int, PageObject],
    annotation: Dict[str, Any],
) -> DictionaryObject:
    """
    Attach one newly created annotation to a page.

    The annotation must be a new one; it cannot be recycled.

    Args:
        page_number: PageObject or page index.
        annotation: Annotation to be added (created with annotation).

    Returns:
        The inserted object.
        This can be used for popup creation, for example.

    Raises:
        TypeError: ``page_number`` is neither an ``int`` nor a ``PageObject``.

    """
    if isinstance(page_number, int):
        page = self.pages[page_number]
    elif isinstance(page_number, PageObject):
        page = page_number
    else:
        raise TypeError("page: invalid type")

    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = page.indirect_reference

    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
        assert page.annotations is not None

    subtype = to_add.get("/Subtype")

    # Internal link annotations need the correct object type for the
    # destination
    if subtype == "/Link" and "/Dest" in to_add:
        raw_dest = cast(Dict[Any, Any], to_add[NameObject("/Dest")])
        link_fit = Fit(
            fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"]
        )  # I have no clue why this dict-hack is necessary
        dest = Destination(
            NameObject("/LinkName"),
            raw_dest["target_page_index"],
            link_fit,
        )
        to_add[NameObject("/Dest")] = dest.dest_array

    page.annotations.append(self._add_object(to_add))

    if subtype == "/Popup" and NameObject("/Parent") in to_add:
        # Point the parent annotation back at its popup.
        parent_annot = cast(DictionaryObject, to_add["/Parent"].get_object())
        parent_annot[NameObject("/Popup")] = to_add.indirect_reference

    return to_add

2570

def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.

    Currently: for every annotation, a ``/Dest`` entry (or, failing that,
    the ``/D`` entry of its ``/A`` action) that is a ``NameObject`` is
    replaced by a ``TextStringObject`` (required for names/dests list).

    Args:
        page: The page, or a reference to it.

    Returns:
        The cleaned PageObject

    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        action_dest = action.get("/D", None)
        if isinstance(action_dest, NameObject):
            action[NameObject("/D")] = TextStringObject(action_dest)
    return page

2597

def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    # Load the whole input into an in-memory BytesIO stream:
    # * a string or Path is treated as a file path and read from disk;
    # * a PdfReader has its underlying stream copied (the stream position
    #   is restored afterwards) and its encryption object is returned too;
    # * any other object offering .seek and .read is rewound and read.
    # Everything else raises NotImplementedError.
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as f:
            stream: IOBase = BytesIO(f.read())
        return stream, None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption if fileobj._encryption else None
        orig_tell = fileobj.stream.tell()
        fileobj.stream.seek(0)
        stream = BytesIO(fileobj.stream.read())

        # reset the stream to its original location
        fileobj.stream.seek(orig_tell)
        return stream, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        stream = BytesIO(fileobj.read())
        return stream, None

    raise NotImplementedError(
        "Merging requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )

2633

def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None,
        PageRange,
        Tuple[int, int],
        Tuple[int, int, int],
        List[int],
        List[PageObject],
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Concatenate pages onto the end of the file.

    Identical to the :meth:`merge()<merge>` method, except that no
    position is given: ``merge`` is called with ``None`` as position.

    Args:
        fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.
        outline_item: Optionally, you may specify a string to build an
            outline (aka 'bookmark') to identify the beginning of the
            included file.
        pages: Can be a :class:`PageRange<pypdf.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            or a list of pages to be processed
            to merge only the specified range of pages from the source
            document into the output document.
        import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        excluded_fields: Provide the list of fields/keys to be ignored
            if ``/Annots`` is part of the list, the annotation will be ignored
            if ``/B`` is part of the list, the articles will be ignored

    """
    if excluded_fields is None:
        excluded_fields = ()

    if isinstance(outline_item, (tuple, list, PageRange)):
        # A page selection was passed in the ``outline_item`` slot, so the
        # following positional arguments are shifted by one as well.
        if isinstance(pages, bool):
            if not isinstance(import_outline, bool):
                excluded_fields = import_outline
            import_outline = pages
        pages = outline_item
        outline_title = None
    else:  # if isinstance(outline_item, str):
        outline_title = outline_item

    self.merge(
        None,
        fileobj,
        outline_title,
        pages,
        import_outline,
        excluded_fields,
    )

2701

2702 def merge(

2703 self,

2704 position: Optional[int],

2705 fileobj: Union[Path, StrByteType, PdfReader],

2706 outline_item: Optional[str] = None,

2707 pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None,

2708 import_outline: bool = True,

2709 excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),

2710 ) -> None:

2711 """

2712 Merge the pages from the given file into the output file at the

2713 specified page number.

2714

2715 Args:

2716 position: The *page number* to insert this file. File will

2717 be inserted after the given number.

2718 fileobj: A File Object or an object that supports the standard

2719 read and seek methods similar to a File Object. Could also be a

2720 string representing a path to a PDF file.

2721 outline_item: Optionally, you may specify a string to build an outline

2722 (aka 'bookmark') to identify the

2723 beginning of the included file.

2724 pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`

2725 or a ``(start, stop[, step])`` tuple

2726 or a list of pages to be processed

2727 to merge only the specified range of pages from the source

2728 document into the output document.

2729 import_outline: You may prevent the source document's

2730 outline (collection of outline items, previously referred to as

2731 'bookmarks') from being imported by specifying this as ``False``.

2732 excluded_fields: provide the list of fields/keys to be ignored

2733 if ``/Annots`` is part of the list, the annotation will be ignored

2734 if ``/B`` is part of the list, the articles will be ignored

2735

2736 Raises:

2737 TypeError: The pages attribute is not configured properly

2738

2739 """

2740 if isinstance(fileobj, PdfDocCommon):

2741 reader = fileobj

2742 else:

2743 stream, encryption_obj = self._create_stream(fileobj)

2744 # Create a new PdfReader instance using the stream

2745 # (either file or BytesIO or StringIO) created above

2746 reader = PdfReader(stream, strict=False) # type: ignore[arg-type]

2747

2748 if excluded_fields is None:

2749 excluded_fields = ()

2750 # Find the range of pages to merge.

2751 if pages is None:

2752 pages = list(range(len(reader.pages)))

2753 elif isinstance(pages, PageRange):

2754 pages = list(range(*pages.indices(len(reader.pages))))

2755 elif isinstance(pages, list):

2756 pass # keep unchanged

2757 elif isinstance(pages, tuple) and len(pages) <= 3:

2758 pages = list(range(*pages))

2759 elif not isinstance(pages, tuple):

2760 raise TypeError(

2761 '"pages" must be a tuple of (start, stop[, step]) or a list'

2762 )

2763

2764 srcpages = {}

2765 for page in pages:

2766 if isinstance(page, PageObject):

2767 pg = page

2768 else:

2769 pg = reader.pages[page]

2770 assert pg.indirect_reference is not None

2771 if position is None:

2772 # numbers in the exclude list identifies that the exclusion is

2773 # only applicable to 1st level of cloning

2774 srcpages[pg.indirect_reference.idnum] = self.add_page(

2775 pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2776 )

2777 else:

2778 srcpages[pg.indirect_reference.idnum] = self.insert_page(

2779 pg, position, [*list(excluded_fields), 1, "/B", 1, "/Annots"] # type: ignore

2780 )

2781 position += 1

2782 srcpages[pg.indirect_reference.idnum].original_page = pg

2783

2784 reader._named_destinations = (

2785 reader.named_destinations

2786 ) # need for the outline processing below

2787

2788 arr: Any

2789

2790 def _process_named_dests(dest: Any) -> None:

2791 arr = dest.dest_array

2792 if "/Names" in self._root_object and dest["/Title"] in cast(

2793 List[Any],

2794 cast(

2795 DictionaryObject,

2796 cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),

2797 ).get("/Names", DictionaryObject()),

2798 ):

2799 # already exists: should not duplicate it

2800 pass

2801 elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):

2802 pass

2803 elif isinstance(dest["/Page"], int):

2804 # the page reference is a page number normally not a PDF Reference

2805 # page numbers as int are normally accepted only in external goto

2806 try:

2807 p = reader.pages[dest["/Page"]]

2808 except IndexError:

2809 return

2810 assert p.indirect_reference is not None

2811 try:

2812 arr[NumberObject(0)] = NumberObject(

2813 srcpages[p.indirect_reference.idnum].page_number

2814 )

2815 self.add_named_destination_array(dest["/Title"], arr)

2816 except KeyError:

2817 pass

2818 elif dest["/Page"].indirect_reference.idnum in srcpages:

2819 arr[NumberObject(0)] = srcpages[

2820 dest["/Page"].indirect_reference.idnum

2821 ].indirect_reference

2822 self.add_named_destination_array(dest["/Title"], arr)

2823

2824 for dest in reader._named_destinations.values():

2825 _process_named_dests(dest)

2826

2827 outline_item_typ: TreeObject

2828 if outline_item is not None:

2829 outline_item_typ = cast(

2830 "TreeObject",

2831 self.add_outline_item(

2832 TextStringObject(outline_item),

2833 next(iter(srcpages.values())).indirect_reference,

2834 fit=PAGE_FIT,

2835 ).get_object(),

2836 )

2837 else:

2838 outline_item_typ = self.get_outline_root()

2839

2840 _ro = reader.root_object

2841 if import_outline and CO.OUTLINES in _ro:

2842 outline = self._get_filtered_outline(

2843 _ro.get(CO.OUTLINES, None), srcpages, reader

2844 )

2845 self._insert_filtered_outline(

2846 outline, outline_item_typ, None

2847 ) # TODO: use before parameter

2848

2849 if "/Annots" not in excluded_fields:

2850 for pag in srcpages.values():

2851 lst = self._insert_filtered_annotations(

2852 pag.original_page.get("/Annots", []), pag, srcpages, reader

2853 )

2854 if len(lst) > 0:

2855 pag[NameObject("/Annots")] = lst

2856 self.clean_page(pag)

2857

2858 if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:

2859 if "/AcroForm" not in self._root_object:

2860 self._root_object[NameObject("/AcroForm")] = self._add_object(

2861 cast(

2862 DictionaryObject,

2863 reader.root_object["/AcroForm"],

2864 ).clone(self, False, ("/Fields",))

2865 )

2866 arr = ArrayObject()

2867 else:

2868 arr = cast(

2869 ArrayObject,

2870 cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],

2871 )

2872 trslat = self._id_translated[id(reader)]

2873 try:

2874 for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore

2875 try:

2876 ind = IndirectObject(trslat[f.idnum], 0, self)

2877 if ind not in arr:

2878 arr.append(ind)

2879 except KeyError:

2880 # for trslat[] which mean the field has not be copied

2881 # through the page

2882 pass

2883 except KeyError: # for /Acroform or /Fields are not existing

2884 arr = self._add_object(ArrayObject())

2885 cast(DictionaryObject, self._root_object["/AcroForm"])[

2886 NameObject("/Fields")

2887 ] = arr

2888

2889 if "/B" not in excluded_fields:

2890 self.add_filtered_articles("", srcpages, reader)

2891

def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry, then the
    reader's articles are walked through their ``/N`` links, starting at
    ``thread["/F"]``, until the first article is reached again. A new
    article is created in the writer for each article whose ``/P`` page
    resolves to one of the cloned ``pages``; the new articles are linked
    together through ``/N`` and ``/V`` and registered in the ``/B`` array
    of their page.

    Args:
        thread: thread entry from the reader's array of threads.
        pages: cloned pages, indexed by the object number (``idnum``) of the
            matching page in the reader.
        reader: reader the thread belongs to.

    Returns:
        The added thread as an indirect reference

    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_first: Optional[DictionaryObject] = None
    new_article: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first applicable article: it becomes the head of the thread
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # chain the new article after the previously created one
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back on the first article: close the circular list.
            # When no article was located on a cloned page there is nothing
            # to link; the walk simply ends instead of failing on the
            # never-assigned first/last articles.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference

2956

def add_filtered_articles(
    self,
    fltr: Union[
        Pattern[Any], str
    ],  # pattern searched in the title of the threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    The ``/B`` entries of the original page of each cloned page are
    scanned; the thread (``/T``) of each of them is added when it is not
    yet present in the translation table of ``reader`` and when ``fltr``
    is found in its ``/I`` ``/Title`` entry.

    Args:
        fltr: regular expression, as a string or as a compiled pattern;
            any other value is replaced by the empty pattern.
        pages: cloned pages whose original pages are scanned.
        reader: reader the pages have been cloned from.

    """
    if isinstance(fltr, str):
        title_filter = re.compile(fltr)
    elif isinstance(fltr, Pattern):
        title_filter = fltr
    else:
        title_filter = re.compile("")

    for cloned_page in pages.values():
        source_page = cloned_page.original_page
        for bead in source_page.get("/B", ()):
            thread_ref = bead.get_object().get("/T")
            if thread_ref is None:
                continue
            thread = thread_ref.get_object()
            if thread.indirect_reference.idnum in self._id_translated[id(reader)]:
                # thread already copied from this reader
                continue
            title = (thread.get("/I", {})).get("/Title", "")
            if title_filter.search(title):
                self._add_articles_thread(thread, pages, reader)

2989

def _get_cloned_page(
    self,
    page: Union[None, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the cloned page matching a page of the reader.

    Args:
        page: page of the reader, given as a page object or as an
            indirect reference.
        pages: cloned pages, looked up by the object number (``idnum``) of
            the reference of ``page``.
        reader: not used by the lookup.

    Returns:
        The indirect reference of the cloned page, or None when ``page`` is
        null, is neither a page nor a reference, or cannot be looked up.

    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # nothing usable to search for
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except Exception:
        return None

3006

def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject], None],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations which are still meaningful in the writer.

    Annotations without destination are always cloned. Annotations
    pointing to a named destination are kept only when that name is
    present in the named destinations of the writer; annotations pointing
    to a page are kept only when this page is part of ``pages``, their
    destination being redirected to the cloned page.

    Args:
        annots: annotations of the original page (list or reference to it).
        page: cloned page (not used in this function).
        pages: cloned pages, used to translate the page destinations.
        reader: reader the annotations come from.

    Returns:
        An array with the references of the cloned annotations.

    """
    kept = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List[Any]", annots.get_object())
    if annots is None:
        return kept
    if not isinstance(annots, list):
        logger_warning(f"Expected list of annotations, got {annots} of type {annots.__class__.__name__}.", __name__)
        return kept

    for entry in annots:
        annot = cast("DictionaryObject", entry.get_object())
        is_goto_link = (
            annot["/Subtype"] == "/Link"
            and "/A" in annot
            and cast("DictionaryObject", annot["/A"])["/S"] == "/GoTo"
            and "/Dest" not in annot
        )

        if is_goto_link:
            # destination carried by the /GoTo action
            target = cast("DictionaryObject", annot["/A"]).get("/D", NullObject())
            if target is None or isinstance(target, NullObject):
                continue
            if isinstance(target, str):
                # it is a named dest
                if str(target) in self.get_named_dest_root():
                    kept.append(annot.clone(self).indirect_reference)
                continue
            target = cast("ArrayObject", target)
            new_page = self._get_cloned_page(target[0], pages, reader)
            if new_page is not None:
                copy = annot.clone(self, ignore_fields=("/D",))
                cast("DictionaryObject", copy["/A"])[
                    NameObject("/D")
                ] = ArrayObject([new_page, *target[1:]])
                kept.append(self._add_object(copy))
            continue

        if "/Dest" not in annot:
            # no destination to translate: plain copy
            kept.append(self._add_object(annot.clone(self)))
            continue

        target = annot["/Dest"]
        if isinstance(target, str):
            # it is a named dest
            if str(target) in self.get_named_dest_root():
                kept.append(annot.clone(self).indirect_reference)
            continue
        target = cast("ArrayObject", target)
        new_page = self._get_cloned_page(target[0], pages, reader)
        if new_page is not None:
            copy = annot.clone(self, ignore_fields=("/Dest",))
            copy[NameObject("/Dest")] = ArrayObject([new_page, *target[1:]])
            kept.append(self._add_object(copy))
    return kept

3063

def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    Starting at ``node``, the siblings are followed through ``/Next`` and
    the children through ``/First``. An item is kept when its page is one
    of the cloned ``pages`` or when at least one of its children is kept;
    the kept children are stored in the ``_filtered_children`` attribute of
    the item.

    Args:
        node: outline root, outline item or None.
        pages: cloned pages used to translate the ``/Page`` of the items.
        reader: reader used to build the items from the nodes.

    Returns:
        A list of destination objects.

    """
    if node is None:
        node = NullObject()
    node = node.get_object()
    if is_null_or_none(node):
        node = DictionaryObject()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        # container without title: only its children are of interest
        first_child = node.get("/First", None)
        if first_child is None:
            return []
        return self._get_filtered_outline(first_child.get_object(), pages, reader)

    kept: List[Destination] = []
    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        item[NameObject("/Page")] = NullObject() if cloned is None else cloned
        if "/First" in node:
            item._filtered_children = self._get_filtered_outline(
                node["/First"], pages, reader
            )
        else:
            item._filtered_children = []
        if (
            not isinstance(item["/Page"], NullObject)
            or len(item._filtered_children) > 0
        ):
            kept.append(item)
        node = node.get("/Next", None)
    return kept

3115

def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Create in the writer the outline item matching a destination.

    The title is always copied. When the page of ``dest`` is not null, the
    ``/A`` entry of the original node is cloned if it exists, else the
    destination array is stored in ``/Dest``. The ``/F`` and ``/C``
    entries are copied from the original node (with 0 and black as
    fallback values) when this node is available.

    Args:
        dest: destination to convert.

    Returns:
        The new outline item, already registered in the writer.

    """
    item = TreeObject()
    self._add_object(item)
    item[NameObject("/Title")] = TextStringObject(dest["/Title"])

    has_page = not isinstance(dest["/Page"], NullObject)
    source_node = dest.node
    if has_page:
        if source_node is not None and "/A" in source_node:
            item[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            item[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        item[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        item[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return item

3134

def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert recursively the filtered outline items below ``parent``.

    Args:
        outlines: items to insert; their ``_filtered_children`` are
            inserted below them.
        parent: outline item receiving the new items.
        before: forwarded to ``insert_child`` for the items of this level;
            the children are always inserted with None.

    """
    for dest in outlines:
        # TODO: can be improved to keep A and SE entries (ignored for the moment)
        # with np=self.add_outline_item_destination(dest,parent,before)
        is_item = dest.get("/Type", "") != "/Outlines" and "/Title" in dest
        target = parent
        if is_item:
            target = self._clone_outline(dest)
            cast(TreeObject, parent.get_object()).insert_child(target, before, self)
        # entries which are not real items hand their children to ``parent``
        self._insert_filtered_outline(dest._filtered_children, target, None)

3150

def close(self) -> None:
    """Do nothing; only implemented for API harmonization."""
    return None

3154

def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search an outline item and return its position in the outline tree.

    An entry matches when its indirect reference or its ``/Title`` is
    equal to ``outline_item``.

    Args:
        outline_item: value compared with the reference and the title of
            the entries.
        root: entry where the search starts; the outline root of the
            writer when None.

    Returns:
        The list of indexes leading to the item (one index per level,
        counted along the ``/Next`` links; levels without ``/Title`` are
        not reported), or None if the item is not found.

    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        found_here = (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        )
        if found_here:
            return [index]
        if "/First" in node:
            sub_path = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if sub_path:
                prefix = [index] if "/Title" in node else []
                return prefix + sub_path
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None

3183

def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> None:  # deprecated
    """
    Report the deprecation of this method; the arguments are ignored.

    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    replacement = "find_outline_item"
    deprecation_with_replacement("find_bookmark", replacement, "5.0.0")

3194

def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.

    Late cloning will create new independent objects.

    Args:
        reader: PdfReader or IndirectObject referencing a PdfReader object.
            if set to None or omitted, all tables will be reset.

    Raises:
        Exception: if ``reader`` is neither None, a PdfReader nor an
            IndirectObject.

    """
    if reader is None:
        self._id_translated = {}
        return
    if isinstance(reader, PdfReader):
        source = reader
    elif isinstance(reader, IndirectObject):
        source = reader.pdf
    else:
        # f-string required to report the offending value
        raise Exception(f"invalid parameter {reader}")
    # a reader without translation table is silently accepted
    self._id_translated.pop(id(source), None)

3222

def set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label, a decimal label starting from 1 is applied.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (included) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:

                   * ``/D`` Decimal Arabic numerals
                   * ``/R`` Uppercase Roman numerals
                   * ``/r`` Lowercase Roman numerals
                   * ``/A`` Uppercase letters (A to Z for the first 26 pages,
                     AA to ZZ for the next 26, and so on)
                   * ``/a`` Lowercase letters (a to z for the first 26 pages,
                     aa to zz for the next 26, and so on)

        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value,
               which must be greater than or equal to 1.
               The default value 0 means that no start value is stored.

    Raises:
        ValueError: if neither style nor prefix is given, if the range is
            invalid or exceeds the number of pages, or if start is lower
            than 1 (0 excepted).

    """
    problem: Optional[str] = None
    if style is None and prefix is None:
        problem = "At least one of style and prefix must be given"
    elif page_index_from < 0:
        problem = "page_index_from must be greater or equal than 0"
    elif page_index_to < page_index_from:
        problem = "page_index_to must be greater or equal than page_index_from"
    elif page_index_to >= len(self.pages):
        problem = "page_index_to exceeds number of pages"
    elif start is not None and start != 0 and start < 1:
        problem = "If given, start must be greater or equal than one"
    if problem is not None:
        raise ValueError(problem)

    self._set_page_label(page_index_from, page_index_to, style, prefix, start)

3273

def _set_page_label(
    self,
    page_index_from: int,
    page_index_to: int,
    style: Optional[PageLabelStyle] = None,
    prefix: Optional[str] = None,
    start: Optional[int] = 0,
) -> None:
    """
    Set a page label to a range of pages.

    Page indexes must be given starting from 0.
    Labels must have a style, a prefix or both.
    If a range is not assigned any page label a decimal label starting from 1 is applied.

    No validation is done on the arguments.

    Args:
        page_index_from: page index of the beginning of the range starting from 0
        page_index_to: page index of the end of the range (included) starting from 0
        style: The numbering style to be used for the numeric portion of each page label:
                    /D Decimal Arabic numerals
                    /R Uppercase Roman numerals
                    /r Lowercase Roman numerals
                    /A Uppercase letters (A to Z for the first 26 pages,
                       AA to ZZ for the next 26, and so on)
                    /a Lowercase letters (a to z for the first 26 pages,
                       aa to zz for the next 26, and so on)
        prefix: The label prefix for page labels in this range.
        start: The value of the numeric portion for the first page label
               in the range.
               Subsequent pages are numbered sequentially from this value.
               With 0 (default) or None, no ``/St`` entry is stored.

    """
    default_page_label = DictionaryObject()
    default_page_label[NameObject("/S")] = NameObject("/D")

    new_page_label = DictionaryObject()
    if style is not None:
        new_page_label[NameObject("/S")] = NameObject(style)
    if prefix is not None:
        new_page_label[NameObject("/P")] = TextStringObject(prefix)
    # None is accepted by the public method and cannot be converted into a
    # number: it is handled as 0, i.e. no explicit start value
    if start is not None and start != 0:
        new_page_label[NameObject("/St")] = NumberObject(start)

    if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object:
        # first label of the document: create the number tree with the
        # default label applied from the first page
        nums = ArrayObject()
        nums_insert(NumberObject(0), default_page_label, nums)
        page_labels = TreeObject()
        page_labels[NameObject("/Nums")] = nums
        self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

    page_labels = cast(
        TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)]
    )
    nums = cast(ArrayObject, page_labels[NameObject("/Nums")])

    nums_insert(NumberObject(page_index_from), new_page_label, nums)
    nums_clear_range(NumberObject(page_index_from), page_index_to, nums)
    next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums)
    if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages):
        # pages after the range and without label get the default one
        nums_insert(NumberObject(page_index_to + 1), default_page_label, nums)

    page_labels[NameObject("/Nums")] = nums
    self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels

3338

def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> Dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    The document is written into a memory buffer and this buffer is
    returned under the ``application/pdf`` mime-type, unless this type is
    filtered out by ``include`` or ``exclude``.

    Args:
        include: if given, only the mime-types listed are returned.
        exclude: if given, the mime-types listed are not returned.

    Returns:
        A dictionary that maps a mime-type to its representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    buffer = BytesIO()
    self.write(buffer)
    bundle: Dict[str, Any] = {"application/pdf": buffer}

    # the include list is checked first, then the exclude list
    return {
        mime: payload
        for mime, payload in bundle.items()
        if (include is None or mime in include)
        and (exclude is None or mime not in exclude)
    }

3369

3370

def _pdf_objectify(obj: Union[Dict[str, Any], str, float, List[Any]]) -> PdfObject:
    """
    Convert a Python value into the matching PDF object.

    PDF objects are returned unchanged. Dictionaries and lists are
    converted recursively, strings starting with ``/`` become names and the
    other strings become text strings, numbers (int included) become
    floats.

    Raises:
        NotImplementedError: if the type of ``obj`` is not handled.

    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        converted = DictionaryObject()
        for name, item in obj.items():
            converted[NameObject(name)] = _pdf_objectify(item)
        return converted
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (float, int)):
        return FloatObject(obj)
    if isinstance(obj, list):
        return ArrayObject(_pdf_objectify(item) for item in obj)
    raise NotImplementedError(
        f"{type(obj)=} could not be cast to a PdfObject"
    )

3390

3391

def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build an outline item which is not yet attached to a document.

    Args:
        action_ref: reference stored in ``/A``; nothing is stored if None.
        title: text stored in ``/Title``.
        color: color stored in ``/C``, as a tuple of components or as a
            string converted with ``hex_to_rgb``; ignored when empty.
        italic: set the italic flag in ``/F``.
        bold: set the bold flag in ``/F``.

    Returns:
        The new outline item.

    """
    item = TreeObject()
    if action_ref is not None:
        item[NameObject("/A")] = action_ref
    item.update({NameObject("/Title"): create_string_object(title)})

    if color:
        rgb = hex_to_rgb(color) if isinstance(color, str) else color
        item.update(
            {NameObject("/C"): ArrayObject([FloatObject(c) for c in rgb])}
        )

    if italic or bold:
        requested = (
            (OutlineFontFlag.italic, italic),
            (OutlineFontFlag.bold, bold),
        )
        # /F is the sum of the flags which are requested
        format_flag = sum(flag for flag, enabled in requested if enabled)
        item.update({NameObject("/F"): NumberObject(format_flag)})
    return item

3421

3422

def generate_appearance_stream(
    txt: str,
    sel: List[str],
    da: str,
    font_full_rev: Dict[str, bytes],
    rct: "RectangleObject",
    font_height: float,
    y_offset: float,
) -> bytes:
    """
    Build the content of the appearance stream showing a text in a field.

    The text is split into lines on ``\\r`` and ``\\n``; each line is shown
    with a ``Tj`` operator, the lines being ``font_height * 1.4`` apart.
    A line is written as a hexadecimal string as soon as one of its
    characters is encoded on two bytes or more, else as a literal string.

    Args:
        txt: text to display.
        sel: lines to highlight; a box is drawn around a line equal to one
            of these values.
        da: default appearance operators, inserted before the text and
            after each box.
        font_full_rev: encoding of the characters for the font in use;
            characters not listed are encoded in UTF-16-BE.
        rct: rectangle of the field; only its width and height are used.
        font_height: height of the font.
        y_offset: vertical position of the first line.

    Returns:
        The content stream, as bytes.

    """
    chunks: List[bytes] = [
        f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode()
    ]
    for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")):
        if line in sel:
            # may be improved but cannot find how to get fill working => replaced with lined box
            chunks.append(
                (
                    f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{da}\n"
                ).encode()
            )
        if line_number == 0:
            chunks.append(f"2 {y_offset} Td\n".encode())
        else:
            # Td is a relative translation
            chunks.append(f"0 {-font_height * 1.4} Td\n".encode())
        enc_line: List[bytes] = [
            font_full_rev.get(c, c.encode("utf-16-be")) for c in line
        ]
        raw = b"".join(enc_line)
        if any(len(c) >= 2 for c in enc_line):
            chunks.append(b"<" + raw.hex().encode() + b"> Tj\n")
        else:
            # backslashes and parentheses are special inside a literal
            # string: left as is, they would corrupt or end the string
            escaped = (
                raw.replace(b"\\", b"\\\\")
                .replace(b"(", b"\\(")
                .replace(b")", b"\\)")
            )
            chunks.append(b"(" + escaped + b") Tj\n")
    chunks.append(b"ET\nQ\nEMC\nQ\n")
    return b"".join(chunks)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_writer.py: 15%

1420 statements