Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

643 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# Copyright (c) 2024, Pubpub-ZZ 

4# 

5# All rights reserved. 

6# 

7# Redistribution and use in source and binary forms, with or without 

8# modification, are permitted provided that the following conditions are 

9# met: 

10# 

11# * Redistributions of source code must retain the above copyright notice, 

12# this list of conditions and the following disclaimer. 

13# * Redistributions in binary form must reproduce the above copyright notice, 

14# this list of conditions and the following disclaimer in the documentation 

15# and/or other materials provided with the distribution. 

16# * The name of the author may not be used to endorse or promote products 

17# derived from this software without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

29# POSSIBILITY OF SUCH DAMAGE. 

30 

31import struct 

32import zlib 

33from abc import abstractmethod 

34from collections.abc import Generator, Iterable, Iterator, Mapping 

35from datetime import datetime 

36from typing import ( 

37 Any, 

38 Optional, 

39 Union, 

40 cast, 

41) 

42 

43from ._encryption import Encryption 

44from ._page import PageObject, _VirtualList 

45from ._page_labels import index2label as page_index2page_label 

46from ._utils import ( 

47 deprecation_with_replacement, 

48 logger_warning, 

49 parse_iso8824_date, 

50) 

51from .constants import CatalogAttributes as CA 

52from .constants import CatalogDictionary as CD 

53from .constants import ( 

54 CheckboxRadioButtonAttributes, 

55 GoToActionArguments, 

56 PagesAttributes, 

57 UserAccessPermissions, 

58) 

59from .constants import Core as CO 

60from .constants import DocumentInformationAttributes as DI 

61from .constants import FieldDictionaryAttributes as FA 

62from .constants import PageAttributes as PG 

63from .errors import PdfReadError, PyPdfError 

64from .generic import ( 

65 ArrayObject, 

66 BooleanObject, 

67 ByteStringObject, 

68 Destination, 

69 DictionaryObject, 

70 EncodedStreamObject, 

71 Field, 

72 Fit, 

73 FloatObject, 

74 IndirectObject, 

75 NameObject, 

76 NullObject, 

77 NumberObject, 

78 PdfObject, 

79 TextStringObject, 

80 TreeObject, 

81 ViewerPreferences, 

82 create_string_object, 

83 is_null_or_none, 

84) 

85from .generic._files import EmbeddedFile 

86from .types import OutlineType, PagemodeType 

87from .xmp import XmpInformation 

88 

89 

def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode a big-endian byte string as a signed 64-bit integer.

    Args:
        d: The raw bytes. Input shorter than eight bytes is left-padded with
            zero bytes; only the last eight bytes are decoded.
        size: The declared width of the value in bytes.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Pad on the left, then keep exactly the eight trailing bytes for ">q".
    padded = (b"\x00" * 8 + d)[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

96 

97 

class DocumentInformation(DictionaryObject):
    """
    Dictionary wrapper exposing the basic document metadata of a PDF file.

    Instances are obtained through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every textual entry is available twice, e.g. ``author`` and
    ``author_raw``. The plain property goes through :meth:`_get_text` and
    yields text (or ``None``), which makes it the convenient choice for
    display. The ``*_raw`` property hands back the stored value untouched;
    that value can be a ``ByteStringObject`` when pypdf was unable to decode
    the string's text encoding, so callers of the raw variants need to handle
    that case themselves.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """Return the entry stored under ``key`` as text, else ``None``."""
        entry = self.get(key, None)
        if isinstance(entry, TextStringObject):
            return entry
        # Byte strings are converted with ``str``; anything else is ignored.
        return str(entry) if isinstance(entry, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns the text of the title, the resolved object when the entry is
        not a string object, or ``None`` if no (truthy) title is stored.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Unprocessed title entry; can be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        Returns the author as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Unprocessed author entry; can be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        Returns the subject as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Unprocessed subject entry; can be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For a document converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        Returns the creator as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Unprocessed creator entry; can be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For a document converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the producer as text, or ``None`` if it is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Unprocessed producer entry; can be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the document's creation date, parsed."""
        raw_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(raw_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Unprocessed creation date entry; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date, parsed.

        This is the date and time the document was most recently modified.
        """
        raw_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(raw_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Unprocessed modification date entry; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        Returns the keywords as text, or ``None`` if they are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Unprocessed keywords entry; can be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

256 

257 

258class PdfDocCommon: 

259 """ 

260 Common functions from PdfWriter and PdfReader objects. 

261 

262 This root class is strongly abstracted. 

263 """ 

264 

265 strict: bool = False # default 

266 

267 flattened_pages: Optional[list[PageObject]] = None 

268 

269 _encryption: Optional[Encryption] = None 

270 

271 _readonly: bool = False 

272 

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Abstract property supplying the dictionary the other methods of this
    class treat as the document catalog (they look up entries such as
    ``/Pages`` and ``/OpenAction`` in it).
    """
    ... # pragma: no cover

277 

@property
@abstractmethod
def pdf_header(self) -> str:
    """
    Abstract property returning a string; concrete classes supply the value.

    NOTE(review): presumably the header line of the PDF file (version
    marker) -- confirm against the implementing classes.
    """
    ... # pragma: no cover

282 

@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Abstract method to be implemented by concrete classes.

    Args:
        indirect_reference: An integer or an ``IndirectObject`` identifying
            the object to fetch (presumably an object number when an integer
            is given -- confirm against the implementing classes).

    Returns:
        A ``PdfObject`` or ``None``, per the annotation.

    """
    ... # pragma: no cover

288 

@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Abstract method to be implemented by concrete classes.

    Within this class it is called with an object's own indirect reference
    and a replacement for that object (see ``viewer_preferences``).

    Args:
        indirect: The indirect reference designating the object to replace.
        obj: The replacement object.

    Returns:
        A ``PdfObject``, per the annotation.

    """
    ... # pragma: no cover

292 

@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Abstract property returning the dictionary that ``metadata`` copies
    into a ``DocumentInformation``, or ``None`` when there is none.
    """
    ... # pragma: no cover

297 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new ``DocumentInformation`` filled with the entries of ``_info``,
        or ``None`` if ``_info`` is ``None``.

    """
    # Read the property once: it is supplied by subclasses and both the
    # None check and the copy must see the same value.
    info = self._info
    if info is None:
        return None
    retval = DocumentInformation()
    retval.update(info)
    return retval

312 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    Placeholder property: the body is empty, so this base implementation
    returns ``None``.

    NOTE(review): presumably overridden by the concrete reader/writer
    classes -- confirm.
    """
    ... # pragma: no cover

316 

@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the existing ViewerPreferences as an overloaded dictionary.

    When the stored entry is not yet a ``ViewerPreferences`` instance it is
    wrapped in one, and the wrapper is written back: through
    ``_replace_object`` if it carries an indirect reference, otherwise
    directly into the catalog. ``None`` is returned when the catalog has no
    such entry.
    """
    entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if entry is None:
        return None
    prefs = entry.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is not None:
        self._replace_object(reference, prefs)
    else:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    return prefs

331 

def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    # Flattening the page tree does not work on an encrypted PDF, so the
    # count recorded in the page tree root is reported in that case.
    if self.is_encrypted:
        return self.root_object["/Pages"]["/Count"]  # type: ignore
    # Otherwise count the flattened pages, building the list on first use.
    pages = self.flattened_pages
    if pages is None:
        self._flatten(self._readonly)
        pages = self.flattened_pages
    assert pages is not None
    return len(pages)

352 

def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    pages = self.flattened_pages
    if pages is None:
        # Build the flat page list on first access.
        self._flatten(self._readonly)
        pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]

370 

def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Locate the page-tree node whose /Kids array directly holds the page.

    Returns the node together with the page's position inside that node's
    /Kids. If page_number is greater than the number of pages, the top node
    and -1 are returned.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def locate(
        node: DictionaryObject, first_index: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``first_index`` is the page number of the first page under ``node``.
        span = cast(int, node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":
            if page_number == first_index:
                # Found; -1 tells the caller "it is one of your direct kids".
                return node, -1
            return None, first_index + 1
        if (page_number - first_index) >= span:  # not in nodes below
            if node == top:
                return top, -1
            return None, first_index + span
        for position, child in enumerate(cast(ArrayObject, node["/Kids"])):
            child = cast(DictionaryObject, child.get_object())
            found, marker = locate(child, first_index)
            if found is not None:  # page has just been found ...
                if marker < 0:  # ... just below!
                    return node, position
                # ... at lower levels
                return found, marker
            first_index = marker
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = locate(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx

407 

@property
def named_destinations(self) -> dict[str, Destination]:
    """Read-only mapping from destination names to destinations."""
    destinations = self._get_named_destinations()
    return destinations

412 

def get_named_dest_root(self) -> ArrayObject:
    """
    Return the /Names array of the catalog's /Names -> /Dests dictionary.

    Missing levels are created when this object offers ``_add_object``;
    without it, a fresh array that is not attached to the document is
    returned.
    """
    named_dest = ArrayObject()
    has_names = CA.NAMES in self.root_object and isinstance(
        self.root_object[CA.NAMES], DictionaryObject
    )

    if not has_names:
        if hasattr(self, "_add_object"):
            # Build the whole /Names -> /Dests -> /Names chain from scratch.
            names = DictionaryObject()
            self.root_object[NameObject(CA.NAMES)] = self._add_object(names)
            dests = DictionaryObject()
            names[NameObject(CA.DESTS)] = self._add_object(dests)
            dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    names = cast(DictionaryObject, self.root_object[CA.NAMES])
    if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
        # §3.6.3 Name Dictionary (PDF spec 1.7)
        dests = cast(DictionaryObject, names[CA.DESTS])
        if CA.NAMES in dests:
            # §7.9.6, entries in a name tree node dictionary
            return cast(ArrayObject, dests[CA.NAMES])
        dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    if hasattr(self, "_add_object"):
        dests = DictionaryObject()
        names[NameObject(CA.DESTS)] = self._add_object(dests)
        dests[NameObject(CA.NAMES)] = named_dest
    return named_dest

445 

446 ## common 

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        # Top-level call: start a fresh result and find the tree ourselves.
        retval = {}
        catalog = self.root_object
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            name_dict = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in name_dict:
                tree = cast(TreeObject, name_dict[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # Intermediate node: descend into every kid.
        for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
        return retval

    # §7.9.6, entries in a name tree node dictionary
    if CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        # The array alternates key, value, key, value, ...
        entries = cast(ArrayObject, tree[CA.NAMES])
        pos = 0
        while pos < len(entries):
            raw_key = entries[pos].get_object()
            pos += 1
            if not isinstance(raw_key, (bytes, str)):
                # Not a usable key: move on by a single slot only.
                continue
            key = str(raw_key)
            try:
                value = entries[pos].get_object()
            except IndexError:
                # Trailing key without a value.
                break
            pos += 1
            if isinstance(value, DictionaryObject):
                if "/D" not in value:
                    continue
                value = value["/D"]
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[key] = dest
        return retval

    # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
    for name, raw in tree.items():
        target = raw.get_object()
        if isinstance(target, DictionaryObject):
            if "/D" not in target:
                continue
            target = target["/D"].get_object()
        dest = self._build_destination(name, target)
        if dest is not None:
            retval[name] = dest
    return retval

518 

519 # A select group of relevant field attributes. For the complete list, 

520 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification. 

521 

def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects. A new list is started
            when none is supplied.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
    if retval is None:
        # Top-level call: always start from the catalog's AcroForm entry.
        retval = {}
        catalog = self.root_object
        stack = []
        # get the AcroForm tree
        if CD.ACRO_FORM in catalog:
            tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
        else:
            return None
    if tree is None:
        return retval
    if stack is None:
        # A caller may pass ``retval`` without ``stack``; do not rely on an
        # assert (stripped under -O) to guard the list used for cycle checks.
        stack = []
    if "/Fields" in tree:
        fields = cast(ArrayObject, tree["/Fields"])
        for f in fields:
            field = f.get_object()
            self._build_field(field, retval, fileobj, field_attributes, stack)
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval

571 

572 def _get_qualified_field_name(self, parent: DictionaryObject) -> str: 

573 if "/TM" in parent: 

574 return cast(str, parent["/TM"]) 

575 if "/Parent" in parent: 

576 return ( 

577 self._get_qualified_field_name( 

578 cast(DictionaryObject, parent["/Parent"]) 

579 ) 

580 + "." 

581 + cast(str, parent.get("/T", "")) 

582 ) 

583 return cast(str, parent.get("/T", "")) 

584 

def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register one field in ``retval`` and then process its kids.

    Objects carrying neither ``/T`` nor ``/TM`` are ignored. For choice
    fields and buttons a ``/_States_`` entry listing the possible states is
    added to the stored ``Field``.

    Args:
        field: The field dictionary to register.
        retval: Mapping of qualified field names to ``Field`` objects;
            updated in place.
        fileobj: Optional report target; when truthy, the field is written
            to it followed by a newline.
        field_attributes: Attribute-to-label mapping handed to
            ``_write_field``.
        stack: Objects already visited, handed to ``_check_kids``.

    """
    if "/T" not in field and "/TM" not in field:
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")

    entry = Field(field)
    retval[key] = entry
    obj = entry.indirect_reference.get_object()  # to get the full object
    states_key = NameObject("/_States_")
    field_type = obj.get(FA.FT, "")

    if field_type == "/Ch":
        entry[states_key] = obj[NameObject(FA.Opt)]

    if field_type == "/Btn" and "/AP" in obj:
        # Checkbox
        entry[states_key] = ArrayObject(list(obj["/AP"]["/N"].keys()))
        if "/Off" not in entry["/_States_"]:
            entry[states_key].append(NameObject("/Off"))
    elif field_type == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        states: list[str] = []
        entry[states_key] = ArrayObject(states)
        for kid in obj.get(FA.Kids, {}):
            kid = kid.get_object()
            for state in list(kid["/AP"]["/N"].keys()):
                if state not in states:
                    states.append(state)
            entry[states_key] = ArrayObject(states)
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in entry["/_States_"]
        ):
            current = entry["/_States_"]
            del current[current.index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)

626 

def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Run ``get_fields`` on every kid of ``tree``, once per tree.

    ``stack`` records the objects already handled; a ``tree`` found in it is
    reported with a warning and not processed again.
    """
    if tree in stack:
        logger_warning(
            f"{self._get_qualified_field_name(tree)} already parsed", __name__
        )
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for kid in tree[PagesAttributes.KIDS]:  # type: ignore
        self.get_fields(kid.get_object(), retval, fileobj, stack)

645 

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``label: value`` line to ``fileobj`` per attribute of ``field``.

    Kids and additional-actions attributes are left out, the field type is
    spelled out in words, and a parent is represented by its name.
    Attributes the field does not have are skipped silently.
    """
    # Make the field type value clearer
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (FA.Kids, FA.AA)
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                kind = field[attr]
                if kind in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[kind]}\n")
            elif attr == FA.Parent:
                # Let's just write the name of the parent
                try:
                    name = field[attr][FA.TM]
                except KeyError:
                    name = field[attr][FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass

682 

def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: to get full name

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field.

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(name: str, seen: dict[Any, Any]) -> str:
        # First occurrence keeps the bare name; later ones get ".2", ".3", ...
        if name not in seen:
            return name
        prefix = name + "."
        duplicates = sum(1 for existing in seen if existing.startswith(prefix))
        return prefix + str(duplicates + 2)

    # Retrieve document form fields
    formfields = self.get_fields()
    if formfields is None:
        return {}

    ff = {}
    for qualified_name, value in formfields.items():
        if value.get("/FT") != "/Tx":
            continue
        if full_qualified_name:
            target = qualified_name
        else:
            target = indexed_key(cast(str, value["/T"]), ff)
        ff[target] = value.get("/V")
    return ff

720 

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If ``field`` cannot be resolved to an object, or if no
            ``/FT`` entry is found on it or on its ancestors.

    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object, then up the /Parent chain.
        if key in obj:
            return obj[key]
        if "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        return None

    def _pages_of_widget(widget: Any) -> list[Any]:
        # Prefer the widget's own /P entry; otherwise search every page's
        # annotations for the widget's reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        reference = widget.indirect_reference
        # The fallback must be a container accepting any operand: with ""
        # the membership test raises TypeError on pages lacking /Annots.
        return [p for p in self.pages if reference in p.get("/Annots", ())]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_get_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    ret: list[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)
    # Anything that is not already a PageObject is mapped to the matching
    # entry of self.pages through its indirect reference.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]

790 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry is turned
    into a ``Destination`` titled ``"OpenAction"``; any other kind of entry
    yields ``None``.

    Raises:
        Exception: If a destination is invalid. The original error is
            attached as the cause.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if not isinstance(oa, ArrayObject):
        return None
    try:
        page, typ, *array = oa
        fit = Fit(typ, tuple(array))
        return Destination("OpenAction", page, fit)
    except Exception as exc:
        # Chain explicitly so the traceback shows the underlying failure as
        # the direct cause rather than as an incidental context.
        raise Exception(f"Invalid Destination {oa}: {exc}") from exc

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Setting is not supported here; always raises NotImplementedError."""
    raise NotImplementedError("No setter for open_destination")

824 

@property
def outline(self) -> OutlineType:
    """
    Read-only access to the document outline, i.e. the collection of
    'outline items' (also known as 'bookmarks') present in the document.
    """
    items = self._get_outline()
    return items

833 

def _get_outline(
    self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
) -> OutlineType:
    """
    Collect the outline items reachable from ``node`` into ``outline``.

    Called without arguments, the walk starts at the ``/First`` entry of the
    catalog's outline dictionary and the named destinations are cached on
    ``self._named_destinations`` first. Children of an item are gathered in
    a nested list placed right after that item.

    Args:
        node: The outline item to start from (used for nested calls).
        outline: The list to extend (used for nested calls).

    Returns:
        The list of outline items, with nested lists for sub-outlines.

    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            outlines_dict = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(outlines_dict, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(outlines_dict) and "/First" in outlines_dict:
                node = cast(DictionaryObject, outlines_dict["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # Walk the sibling chain through /Next until it ends.
    current = node
    while True:
        item = self._build_outline_item(current)
        if item:
            outline.append(item)

        # check for sub-outline
        if "/First" in current:
            children: list[Any] = []
            self._get_outline(cast(DictionaryObject, current["/First"]), children)
            if children:
                outline.append(children)

        if "/Next" not in current:
            break
        current = cast(DictionaryObject, current["/Next"])

    return outline

874 

@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    It is an array of dictionaries with "/F" (the first bead in the thread)
    and "/I" (a thread information dictionary containing information about
    the thread, such as its title, author, and creation date) properties or
    None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    if CO.THREADS not in catalog:
        return None
    return cast("ArrayObject", catalog[CO.THREADS])

895 

@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Abstract method to be implemented by concrete classes.

    Within this class it receives the indirect reference of a page (or the
    page entry of a destination) and its result is used as a page number.

    Args:
        indirect_reference: Reference designating the page; ``None``, an
            integer, a ``NullObject`` or an ``IndirectObject``.

    Returns:
        An integer or ``None``, per the annotation.

    """
    ... # pragma: no cover

901 

def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    reference = page.indirect_reference
    return self._get_page_number_by_indirect(reference)

915 

def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    target_page = destination.page
    return self._get_page_number_by_indirect(target_page)

928 

def _build_destination(
    self,
    title: str,
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` titled ``title`` from a destination array.

    Args:
        title: The title given to the destination.
        array: The destination array: page, fit type, then the fit
            arguments. ``None``, a null object, a string or an empty array
            produce a destination with a null page and the default fit.

    Returns:
        The destination. When the fit cannot be built and ``strict`` is off,
        a warning is logged and a destination pointing at the first page is
        returned instead.

    Raises:
        PdfReadError: If the fit cannot be built and ``strict`` is set.

    """
    # handle outline items with missing or invalid destination
    is_missing = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if is_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *fit_args = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=fit_args))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {fit_args}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        if first_page_ref is None:
            first_page_ref = NullObject()
        return Destination(title, first_page_ref, Fit.fit())

958 

def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Turn an outline item dictionary into a Destination.

    The target is taken from a ``/GoTo`` action under ``/A`` or, failing
    that, from ``/Dest``. Colour (``/C``), style (``/F``) and child count
    (``/Count``) are copied onto the result when the node has them.

    Args:
        node: The outline item dictionary.

    Returns:
        The Destination built by ``_build_destination`` for this item.

    Raises:
        PdfReadError: In strict mode, if ``/Title`` is missing, a ``/GoTo``
            action has no ``/D`` entry, or the destination has an
            unexpected type.

    """
    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        title = ""

    target = None
    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_kind = cast(NameObject, action[GoToActionArguments.S])
        if action_kind == "/GoTo":
            if GoToActionArguments.D in action:
                target = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        target = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(target, DictionaryObject) and "/D" in target:
            target = target["/D"]

    if target is None or isinstance(target, ArrayObject):
        # explicit destination array, or no destination at all: an outline
        # item is not required to have destination or action
        # (PDFv1.7 Table 153)
        item = self._build_destination(title, target)
    elif isinstance(target, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            item = self._build_destination(
                title, self._named_destinations[target].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            item = self._build_destination(title, None)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {target!r}")
        logger_warning(
            f"Removed unexpected destination {target!r} from destination",
            __name__,
        )
        item = self._build_destination(title, None)

    if not item:
        return item

    # outline item created: add color, format, and child count if present
    if "/C" in node:
        # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
        item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
    # /F: style characteristics, 1=italic, 2=bold, 3=both
    # /Count: absolute value = num. visible children,
    #         positive = open/unfolded, negative = closed/folded
    for key in ("/F", "/Count"):
        if key in node:
            item[NameObject(key)] = node[key]
    # if count is 0 we will consider it as open (to have available is_open)
    is_open = node.get("/Count", 0) >= 0
    item[NameObject("/%is_open%")] = BooleanObject(is_open)
    item.node = node
    try:
        item.indirect_reference = node.indirect_reference
    except AttributeError:
        pass
    return item

1035 

@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>`
    entries of the document; a single page or a range of pages can be
    retrieved from it.

    Note:
        For PdfWriter only: a page or a range of pages can also be removed
        from the list with the del operator. Remember that only the page
        entry is removed, because the objects beneath it may be used
        elsewhere. To get rid of them completely - if nothing else uses
        them - write to a buffer/temporary file and load the result into a
        new PdfWriter.

    """
    page_view = _VirtualList(self.get_num_pages, self.get_page)
    return page_view  # type: ignore

1052 

@property
def page_labels(self) -> list[str]:
    """
    Labels of all pages of this document, in document order.

    This property is read-only.
    """
    page_count = len(self.pages)
    return [page_index2page_label(self, index) for index in range(page_count)]

1062 

@property
def page_layout(self) -> Optional[str]:
    """
    Page layout stored in the document catalog, or None if there is none.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # the catalog has no page layout entry
        return None
    return cast(NameObject, layout)

1090 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode stored under ``/PageMode`` in the document catalog, or None
    if there is none.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # the catalog has no /PageMode entry
        return None
    return mode  # type: ignore

1116 

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: When true, the PageObject entries appended to
            ``flattened_pages`` are not filled from the page dictionary.
        pages: Page tree node to process. None starts at the catalog's
            ``/Pages`` entry and resets ``flattened_pages``.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. Used recursively; never modified in place.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog's ``/Pages`` entry is not a
            dictionary, or if the recursion limit is hit while descending.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: attributes defined by this node apply
        # only to its own descendants. Updating the caller's dict in place
        # would leak them into sibling subtrees processed afterwards.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore

1197 

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Drop one page from the pages list.

    Nothing is removed (a warning is logged instead) if the reference does
    not point to a page, the page cannot be found, or the page number is
    out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    target = page
    if isinstance(target, IndirectObject):
        resolved = target.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        target = resolved

    if isinstance(target, int):
        index = target
    else:
        try:
            index = self.flattened_pages.index(target)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return
    if index < 0 or index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # remember the reference before the entry disappears from the list
    reference = self.pages[index].indirect_reference
    del self.pages[index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())

1241 

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper: resolve an indirect object by its numbers.

    Same as ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1257 

def decode_permissions(
    self, permissions_code: int
) -> dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permissions code into named access flags.

    Deprecated in favour of ``user_access_permissions``; a deprecation
    notice is issued on every call.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    named_flags = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    decoded = {}
    for name, flag in named_flags:
        # a permission is granted when its bit is set in the code
        decoded[name] = (permissions_code & flag) != 0
    return decoded

1284 

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; None if it is not encrypted."""
    encryption = self._encryption
    if encryption is None:
        return None
    return UserAccessPermissions(encryption.P)

1291 

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only boolean property).

    Once true, the value stays true even after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    ...  # pragma: no cover

1302 

@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    XFA packets found under ``/AcroForm`` -> ``/XFA`` in the catalog.

    ``/XFA`` is read as a flat sequence of alternating tags and values.
    Only values that are indirect references to a non-empty object are
    kept; their raw data is inflated with ``zlib.decompress``.
    NOTE(review): this presumably assumes Flate-compressed streams -
    confirm whether other filters can occur here.

    Returns:
        None if the catalog has no (or an empty) ``/AcroForm`` entry,
        otherwise a dictionary mapping each tag to the decompressed bytes
        (empty when there is no ``/XFA`` entry).

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: dict[str, Any] = {}

    if "/XFA" in tree:
        fields = cast(ArrayObject, tree["/XFA"])
        entries = iter(fields)
        # Zipping the iterator with itself yields consecutive (tag, value)
        # pairs. A dangling tag at the end of an odd-length array is
        # ignored; calling next() by hand would raise StopIteration there.
        for tag, value in zip(entries, entries):
            if isinstance(value, IndirectObject):
                field = cast(Optional[EncodedStreamObject], value.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval

1326 

@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """Mapping of attachment filenames to their content."""
    # Each value is a (loader, argument) pair; the LazyDict calls the
    # loader only when the corresponding key is looked up.
    loaders = {}
    for name in self._list_attachments():
        loaders[name] = (self._get_attachment_list, name)
    return LazyDict(loaders)

1336 

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Iterable of attachment objects."""
    embedded_files = EmbeddedFile._load(self.root_object)
    yield from embedded_files

1341 

def _list_attachments(self) -> list[str]:
    """
    Collect the filenames of all file attachments.

    For every attachment its name is listed, followed by its alternative
    name when that one is non-empty and different.

    Returns:
        list of filenames

    """
    collected: list[str] = []
    for attachment in self.attachment_list:
        primary = attachment.name
        collected.append(primary)
        alternative = attachment.alternative_name
        if alternative and alternative != primary:
            collected.append(alternative)
    return collected

1356 

def _get_attachment_list(self, name: str) -> list[bytes]:
    """Return the content stored under ``name`` as a list, wrapping a single value."""
    found = self._get_attachments(name)[name]
    return found if isinstance(found, list) else [found]

1362 

def _get_attachments(
    self, filename: Optional[str] = None
) -> dict[str, Union[bytes, list[bytes]]]:
    """
    Collect all, or one selected, file attachment(s) of the PDF, keyed by
    file name with the file data as a bytestring.

    Args:
        filename: If filename is None, then a dictionary of all attachments
            will be returned, where the key is the filename and the value
            is the content. Otherwise, a dictionary with just a single key
            - the filename - and its content will be returned.

    Returns:
        dictionary of filename -> Union[bytestring or List[ByteString]]
        If the filename exists multiple times a list of the different versions will be provided.

    """
    collected: dict[str, Union[bytes, list[bytes]]] = {}
    for entry in self.attachment_list:
        alternative = entry.alternative_name
        if filename is None:
            keys = {entry.name, alternative}
        elif filename == entry.name:
            keys = {entry.name}
        elif filename == alternative:
            keys = {alternative}
        else:
            # a specific file was requested and this entry is not it
            continue

        for key in keys:
            if key is None:
                continue
            if key not in collected:
                collected[key] = entry.content
                continue
            # repeated name: promote the stored value to a list of versions
            versions = collected[key]
            if not isinstance(versions, list):
                versions = [versions]
                collected[key] = versions
            versions.append(entry.content)
    return collected

1404 

@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Hook for displaying the document in Jupyter Notebooks.

    Returns a dictionary mapping each mime-type to the representation of
    the document in that type.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    ...  # pragma: no cover

1422 

1423 

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on every lookup.

    The constructor arguments are passed to ``dict``; each stored value
    must be a ``(callable, argument)`` pair. Looking up a key returns
    ``callable(argument)``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        producer, argument = self._raw_dict[key]
        return producer(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict.keys())

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        key_list = list(self.keys())
        return f"LazyDict(keys={key_list})"

1439 return f"LazyDict(keys={list(self.keys())})"