Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

643 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# Copyright (c) 2024, Pubpub-ZZ 

4# 

5# All rights reserved. 

6# 

7# Redistribution and use in source and binary forms, with or without 

8# modification, are permitted provided that the following conditions are 

9# met: 

10# 

11# * Redistributions of source code must retain the above copyright notice, 

12# this list of conditions and the following disclaimer. 

13# * Redistributions in binary form must reproduce the above copyright notice, 

14# this list of conditions and the following disclaimer in the documentation 

15# and/or other materials provided with the distribution. 

16# * The name of the author may not be used to endorse or promote products 

17# derived from this software without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

29# POSSIBILITY OF SUCH DAMAGE. 

30 

31import struct 

32import zlib 

33from abc import abstractmethod 

34from datetime import datetime 

35from typing import ( 

36 Any, 

37 Dict, 

38 Generator, 

39 Iterable, 

40 Iterator, 

41 List, 

42 Mapping, 

43 Optional, 

44 Tuple, 

45 Union, 

46 cast, 

47) 

48 

49from ._encryption import Encryption 

50from ._page import PageObject, _VirtualList 

51from ._page_labels import index2label as page_index2page_label 

52from ._utils import ( 

53 deprecate_with_replacement, 

54 logger_warning, 

55 parse_iso8824_date, 

56) 

57from .constants import CatalogAttributes as CA 

58from .constants import CatalogDictionary as CD 

59from .constants import ( 

60 CheckboxRadioButtonAttributes, 

61 GoToActionArguments, 

62 UserAccessPermissions, 

63) 

64from .constants import Core as CO 

65from .constants import DocumentInformationAttributes as DI 

66from .constants import FieldDictionaryAttributes as FA 

67from .constants import PageAttributes as PG 

68from .constants import PagesAttributes as PA 

69from .errors import PdfReadError, PyPdfError 

70from .generic import ( 

71 ArrayObject, 

72 BooleanObject, 

73 ByteStringObject, 

74 Destination, 

75 DictionaryObject, 

76 EncodedStreamObject, 

77 Field, 

78 Fit, 

79 FloatObject, 

80 IndirectObject, 

81 NameObject, 

82 NullObject, 

83 NumberObject, 

84 PdfObject, 

85 TextStringObject, 

86 TreeObject, 

87 ViewerPreferences, 

88 create_string_object, 

89 is_null_or_none, 

90) 

91from .generic._files import EmbeddedFile 

92from .types import OutlineType, PagemodeType 

93from .xmp import XmpInformation 

94 

95 

def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]:
    """
    Decode a big-endian byte string as a signed 64-bit integer.

    Args:
        d: The raw bytes to decode. Only the last 8 bytes are used; shorter
            input is left-padded with zero bytes.
        size: The declared width in bytes. It is only checked against the
            8-byte limit and does not otherwise influence the decoding.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Pad on the left, then keep the trailing 8 bytes so that struct always
    # receives exactly the 8 bytes the ">q" format requires.
    padded = (b"\x00" * 8 + d)[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

102 

103 

class DocumentInformation(DictionaryObject):
    """
    Dictionary-backed view of the basic metadata stored in a PDF file.

    Instances are reachable through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Each textual entry is exposed through *two* properties, for example
    ``author`` and ``author_raw``. The plain property returns text (or
    ``None``) and is the convenient choice for display. The ``_raw`` property
    returns whatever object is stored under the key, which may be a
    ``ByteStringObject`` when the text encoding could not be decoded, so
    callers of the raw variant must be prepared for that.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Return the entry stored under ``key`` as text.

        A ``TextStringObject`` is returned unchanged, a ``ByteStringObject``
        is converted with ``str()``, and anything else (including a missing
        entry) yields ``None``.
        """
        stored = self.get(key, None)
        if isinstance(stored, TextStringObject):
            return stored
        if isinstance(stored, ByteStringObject):
            return str(stored)
        return None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only property accessing the document's title.

        Returns ``None`` when the title entry is missing or falsy. Otherwise
        the text form of the entry is returned; if no text form is available
        the resolved object stored under the title key is returned instead.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """The stored title entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only property accessing the document's author.

        Returns the author as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """The stored author entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only property accessing the document's subject.

        Returns the subject as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """The stored subject entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only property accessing the document's creator.

        For a document converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        Returns the creator as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """The stored creator entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only property accessing the document's producer.

        For a document converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the producer as text, or ``None`` if it is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """The stored producer entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """
        Read-only property accessing the document's creation date.

        The text of the creation-date entry is handed to
        ``parse_iso8824_date`` and its result is returned.
        """
        text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        The stored creation-date entry as-is; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only property accessing the document's modification date.

        This is the date and time the document was most recently modified.
        The text of the entry is handed to ``parse_iso8824_date`` and its
        result is returned.
        """
        text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        The stored modification-date entry as-is; can be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only property accessing the document's keywords.

        Returns the keywords as text, or ``None`` if they are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """The stored keywords entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

262 

263 

264class PdfDocCommon: 

265 """ 

266 Common functions from PdfWriter and PdfReader objects. 

267 

268 This root class is strongly abstracted. 

269 """ 

270 

271 strict: bool = False # default 

272 

273 flattened_pages: Optional[List[PageObject]] = None 

274 

275 _encryption: Optional[Encryption] = None 

276 

277 _readonly: bool = False 

278 

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Return the document's root dictionary.

    Other methods of this class use the returned dictionary as the PDF
    catalog and look up entries such as ``/Pages`` in it. Concrete
    subclasses must provide the implementation.
    """
    ...  # pragma: no cover

283 

@property
@abstractmethod
def pdf_header(self) -> str:
    """
    Return the PDF header of the document as a string.

    Concrete subclasses must provide the implementation.
    NOTE(review): presumably this is the ``%PDF-x.y`` version line; confirm
    against the subclasses.
    """
    ...  # pragma: no cover

288 

@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Look up an object of the document.

    Args:
        indirect_reference: Either an integer or an ``IndirectObject``
            designating the object to fetch.

    Returns:
        The corresponding ``PdfObject``, or ``None``.
        NOTE(review): presumably ``None`` means the object could not be
        resolved; confirm against the subclasses.
    """
    ...  # pragma: no cover

294 

@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Replace the object designated by ``indirect`` with ``obj``.

    Concrete subclasses must provide the implementation. Within this class
    it is used to store a freshly wrapped ``ViewerPreferences`` object back
    under its existing indirect reference.
    """
    ...  # pragma: no cover

298 

@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Return the dictionary that backs the ``metadata`` property.

    ``metadata`` copies the entries of this dictionary into a
    ``DocumentInformation`` and returns ``None`` when this property is
    ``None``. Concrete subclasses must provide the implementation.
    """
    ...  # pragma: no cover

303 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new ``DocumentInformation`` holding a copy of the entries of
        ``self._info``, or ``None`` when ``self._info`` is ``None``.

    """
    # ``_info`` is a property supplied by the subclass; read it once so the
    # None check and the copy are guaranteed to see the same object.
    info = self._info
    if info is None:
        return None
    # Only allocate the result once we know there is something to copy.
    document_info = DocumentInformation()
    document_info.update(info)
    return document_info

318 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    Placeholder for the document's XMP metadata.

    The body is empty, so this base implementation evaluates to ``None``.
    NOTE(review): presumably overridden by the concrete reader/writer
    classes; confirm there.
    """
    ...  # pragma: no cover

322 

@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the catalog's viewer preferences as a ``ViewerPreferences`` object.

    Returns ``None`` when the catalog has no viewer-preferences entry. When
    the stored object is not yet a ``ViewerPreferences`` instance it is
    wrapped in one, and the wrapper is stored back into the document: under
    its indirect reference when it has one, otherwise directly in the
    catalog.
    """
    entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if entry is None:
        return None

    prefs = entry.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    # Upgrade the plain object and make the document point at the wrapper.
    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs

337 

def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    if not self.is_encrypted:
        # Regular case: count the flattened page list, building it on
        # first use.
        if self.flattened_pages is None:
            self._flatten(self._readonly)
        pages = self.flattened_pages
        assert pages is not None
        return len(pages)

    # Flattening does not work on an encrypted PDF, so fall back to the
    # page count recorded in the page tree root.
    page_tree = self.root_object["/Pages"]
    return page_tree["/Count"]  # type: ignore

358 

def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    pages = self.flattened_pages
    if pages is None:
        # Build the flattened page list on first access.
        self._flatten(self._readonly)
        pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]

376 

def _get_page_in_node(
    self,
    page_number: int,
) -> Tuple[DictionaryObject, int]:
    """
    Locate the page-tree node whose /Kids directly contains a page.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, position)`` where ``position`` is the index of the
        page within ``node["/Kids"]``. If ``page_number`` is greater than the
        number of pages, the top node and ``-1`` are returned.

    Raises:
        PyPdfError: If a node claims to contain the page but none of its
            kids does.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def _search(
        node: DictionaryObject, first_index: int
    ) -> Tuple[Optional[PdfObject], int]:
        # ``first_index`` is the index of the first page below ``node``.
        page_count = cast(int, node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":
            if page_number == first_index:
                return node, -1
            return None, first_index + 1
        if (page_number - first_index) >= page_count:
            # The requested page is not below this node.
            if node == top:
                return top, -1
            return None, first_index + page_count
        for position, kid_ref in enumerate(cast(ArrayObject, node["/Kids"])):
            kid = cast(DictionaryObject, kid_ref.get_object())
            found, index = _search(kid, first_index)
            if found is not None:
                # A negative index means the kid itself was the page, so
                # this node is the direct parent; otherwise the answer was
                # produced at a lower level and is passed through.
                if index < 0:
                    return node, position
                return found, index
            first_index = index
        raise PyPdfError("Unexpectedly cannot find the node.")

    found_node, found_position = _search(top, 0)
    assert isinstance(found_node, DictionaryObject), "mypy"
    return found_node, found_position

413 

@property
def named_destinations(self) -> Dict[str, Destination]:
    """
    Read-only mapping from destination names to destinations.

    The mapping is rebuilt by ``_get_named_destinations`` on every access.
    """
    destinations = self._get_named_destinations()
    return destinations

418 

def get_named_dest_root(self) -> ArrayObject:
    """
    Return the array that holds the document's named destinations.

    The array is the ``/Names`` entry of the ``/Dests`` dictionary found in
    the catalog's ``/Names`` dictionary. Missing levels are created (and
    registered through ``_add_object``) only when this object offers an
    ``_add_object`` method; otherwise a detached empty array is returned.
    """
    named_dest = ArrayObject()

    names_is_dict = CA.NAMES in self.root_object and isinstance(
        self.root_object[CA.NAMES], DictionaryObject
    )
    if not names_is_dict:
        if hasattr(self, "_add_object"):
            # Nothing usable in the catalog: build /Names -> /Dests -> /Names.
            names = DictionaryObject()
            names_ref = self._add_object(names)
            self.root_object[NameObject(CA.NAMES)] = names_ref
            dests = DictionaryObject()
            dests_ref = self._add_object(dests)
            names[NameObject(CA.DESTS)] = dests_ref
            dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    names = cast(DictionaryObject, self.root_object[CA.NAMES])
    dests_is_dict = CA.DESTS in names and isinstance(
        names[CA.DESTS], DictionaryObject
    )
    if dests_is_dict:
        # §3.6.3 Name Dictionary (PDF spec 1.7)
        dests = cast(DictionaryObject, names[CA.DESTS])
        dests_ref = dests.indirect_reference
        if CA.NAMES in dests:
            # §7.9.6, entries in a name tree node dictionary
            named_dest = cast(ArrayObject, dests[CA.NAMES])
        else:
            named_dest = ArrayObject()
            dests[NameObject(CA.NAMES)] = named_dest
    elif hasattr(self, "_add_object"):
        # /Names exists but has no usable /Dests: add one.
        dests = DictionaryObject()
        dests_ref = self._add_object(dests)
        names[NameObject(CA.DESTS)] = dests_ref
        dests[NameObject(CA.NAMES)] = named_dest

    return named_dest

451 

452 ## common 

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[Dict[str, Destination]] = None,
) -> Dict[str, Destination]:
    """
    Collect the named destinations present in the document.

    Args:
        tree: The name-tree node to process. It is ignored on the outermost
            call (``retval`` is ``None``), where the tree is looked up in the
            catalog instead.
        retval: The destinations gathered so far; passed on nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        # Outermost call: start a fresh result and locate the tree, either
        # directly in the catalog or below the catalog's /Names dictionary.
        retval = {}
        catalog = self.root_object
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            catalog_names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in catalog_names:
                tree = cast(TreeObject, catalog_names[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PA.KIDS in tree:
        # Intermediate node: recurse into every kid.
        for kid in cast(ArrayObject, tree[PA.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
        return retval

    # §7.9.6, entries in a name tree node dictionary
    if CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        pairs = cast(DictionaryObject, tree[CA.NAMES])
        pos = 0
        while pos < len(pairs):
            original_key = pairs[pos].get_object()
            pos += 1
            if not isinstance(original_key, (bytes, str)):
                # Not a name: skip this single entry and try the next one
                # as a key.
                continue
            key = str(original_key)
            try:
                value = pairs[pos].get_object()
            except IndexError:
                # A trailing key without a value ends the scan.
                break
            pos += 1
            if isinstance(value, DictionaryObject):
                if "/D" not in value:
                    continue
                value = value["/D"]
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[key] = dest
        return retval

    # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
    for name, entry in tree.items():
        target = entry.get_object()
        if isinstance(target, DictionaryObject):
            if "/D" not in target:
                continue
            target = target["/D"].get_object()
        dest = self._build_destination(name, target)
        if dest is not None:
            retval[name] = dest
    return retval

524 

525 # A select group of relevant field attributes. For the complete list, 

526 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification. 

527 

def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[Dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[List[PdfObject]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects. When *retval* is given
            without a *stack*, a fresh empty list is used.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
    if retval is None:
        # Outermost call: start fresh and take the AcroForm as the tree.
        retval = {}
        catalog = self.root_object
        stack = []
        if CD.ACRO_FORM not in catalog:
            return None
        tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
    if tree is None:
        return retval
    if stack is None:
        # A nested-style call may pass ``retval`` without ``stack``. Start
        # an empty visited list rather than relying on an ``assert``, which
        # is removed under ``-O`` and would let ``None`` reach
        # ``_check_kids``.
        stack = []
    if "/Fields" in tree:
        fields = cast(ArrayObject, tree["/Fields"])
        for f in fields:
            field = f.get_object()
            self._build_field(field, retval, fileobj, field_attributes, stack)
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval

577 

578 def _get_qualified_field_name(self, parent: DictionaryObject) -> str: 

579 if "/TM" in parent: 

580 return cast(str, parent["/TM"]) 

581 if "/Parent" in parent: 

582 return ( 

583 self._get_qualified_field_name( 

584 cast(DictionaryObject, parent["/Parent"]) 

585 ) 

586 + "." 

587 + cast(str, parent.get("/T", "")) 

588 ) 

589 return cast(str, parent.get("/T", "")) 

590 

def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: Dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: List[PdfObject],
) -> None:
    """
    Register one form field in ``retval`` and then process its kids.

    Args:
        field: The field dictionary to register.
        retval: Mapping of qualified field names to ``Field`` objects; it is
            updated in place.
        fileobj: When truthy, a report of the field is written to it.
        field_attributes: Mapping passed through to ``_write_field``.
        stack: Already parsed objects, passed through to ``_check_kids``.
    """
    # Objects carrying neither a partial name nor a mapping name are skipped.
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    obj = retval[key].indirect_reference.get_object()  # to get the full object
    # Choice field: the states are taken from its /Opt entry.
    if obj.get(FA.FT, "") == "/Ch":
        retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
    if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
        # Checkbox
        # The states are the names of the normal appearances, plus "/Off".
        retval[key][NameObject("/_States_")] = ArrayObject(
            list(obj["/AP"]["/N"].keys())
        )
        if "/Off" not in retval[key]["/_States_"]:
            retval[key][NameObject("/_States_")].append(NameObject("/Off"))
    elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # Radio group: gather the distinct appearance names of all kids.
        states: List[str] = []
        retval[key][NameObject("/_States_")] = ArrayObject(states)
        for k in obj.get(FA.Kids, {}):
            k = k.get_object()
            for s in list(k["/AP"]["/N"].keys()):
                if s not in states:
                    states.append(s)
            # NOTE(review): the nesting of this assignment (inside the loop
            # over kids) is reconstructed; confirm against the original file.
            retval[key][NameObject("/_States_")] = ArrayObject(states)
        # With the NoToggleToOff flag set, "/Off" is removed from the states.
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in retval[key]["/_States_"]
        ):
            del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)

632 

def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: List[PdfObject],
) -> None:
    """
    Run ``get_fields`` on every kid of ``tree``, visiting each node once.

    Args:
        tree: The node whose ``/Kids`` are to be processed.
        retval: In-progress mapping of fields, handed on to ``get_fields``.
        fileobj: Optional report target, handed on to ``get_fields``.
        stack: Nodes already processed. ``tree`` is appended to it; a node
            already present is reported with a warning and skipped.
    """
    already_parsed = tree in stack
    if already_parsed:
        logger_warning(
            f"{self._get_qualified_field_name(tree)} already parsed", __name__
        )
        return
    stack.append(tree)
    if PA.KIDS not in tree:
        return
    # recurse down the tree
    for kid_ref in tree[PA.KIDS]:  # type: ignore
        self.get_fields(kid_ref.get_object(), retval, fileobj, stack)

651 

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``name: value`` line for each known attribute of ``field``.

    Args:
        fileobj: Object with a ``write`` method receiving the report lines.
        field: The field whose attributes are reported.
        field_attributes: Mapping from attribute key to the label written in
            front of the value.
    """
    # Friendlier labels for the field type values.
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (FA.Kids, FA.AA)
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Only field types with a known label are reported.
                if field[attr] in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field[attr]]}\n")
            elif attr == FA.Parent:
                # Report the parent by name: mapping name first, then the
                # partial name.
                try:
                    name = field[attr][FA.TM]
                except KeyError:
                    name = field[attr][FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass

688 

def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: When true, the keys are the names under which
            ``get_fields`` reports the fields; otherwise the ``/T`` entry of
            each field is used.

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field (its ``/V`` entry).

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def _unique_key(name: str, taken: Dict[Any, Any]) -> str:
        # First occurrence keeps the plain name; later ones are numbered
        # after the already present "<name>." keys, starting at 2.
        if name not in taken:
            return name
        prefix = name + "."
        numbered = sum(1 for existing in taken if existing.startswith(prefix))
        return name + "." + str(numbered + 2)

    # Retrieve document form fields
    formfields = self.get_fields()
    if formfields is None:
        return {}

    text_fields: Dict[str, Any] = {}
    for qualified_name, field in formfields.items():
        if field.get("/FT") != "/Tx":
            continue
        content = field.get("/V")
        if full_qualified_name:
            text_fields[qualified_name] = content
        else:
            text_fields[_unique_key(cast(str, field["/T"]), text_fields)] = content
    return text_fields

726 

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If the field cannot be resolved through its indirect
            reference, or if no ``/FT`` entry is found on the field or its
            ancestors.

    """

    def _inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object, then on its ancestors.
        if key in obj:
            return obj[key]
        if "/Parent" in obj:
            return _inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Prefer the widget's own /P entry; otherwise scan the /Annots of
        # every page for the widget's reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            page
            for page in self.pages
            if widget.indirect_reference in page.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    if field.get("/Subtype", "") == "/Widget":
        candidates = _pages_of_widget(field)
    else:
        candidates = []
        for kid_ref in field.get("/Kids", ()):
            kid = kid_ref.get_object()
            # Kid that is just a widget, not a field:
            if (kid.get("/Subtype", "") == "/Widget") and ("/T" not in kid):
                candidates += _pages_of_widget(kid)

    # Anything that is not already a PageObject is mapped to the page found
    # at the index its indirect reference resolves to.
    pages = []
    for candidate in candidates:
        if isinstance(candidate, PageObject):
            pages.append(candidate)
        else:
            pages.append(
                self.pages[self._get_page_number_by_indirect(candidate.indirect_reference)]  # type: ignore
            )
    return pages

796 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry is
    turned into a ``Destination`` titled ``"OpenAction"``, and any other
    kind of entry yields ``None``.

    Raises:
        Exception: If a destination is invalid.

    """
    if "/OpenAction" not in self.root_object:
        return None
    action: Any = self.root_object["/OpenAction"]
    if isinstance(action, bytes):  # pragma: no cover
        action = action.decode()
    if isinstance(action, str):
        return create_string_object(action)
    if not isinstance(action, ArrayObject):
        return None
    try:
        # The array is laid out as: page, fit type, fit arguments...
        page, fit_type, *fit_args = action
        return Destination("OpenAction", page, Fit(fit_type, tuple(fit_args)))
    except Exception as exc:
        raise Exception(f"Invalid Destination {action}: {exc}")


@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Reject assignment; this class provides no setter for the property."""
    raise NotImplementedError("No setter for open_destination")

830 

@property
def outline(self) -> OutlineType:
    """
    Read-only property for the document outline, i.e. the collection of
    'outline items', also known as 'bookmarks'.

    The outline is rebuilt by ``_get_outline`` on every access.
    """
    items = self._get_outline()
    return items

839 

840 def _get_outline( 

841 self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None 

842 ) -> OutlineType: 

843 if outline is None: 

844 outline = [] 

845 catalog = self.root_object 

846 

847 # get the outline dictionary and named destinations 

848 if CO.OUTLINES in catalog: 

849 lines = cast(DictionaryObject, catalog[CO.OUTLINES]) 

850 

851 if isinstance(lines, NullObject): 

852 return outline 

853 

854 # §12.3.3 Document outline, entries in the outline dictionary 

855 if not is_null_or_none(lines) and "/First" in lines: 

856 node = cast(DictionaryObject, lines["/First"]) 

857 self._named_destinations = self._get_named_destinations() 

858 

859 if node is None: 

860 return outline 

861 

862 # see if there are any more outline items 

863 while True: 

864 outline_obj = self._build_outline_item(node) 

865 if outline_obj: 

866 outline.append(outline_obj) 

867 

868 # check for sub-outline 

869 if "/First" in node: 

870 sub_outline: List[Any] = [] 

871 self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) 

872 if sub_outline: 

873 outline.append(sub_outline) 

874 

875 if "/Next" not in node: 

876 break 

877 node = cast(DictionaryObject, node["/Next"]) 

878 

879 return outline 

880 

@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    The value is an array of dictionaries with "/F" (the first bead in the
    thread) and "/I" (a thread information dictionary containing
    information about the thread, such as its title, author, and creation
    date) properties, or None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    return cast("ArrayObject", catalog[CO.THREADS]) if CO.THREADS in catalog else None

900 return None 

901 

@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Map an indirect reference to a page number.

    Concrete subclasses must provide the implementation. It backs
    ``get_page_number`` and ``get_destination_page_number``, whose
    docstrings describe the result as the page number, or ``None`` if the
    page is not found.
    """
    ...  # pragma: no cover

907 

def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    Args:
        page: The page whose number is wanted. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    reference = page.indirect_reference
    return self._get_page_number_by_indirect(reference)

921 

def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    Args:
        destination: The destination whose page number is wanted.

    Returns:
        The page number or None if page is not found

    """
    target_page = destination.page
    return self._get_page_number_by_indirect(target_page)

934 

def _build_destination(
    self,
    title: str,
    array: Optional[
        List[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` from a destination array.

    Args:
        title: Title given to the destination.
        array: The destination data, laid out as page, fit type and then the
            fit arguments. ``None``, a null object, a string or an empty
            array are treated as a missing destination.

    Returns:
        The destination. A missing destination gets a null page and the
        default fit. When the fit is rejected with ``PdfReadError`` and the
        document is not strict, a warning is logged and the destination
        points to the first page instead.

    Raises:
        PdfReadError: If the fit is rejected and ``self.strict`` is set.
    """
    # handle outline items with missing or invalid destination
    is_missing = (
        isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
        or array is None
    )
    if is_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *fit_args = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=fit_args))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {fit_args}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        target = NullObject() if first_page_ref is None else first_page_ref
        return Destination(title, target, Fit.fit())

964 

def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Convert an outline item dictionary into a Destination.

    The destination is taken from a "/GoTo" action under "/A" or, failing
    that, from "/Dest". Named destinations are resolved through
    ``self._named_destinations``.

    Args:
        node: The outline item dictionary to convert.

    Returns:
        The Destination built for the item. It carries the item's "/C",
        "/F" and "/Count" entries when present, an "/%is_open%" flag, and
        a back-reference to ``node``.

    Raises:
        PdfReadError: In strict mode, when "/Title" is missing, when a
            "/GoTo" action has no "/D" entry, or when the destination has
            an unexpected type.

    """
    dest = None

    # A title is required for a valid outline
    # (§12.3.3, entries in an outline item dictionary).
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # An array referenced from another object arrives as a dictionary
        # holding the array under "/D".
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if dest is None or isinstance(dest, ArrayObject):
        # Either an explicit destination array, or no destination at all:
        # an outline item is not required to have a destination or action
        # (PDFv1.7 Table 153).
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            named_array = self._named_destinations[dest].dest_array
            outline_item = self._build_destination(title, named_array)
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            f"Removed unexpected destination {dest!r} from destination",
            __name__,
        )
        outline_item = self._build_destination(title, None)

    # If an outline item was created, add color, format and child count
    # when the node provides them.
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B), values ranging 0.0-1.0
            rgb = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
            outline_item[NameObject("/C")] = rgb
        # "/F": style characteristics with 1=italic, 2=bold, 3=both.
        # "/Count": absolute value = number of visible children,
        # positive = open/unfolded, negative = closed/folded.
        for copied_key in ("/F", "/Count"):
            if copied_key in node:
                outline_item[NameObject(copied_key)] = node[copied_key]
        # A count of 0 is considered open (so that is_open is available).
        is_open = node.get("/Count", 0) >= 0
        outline_item[NameObject("/%is_open%")] = BooleanObject(is_open)
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        pass
    return outline_item

1041 

@property
def pages(self) -> List[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>`
    instances of the document, giving access to a single page or a range
    of pages.

    Note:
        For PdfWriter only: A page or a range of pages can be removed from
        the list with the del operator. Remember: Only the page entry is
        removed, as the objects beneath can be used elsewhere. A solution
        to completely remove them - if they are not used anywhere - is to
        write to a buffer/temporary file and then load it into a new
        PdfWriter.

    """
    virtual_pages = _VirtualList(self.get_num_pages, self.get_page)
    return virtual_pages  # type: ignore

1058 

@property
def page_labels(self) -> List[str]:
    """
    Labels of all pages of this document, in document page order.

    This property is read-only.
    """
    page_count = len(self.pages)
    return [page_index2page_label(self, index) for index in range(page_count)]

1068 

@property
def page_layout(self) -> Optional[str]:
    """
    Page layout currently in use, or None when the document catalog does
    not define one.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        return None
    return cast(NameObject, layout)

1096 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode currently in use, or None when the document catalog has no
    "/PageMode" entry.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        return None
    return mode  # type: ignore

1122 

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[Dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: Will only list the pages within _flatten_pages.
        pages: The page tree node to process. When None, the "/Pages"
            entry of the document catalog is used and
            ``self.flattened_pages`` is reset to an empty list.
        inherit: Inheritable attributes collected from the ancestor
            nodes of ``pages``. This mapping is never modified.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog's "/Pages" entry is not a dictionary,
            or if the recursion limit is hit while walking the tree.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PA.TYPE in pages:
        t = cast(str, pages[PA.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PA.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: attributes defined on this node apply
        # to its own descendants only. Mutating the caller's mapping
        # would leak them to the siblings processed after this subtree.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PA.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore

1203 

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove a page from the pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved or the page number is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    # Resolve a reference to the page object it points at.
    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        page = resolved

    # Turn a page object into its position in the flattened list.
    if isinstance(page, int):
        page_index = page
    else:
        try:
            page_index = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return

    if page_index < 0 or page_index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # Remember the reference before the entry disappears from the list.
    page_reference = self.pages[page_index].indirect_reference
    del self.pages[page_index]
    if clean and page_reference is not None:
        self._replace_object(page_reference, NullObject())

1247 

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper resolving an object from its number and generation.

    Same as calling ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1263 

def decode_permissions(
    self, permissions_code: int
) -> Dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permissions code into named access flags.

    Deprecated in favour of ``user_access_permissions``.

    Args:
        permissions_code: The permissions as an integer.

    Returns:
        A dictionary telling, for each permission name, whether the
        matching flag is set in ``permissions_code``.

    """
    deprecate_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    named_flags = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    allowed: Dict[str, bool] = {}
    for permission_name, flag in named_flags:
        allowed[permission_name] = (permissions_code & flag) != 0
    return allowed

1290 

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; None if not encrypted."""
    return (
        None
        if self._encryption is None
        else UserAccessPermissions(self._encryption.P)
    )

1297 

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
    """
    # Abstract: no implementation is provided here; concrete classes
    # have to define this property.
    ...  # pragma: no cover

1308 

@property
def xfa(self) -> Optional[Dict[str, Any]]:
    """
    Decompressed XFA entries found in the document's "/AcroForm".

    The "/XFA" entry is read as a flat sequence of alternating tags and
    values. Each value that is an indirect reference resolving to a
    non-empty object is decompressed with zlib and stored under its tag.
    A trailing tag without a matching value is ignored.

    Returns:
        None when the catalog has no (or an empty) "/AcroForm" entry.
        Otherwise a dictionary mapping each tag to the decompressed
        bytes, which is empty when there is no "/XFA" entry.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: Dict[str, Any] = {}

    if "/XFA" in tree:
        items = iter(cast(ArrayObject, tree["/XFA"]))
        # Feeding the same iterator twice to zip() walks the sequence in
        # (tag, value) pairs. Unlike a bare next() call, an odd number of
        # items cannot make StopIteration escape from this property.
        for tag, value in zip(items, items):
            if isinstance(value, IndirectObject):
                field = cast(Optional[EncodedStreamObject], value.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval

1332 

@property
def attachments(self) -> Mapping[str, List[bytes]]:
    """
    Mapping of attachment filenames to their content.

    The content of an attachment is only fetched when its filename is
    looked up in the returned mapping.
    """
    deferred_lookups = {}
    for name in self._list_attachments():
        # Stored as (callable, argument); LazyDict performs the call on access.
        deferred_lookups[name] = (self._get_attachment_list, name)
    return LazyDict(deferred_lookups)

1342 

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Generator over the attachment objects loaded from the document catalog."""
    catalog = self.root_object
    yield from EmbeddedFile._load(catalog)

1347 

1348 def _list_attachments(self) -> List[str]: 

1349 """ 

1350 Retrieves the list of filenames of file attachments. 

1351 

1352 Returns: 

1353 list of filenames 

1354 

1355 """ 

1356 names = [] 

1357 for entry in self.attachment_list: 

1358 names.append(entry.name) 

1359 if (name := entry.alternative_name) != entry.name and name: 

1360 names.append(name) 

1361 return names 

1362 

1363 def _get_attachment_list(self, name: str) -> List[bytes]: 

1364 out = self._get_attachments(name)[name] 

1365 if isinstance(out, list): 

1366 return out 

1367 return [out] 

1368 

1369 def _get_attachments( 

1370 self, filename: Optional[str] = None 

1371 ) -> Dict[str, Union[bytes, List[bytes]]]: 

1372 """ 

1373 Retrieves all or selected file attachments of the PDF as a dictionary of file names 

1374 and the file data as a bytestring. 

1375 

1376 Args: 

1377 filename: If filename is None, then a dictionary of all attachments 

1378 will be returned, where the key is the filename and the value 

1379 is the content. Otherwise, a dictionary with just a single key 

1380 - the filename - and its content will be returned. 

1381 

1382 Returns: 

1383 dictionary of filename -> Union[bytestring or List[ByteString]] 

1384 If the filename exists multiple times a list of the different versions will be provided. 

1385 

1386 """ 

1387 attachments: Dict[str, Union[bytes, List[bytes]]] = {} 

1388 for entry in self.attachment_list: 

1389 names = set() 

1390 alternative_name = entry.alternative_name 

1391 if filename is not None: 

1392 if filename in {entry.name, alternative_name}: 

1393 name = entry.name if filename == entry.name else alternative_name 

1394 names.add(name) 

1395 else: 

1396 continue 

1397 else: 

1398 names = {entry.name, alternative_name} 

1399 

1400 for name in names: 

1401 if name is None: 

1402 continue 

1403 if name in attachments: 

1404 if not isinstance(attachments[name], list): 

1405 attachments[name] = [attachments[name]] # type:ignore 

1406 attachments[name].append(entry.content) # type:ignore 

1407 else: 

1408 attachments[name] = entry.content 

1409 return attachments 

1410 

@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> Dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    This method returns a dictionary that maps a mime-type to its
    representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    # Abstract: no implementation is provided here; concrete classes
    # have to define this method.
    ...  # pragma: no cover

1428 

1429 

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed when they are looked up.

    Every stored value is a ``(callable, argument)`` pair; accessing a key
    returns ``callable(argument)``. Nothing is cached, so each access
    performs the call again.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Accepts the same arguments as dict().
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        producer, argument = self._raw_dict[key]
        return producer(argument)

    def __iter__(self) -> Iterator[Any]:
        # Iterating never triggers the stored callables.
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        key_list = list(self.keys())
        return f"LazyDict(keys={key_list})"