Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

656 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# Copyright (c) 2024, Pubpub-ZZ 

4# 

5# All rights reserved. 

6# 

7# Redistribution and use in source and binary forms, with or without 

8# modification, are permitted provided that the following conditions are 

9# met: 

10# 

11# * Redistributions of source code must retain the above copyright notice, 

12# this list of conditions and the following disclaimer. 

13# * Redistributions in binary form must reproduce the above copyright notice, 

14# this list of conditions and the following disclaimer in the documentation 

15# and/or other materials provided with the distribution. 

16# * The name of the author may not be used to endorse or promote products 

17# derived from this software without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

29# POSSIBILITY OF SUCH DAMAGE. 

30 

31import struct 

32from abc import ABC, abstractmethod 

33from collections.abc import Generator, Iterable, Iterator, Mapping 

34from datetime import datetime 

35from typing import ( 

36 Any, 

37 NoReturn, 

38 Optional, 

39 Union, 

40 cast, 

41) 

42 

43from ._encryption import Encryption 

44from ._page import PageObject, _VirtualList 

45from ._page_labels import index2label as page_index2page_label 

46from ._utils import ( 

47 deprecation_with_replacement, 

48 logger_warning, 

49 parse_iso8824_date, 

50) 

51from .constants import CatalogAttributes as CA 

52from .constants import CatalogDictionary as CD 

53from .constants import ( 

54 CheckboxRadioButtonAttributes, 

55 GoToActionArguments, 

56 PagesAttributes, 

57 UserAccessPermissions, 

58) 

59from .constants import Core as CO 

60from .constants import DocumentInformationAttributes as DI 

61from .constants import FieldDictionaryAttributes as FA 

62from .constants import PageAttributes as PG 

63from .errors import PdfReadError, PyPdfError 

64from .filters import _decompress_with_limit 

65from .generic import ( 

66 ArrayObject, 

67 BooleanObject, 

68 ByteStringObject, 

69 Destination, 

70 DictionaryObject, 

71 EncodedStreamObject, 

72 Field, 

73 Fit, 

74 FloatObject, 

75 IndirectObject, 

76 NameObject, 

77 NullObject, 

78 NumberObject, 

79 PdfObject, 

80 TextStringObject, 

81 TreeObject, 

82 ViewerPreferences, 

83 create_string_object, 

84 is_null_or_none, 

85) 

86from .generic._files import EmbeddedFile 

87from .types import OutlineType, PagemodeType 

88from .xmp import XmpInformation 

89 

90 

def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode a byte string as a big-endian unsigned integer.

    Only the last eight bytes of ``d`` contribute to the result; shorter
    input behaves as if it were left-padded with zero bytes.

    Args:
        d: The raw bytes to decode.
        size: The declared width in bytes. It is only validated here.

    Returns:
        The decoded non-negative integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.
    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Same value as zero-padding to eight bytes and unpacking with ">Q".
    return int.from_bytes(d[-8:], byteorder="big", signed=False)

97 

98 

class DocumentInformation(DictionaryObject):
    """
    Basic document metadata taken from a PDF's information dictionary.

    Instances are reachable through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every textual entry is exposed twice, e.g. ``author`` and ``author_raw``.
    The plain property yields text (or ``None``) and is the one to use for
    display. The ``*_raw`` property returns the stored entry untouched, which
    may be a ``ByteStringObject`` when the text encoding could not be decoded,
    so callers of the raw variant need to handle that case themselves.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Return the entry stored under ``key`` as text.

        A ``TextStringObject`` is returned as is, a ``ByteStringObject`` is
        converted with ``str()``, and anything else (including a missing
        entry) results in ``None``.
        """
        entry = self.get(key, None)
        if isinstance(entry, TextStringObject):
            return entry
        return str(entry) if isinstance(entry, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns ``None`` when the title entry is missing or falsy. Otherwise
        the textual value is returned; if that is empty or unavailable, the
        resolved stored object is returned instead.
        """
        if not self.get(DI.TITLE):
            return None
        return self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object()  # type: ignore[union-attr]

    @property
    def title_raw(self) -> Optional[str]:
        """Title entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        Returns the author as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Author entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        Returns the subject as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Subject entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For documents converted to PDF from another format this names the
        application (e.g. OpenOffice) that produced the original document.
        Returns the creator as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Creator entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For documents converted to PDF from another format this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the producer as text, or ``None`` if it is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Producer entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the document's creation date, parsed from its text form."""
        date_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(date_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Creation date entry exactly as stored; may be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date.

        This is the date and time the document was most recently modified,
        parsed from its text form.
        """
        date_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(date_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Modification date entry exactly as stored; may be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        Returns the keywords as text, or ``None`` if they are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Keywords entry exactly as stored; may be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

257 

258 

259class PdfDocCommon(ABC): 

260 """ 

261 Common functions from PdfWriter and PdfReader objects. 

262 

263 This root class is strongly abstracted. 

264 """ 

265 

266 strict: bool = False # default 

267 

268 flattened_pages: Optional[list[PageObject]] = None 

269 

270 _encryption: Optional[Encryption] = None 

271 

272 _readonly: bool = False 

273 

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Abstract: the document's root dictionary.

    The methods of this class read catalog entries such as ``/Pages`` and
    ``/OpenAction`` from the object returned here.
    """
    ...  # pragma: no cover

278 

@property
@abstractmethod
def pdf_header(self) -> str:
    """Abstract: the PDF header of the document as a string (exact format is defined by the implementing subclass)."""
    ...  # pragma: no cover

283 

@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Abstract: look up the object designated by ``indirect_reference``.

    Args:
        indirect_reference: Either an integer or an ``IndirectObject``.
            NOTE(review): the integer is presumably an object number -
            confirm against the implementing subclasses.

    Returns:
        The resolved object, or ``None``; when ``None`` is returned is
        decided by the implementing subclass.
    """
    ...  # pragma: no cover

289 

@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Abstract: substitute ``obj`` for the object designated by ``indirect``.

    ``viewer_preferences`` calls this to store a ``ViewerPreferences``
    wrapper in place of the entry it was built from.
    """
    ...  # pragma: no cover

293 

@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Abstract: the dictionary whose entries ``metadata`` copies into a
    ``DocumentInformation``, or ``None`` (in which case ``metadata`` is
    ``None`` as well).
    """
    ...  # pragma: no cover

298 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new ``DocumentInformation`` populated with the entries of
        ``_info``, or ``None`` when ``_info`` is ``None``.
    """
    # Read the (abstract, possibly costly) property only once and avoid
    # building a DocumentInformation that would be thrown away.
    info = self._info
    if info is None:
        return None
    document_information = DocumentInformation()
    document_information.update(info)
    return document_information

313 

@property
@abstractmethod
def xmp_metadata(self) -> Optional[XmpInformation]:
    """Abstract: the document's XMP metadata as an ``XmpInformation``, or ``None``."""
    ...  # pragma: no cover

318 

@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the catalog's viewer preferences as a ``ViewerPreferences`` object.

    If the stored entry is not yet a ``ViewerPreferences`` instance it is
    wrapped in one and the wrapper is written back: through
    ``_replace_object`` when it carries an indirect reference, otherwise
    directly into the root object.

    Returns:
        The ``ViewerPreferences`` object, or ``None`` if the root object has
        no such entry.
    """
    entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if entry is None:
        return None
    prefs = entry.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs
    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs

333 

def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    # An encrypted PDF cannot be flattened, so the page count recorded in
    # the page tree root is reported instead.
    if self.is_encrypted:
        pages_root = self.root_object["/Pages"]
        return pages_root["/Count"]  # type: ignore[no-any-return, index]
    # Otherwise count the flattened page list, building it on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None
    return len(pages)

354 

def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    # Build the flattened page list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]

372 

def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Retrieve the node and position within the /Kids containing the page.
    If page_number is greater than the number of pages, it returns the top node, -1.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, idx)`` where ``node`` is the page tree node whose
        ``/Kids`` array holds the page and ``idx`` is the page's position in
        that array, or ``(top, -1)`` when the page is not below the top node.

    Raises:
        PyPdfError: If the ``/Count`` of a node indicates the page is below
            it, but none of its kids yields the page.
    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def recursive_call(
        node: DictionaryObject, mi: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``mi`` is the index of the first page covered by ``node``.
        # Result is either (found node, position) or (None, index of the
        # first page after ``node``) when the page is not below ``node``.
        ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":  # type: ignore[comparison-overlap]
            if page_number == mi:
                # -1 tells the caller that this leaf itself is the page.
                return node, -1
            return None, mi + 1
        if (page_number - mi) >= ma:  # not in nodes below
            if node == top:
                return top, -1
            # Skip the whole subtree in one step.
            return None, mi + ma
        for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
            kid = cast(DictionaryObject, kid.get_object())
            n, i = recursive_call(kid, mi)
            if n is not None:  # page has just been found ...
                if i < 0:  # ... just below!
                    return node, idx
                # ... at lower levels
                return n, i
            # Not in this kid: continue counting from where it ended.
            mi = i
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = recursive_call(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx

409 

@property
def named_destinations(self) -> dict[str, Destination]:
    """Read-only mapping of destination names to ``Destination`` objects, collected anew on each access."""
    destinations = self._get_named_destinations()
    return destinations

414 

def get_named_dest_root(self) -> ArrayObject:
    """
    Return the ``/Names`` array of the destinations name tree root.

    The array is looked up under the root object's ``/Names`` -> ``/Dests``
    dictionaries. Missing pieces are created and attached on the way, which
    for new dictionaries is only done when this object offers an
    ``_add_object`` method; otherwise a detached empty array is returned.

    Returns:
        The existing or newly created array of named destinations.
    """
    catalog = self.root_object
    result = ArrayObject()

    if CA.NAMES in catalog and isinstance(catalog[CA.NAMES], DictionaryObject):
        names_dict = cast(DictionaryObject, catalog[CA.NAMES])
        if CA.DESTS in names_dict and isinstance(
            names_dict[CA.DESTS], DictionaryObject
        ):
            # §3.6.3 Name Dictionary (PDF spec 1.7)
            dests_dict = cast(DictionaryObject, names_dict[CA.DESTS])
            if CA.NAMES in dests_dict:
                # §7.9.6, entries in a name tree node dictionary
                return cast(ArrayObject, dests_dict[CA.NAMES])
            dests_dict[NameObject(CA.NAMES)] = result
            return result
        if hasattr(self, "_add_object"):
            # /Names exists but has no usable /Dests: create one.
            dests_dict = DictionaryObject()
            names_dict[NameObject(CA.DESTS)] = self._add_object(dests_dict)
            dests_dict[NameObject(CA.NAMES)] = result
        return result

    if hasattr(self, "_add_object"):
        # Neither level exists: create /Names and /Dests.
        names_dict = DictionaryObject()
        catalog[NameObject(CA.NAMES)] = self._add_object(names_dict)
        dests_dict = DictionaryObject()
        names_dict[NameObject(CA.DESTS)] = self._add_object(dests_dict)
        dests_dict[NameObject(CA.NAMES)] = result
    return result

447 

448 ## common 

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    On the initial call (``retval`` is ``None``) the tree is taken from the
    root object: its ``/Dests`` entry if present, otherwise the ``/Dests``
    entry of its ``/Names`` dictionary. Nested calls pass the sub-tree and
    the shared result dictionary explicitly.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        # Initial call: start a fresh result and locate the tree ourselves.
        retval = {}
        catalog = self.root_object

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names:
                tree = cast(TreeObject, names[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
    # §7.9.6, entries in a name tree node dictionary
    elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        # The entry is walked as a flat sequence of alternating key / value
        # elements, addressed by integer index.
        names = cast(DictionaryObject, tree[CA.NAMES])
        i = 0
        while i < len(names):
            key = names[i].get_object()
            i += 1
            if not isinstance(key, (bytes, str)):
                # Only this one slot is consumed; the next element is
                # examined as a key on the following iteration.
                continue
            try:
                value = names[i].get_object()
            except IndexError:
                # A trailing key without a value ends the walk.
                break
            i += 1
            if isinstance(value, DictionaryObject):
                # A dictionary value must carry the destination under /D.
                if "/D" in value:
                    value = value["/D"]
                else:
                    continue
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[cast(str, dest["/Title"])] = dest
                # Remain backwards-compatible.
                retval[str(key)] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
        # Here the tree itself maps each name directly to its destination.
        for k__, v__ in tree.items():
            val = v__.get_object()
            if isinstance(val, DictionaryObject):
                if "/D" in val:
                    val = val["/D"].get_object()
                else:
                    continue
            dest = self._build_destination(k__, val)
            if dest is not None:
                retval[k__] = dest
    return retval

521 

522 # A select group of relevant field attributes. For the complete list, 

523 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification. 

524 

def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects. When ``retval`` is given
            without a ``stack``, a fresh empty list is used.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
    if retval is None:
        # Top-level call: start from the AcroForm entry of the catalog with
        # an empty result and an empty list of visited objects.
        retval = {}
        stack = []
        catalog = self.root_object
        if CD.ACRO_FORM not in catalog:
            return None
        # get the AcroForm tree
        tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
    elif stack is None:
        # Previously an ``assert`` rejected this combination (and was
        # stripped under ``-O``); defaulting keeps the call usable.
        stack = []
    if tree is None:
        return retval
    if "/Fields" in tree:
        fields = cast(ArrayObject, tree["/Fields"])
        for f in fields:
            field = f.get_object()
            self._build_field(field, retval, fileobj, field_attributes, stack)
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval

574 

def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
    """
    Build the qualified name of a field dictionary.

    A ``/TM`` entry, when present, is returned as is. Otherwise the name is
    the field's ``/T`` (empty string if missing), prefixed with the
    qualified name of its ``/Parent`` and a dot when a parent exists.
    """
    if "/TM" in parent:
        return cast(str, parent["/TM"])
    if "/Parent" not in parent:
        return cast(str, parent.get("/T", ""))
    ancestor_name = self._get_qualified_field_name(
        cast(DictionaryObject, parent["/Parent"])
    )
    own_name = cast(str, parent.get("/T", ""))
    return ancestor_name + "." + own_name

587 

def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register ``field`` in ``retval`` and record the states it can take.

    Objects having neither ``/T`` nor ``/TM`` are ignored. Otherwise the
    field is stored as a ``Field`` under its qualified name, optionally
    reported to ``fileobj``, given a ``/_States_`` entry where applicable,
    and finally its kids are processed.

    Args:
        field: The field dictionary to register.
        retval: Result mapping of qualified field name to ``Field``; updated
            in place.
        fileobj: If truthy, a report of the field is written to it.
        field_attributes: Attribute mapping handed on to ``_write_field``.
        stack: Already parsed objects, handed on to ``_check_kids``.
    """
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    obj = retval[key].indirect_reference.get_object()  # to get the full object
    # Choice field: the possible states are its options.
    if obj.get(FA.FT, "") == "/Ch" and obj.get(NameObject(FA.Opt)):
        retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
    if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
        # Checkbox
        # States are the names in the /N entry of /AP, plus /Off if absent.
        retval[key][NameObject("/_States_")] = ArrayObject(
            list(obj["/AP"]["/N"].keys())
        )
        if "/Off" not in retval[key]["/_States_"]:
            retval[key][NameObject("/_States_")].append(NameObject("/Off"))
    elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # Radio button group: collect the distinct /AP /N names of all kids,
        # keeping first-seen order.
        states: list[str] = []
        retval[key][NameObject("/_States_")] = ArrayObject(states)
        for k in obj.get(FA.Kids, {}):
            k = k.get_object()
            for s in list(k["/AP"]["/N"].keys()):
                if s not in states:
                    states.append(s)
            retval[key][NameObject("/_States_")] = ArrayObject(states)
        # With the NoToggleToOff flag set, /Off is removed from the states.
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in retval[key]["/_States_"]
        ):
            del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)

629 

def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Descend into the ``/Kids`` of ``tree`` unless it was parsed before.

    ``stack`` holds the objects already handled; meeting one of them again
    only logs a warning, which keeps cyclic structures from recursing
    forever. Every kid is resolved and passed to ``get_fields``.
    """
    if tree in stack:
        logger_warning(
            "%(field_name)s already parsed",
            source=__name__,
            field_name=self._get_qualified_field_name(tree),
        )
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for kid_ref in tree[PagesAttributes.KIDS]:  # type: ignore[attr-defined]
        self.get_fields(kid_ref.get_object(), retval, fileobj, stack)

650 

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``label: value`` line to ``fileobj`` for each known attribute of ``field``.

    ``/Kids`` and ``/AA`` are left out. The field type is written with a
    readable label (and skipped if the type is not one of the known four),
    the parent is written by its name only, and attributes missing from the
    field are silently skipped.

    Args:
        fileobj: Object with a ``write`` method receiving the report.
        field: The field dictionary to report on.
        field_attributes: Mapping of attribute key to the label to print.
    """
    # Make the field type value clearer
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (FA.Kids, FA.AA)
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                field_type = field[attr]
                if field_type in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field_type]}\n")
            elif attr == FA.Parent:
                # Let's just write the name of the parent
                parent = field[attr]
                try:
                    name = parent[FA.TM]
                except KeyError:
                    name = parent[FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass

687 

def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: Use the qualified field name as key instead of
            the field's own ``/T`` name.

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field.

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(k: str, fields: dict[Any, Any]) -> str:
        # First use of a name keeps it; later ones are numbered from ".2",
        # based on how many suffixed keys already exist.
        if k in fields:
            suffixed = sum(1 for existing in fields if existing.startswith(k + "."))
            return k + "." + str(suffixed + 2)
        return k

    # Retrieve document form fields
    all_fields = self.get_fields()
    if all_fields is None:
        return {}
    text_fields = {}
    for qualified_name, field in all_fields.items():
        if field.get("/FT") == "/Tx":
            content = field.get("/V")
            if full_qualified_name:
                text_fields[qualified_name] = content
            else:
                text_fields[indexed_key(cast(str, field["/T"]), text_fields)] = content
    return text_fields

725 

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If ``field`` cannot be resolved through its indirect
            reference, or has no (inherited) ``/FT`` entry.

    """

    def pages_of_widget(widget: Any) -> list[Any]:
        # Prefer the widget's own /P entry; otherwise search every page's
        # /Annots for a reference to the widget.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore[union-attr]
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(field.get_inherited(key="/FT", default=None)):
        raise ValueError("Field is not valid")

    found = []
    if field.get("/Subtype", "") == "/Widget":
        found = pages_of_widget(field)
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            # Kid that is just a widget, not a field:
            if (kid.get("/Subtype", "") == "/Widget") and ("/T" not in kid):
                found += pages_of_widget(kid)

    # Map anything that is not already a PageObject to the page in
    # ``self.pages`` located through its indirect reference.
    result = []
    for candidate in found:
        if isinstance(candidate, PageObject):
            result.append(candidate)
        else:
            result.append(
                self.pages[self._get_page_number_by_indirect(candidate.indirect_reference)]  # type: ignore[index, union-attr]
            )
    return result

785 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry is
    converted to a ``Destination`` titled ``"OpenAction"``, and any other
    kind of entry yields ``None``.

    Raises:
        Exception: If a destination is invalid. The original error is
            attached as ``__cause__``.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if isinstance(oa, ArrayObject):
        try:
            page, typ, *array = oa
            fit = Fit(typ, tuple(array))
            return Destination("OpenAction", page, fit)
        except Exception as exc:
            # Chain explicitly so the root cause stays visible as the cause
            # rather than as an error "during handling".
            raise Exception(f"Invalid Destination {oa}: {exc}") from exc
    else:
        return None

@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Setting is not supported in this base class; always raises ``NotImplementedError``."""
    raise NotImplementedError("No setter for open_destination")

819 

@property
def outline(self) -> OutlineType:
    """
    Read-only access to the document outline, i.e. the collection of
    'outline items' (also known as 'bookmarks'), collected anew on each
    access.
    """
    outline_items = self._get_outline()
    return outline_items

828 

def _get_outline(
    self,
    node: Optional[DictionaryObject] = None,
    outline: Optional[Any] = None,
    visited: Optional[set[int]] = None,
) -> OutlineType:
    """
    Collect the outline items reachable from ``node`` into ``outline``.

    On the initial call (``outline`` is ``None``) the starting node is the
    ``/First`` entry of the catalog's outlines dictionary, and the named
    destinations are cached on ``self._named_destinations``. Siblings are
    followed through ``/Next``; the children of an item (its ``/First``
    chain) are gathered recursively into a nested list appended right after
    that item.

    Args:
        node: First outline item of the level to walk.
        outline: List to append to; a new list on the initial call.
        visited: ``id()`` values of the nodes already walked, used to stop
            on cycles.

    Returns:
        The (possibly nested) list of outline items.
    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            lines = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(lines, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(lines) and "/First" in lines:
                node = cast(DictionaryObject, lines["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # see if there are any more outline items
    if visited is None:
        visited = set()
    while True:
        # Nodes are tracked by object identity; meeting one again means the
        # /Next (or /First) links form a loop.
        node_id = id(node)
        if node_id in visited:
            logger_warning("Detected cycle in outline structure for %(node)s", source=__name__, node=node)
            break
        visited.add(node_id)

        outline_obj = self._build_outline_item(node)
        if outline_obj:
            outline.append(outline_obj)

        # check for sub-outline
        if "/First" in node:
            sub_outline: list[Any] = []
            # Pass a copy to allow multiple outer entries to reference the same inner one.
            inner_visited = visited.copy()
            self._get_outline(
                node=cast(DictionaryObject, node["/First"]),
                outline=sub_outline,
                visited=inner_visited,
            )
            # Empty child levels are not recorded.
            if sub_outline:
                outline.append(sub_outline)

        if "/Next" not in node:
            break
        node = cast(DictionaryObject, node["/Next"])

    return outline

886 

@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    The result is an array of dictionaries with "/F" (the first bead in the
    thread) and "/I" (a thread information dictionary containing
    information about the thread, such as its title, author, and creation
    date) properties, or None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    root = self.root_object
    if CO.THREADS not in root:
        return None
    return cast("ArrayObject", root[CO.THREADS])

907 

@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Abstract: map a page's indirect reference to its page number.

    ``get_page_number`` and ``get_destination_page_number`` delegate to this
    method, and ``get_pages_showing_field`` uses the result as an index into
    ``self.pages``.

    Returns:
        The page number, or ``None`` (per the callers' documentation, when
        the page is not found).
    """
    ...  # pragma: no cover

913 

def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    Args:
        page: The page whose number is wanted. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    page_reference = page.indirect_reference
    return self._get_page_number_by_indirect(page_reference)

927 

def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    Args:
        destination: The destination whose page number is wanted.

    Returns:
        The page number or None if page is not found

    """
    target_page = destination.page
    return self._get_page_number_by_indirect(target_page)

940 

def _build_destination(
    self,
    title: Union[str, bytes],
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` named ``title`` from a destination array.

    Args:
        title: Name to give the destination.
        array: ``[page, fit type, *fit arguments]``. ``None``, a null
            object, a string or an empty array is treated as a missing
            destination.

    Returns:
        The destination. A missing destination points to a null page with
        the default fit. If the fit cannot be built and ``strict`` is off,
        the destination points to the first page instead.

    Raises:
        PdfReadError: If the fit cannot be built and ``strict`` is set.
    """
    # handle outline items with missing or invalid destination
    is_missing = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if is_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, fit_type, *fit_args = array  # type: ignore[assignment]
    try:
        return Destination(title, page, Fit(fit_type=fit_type, fit_args=fit_args))  # type: ignore[arg-type]
    except PdfReadError:
        # Only the fit arguments (the remainder of the array) are logged.
        logger_warning("Unknown destination: %(title)r %(array)s", source=__name__, title=title, array=fit_args)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        fallback = NullObject() if first_page_ref is None else first_page_ref
        return Destination(title, fallback, Fit.fit())

970 

def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Turn one outline item dictionary into a Destination.

    The target is read from a ``/GoTo`` action (``/A``) or, failing
    that, from ``/Dest``. Named destinations are looked up in
    ``self._named_destinations``. Color, style flags and child count
    are copied from the node when present.

    Args:
        node: The outline item dictionary.

    Returns:
        The Destination built for the node.

    Raises:
        PdfReadError: In strict mode, when ``/Title`` is missing, a
            ``/GoTo`` action has no ``/D`` entry, or the destination
            has an unexpected type.

    """
    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        title = ""

    target = None
    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                target = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        target = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(target, DictionaryObject) and "/D" in target:
            target = target["/D"]

    outline_item = None
    if isinstance(target, ArrayObject):
        outline_item = self._build_destination(title, target)
    elif isinstance(target, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[target].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif target is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, target)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {target!r}")
        logger_warning(
            "Removed unexpected destination %(dest)r from destination",
            source=__name__,
            dest=target,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore[attr-defined]
        # "/F": style characteristics, 1=italic, 2=bold, 3=both
        # "/Count": absolute value = num. visible children,
        #           positive = open/unfolded, negative = closed/folded
        for copied_key in ("/F", "/Count"):
            if copied_key in node:
                outline_item[NameObject(copied_key)] = node[copied_key]
        # if count is 0 we will consider it as open (to have available is_open)
        is_open = node.get("/Count", 0) >= 0
        outline_item[NameObject("/%is_open%")] = BooleanObject(is_open)
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        pass
    return outline_item

1048 

@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>` of the document.

    A single page or a range of pages can be retrieved from it.

    Note:
        For PdfWriter only: a page or a range of pages can be removed
        from the list with the ``del`` operator. Only the page entry is
        removed, because the objects beneath it may be used elsewhere.
        To drop them completely - if nothing else uses them - write to a
        buffer/temporary file and load the result into a new PdfWriter.

    """
    count_pages = self.get_num_pages
    fetch_page = self.get_page
    return _VirtualList(count_pages, fetch_page)  # type: ignore[return-value]

1065 

@property
def page_labels(self) -> list[str]:
    """
    The labels of all pages of this document, in document order.

    This property is read-only.
    """
    page_count = len(self.pages)
    return [page_index2page_label(self, index) for index in range(page_count)]

1075 

@property
def page_layout(self) -> Optional[str]:
    """
    The page layout stored in the document catalog, or None if unset.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        return None
    return cast(NameObject, layout)

1103 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The page mode stored in the document catalog, or None if unset.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        return None
    return mode  # type: ignore[return-value]

1129 

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: When true, each PageObject is appended to
            ``flattened_pages`` without being updated from its page
            dictionary.
        pages: The page tree node to process. When it is null or None,
            processing starts at the catalog's ``/Pages`` entry and
            ``flattened_pages`` is reset to an empty list.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. It is never modified; each ``/Pages`` node works
            on its own copy.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog's ``/Pages`` entry is not a
            dictionary, if a kid references its own parent node, or if
            the recursion limit is hit while descending.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if is_null_or_none(pages):
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore[union-attr]
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []
    assert pages is not None, "mypy"

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Attributes are inherited from ancestors only. Work on a copy so
        # that values set by this node reach its own descendants but do not
        # leak into sibling subtrees processed later by the caller.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        # object() never compares equal, so nodes without a reference
        # cannot trigger the cycle check below
        pages_reference = getattr(pages, "indirect_reference", object())
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            if getattr(page, "indirect_reference", object()) == pages_reference:
                raise PdfReadError("Detected cyclic page references.")

            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore[union-attr]

1215 

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove page from pages list.

    A warning is logged and nothing is removed when the reference does
    not resolve to a page, the page cannot be found, or the page number
    is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", source=__name__)
            return
        page = resolved

    if isinstance(page, int):
        page_index = page
    else:
        try:
            page_index = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", source=__name__)
            return

    # negative numbers are rejected as well
    if page_index < 0 or page_index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", source=__name__)
        return

    # read the reference before the entry is deleted
    reference = self.pages[page_index].indirect_reference
    del self.pages[page_index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())

1259 

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper to fetch an object by number and generation.

    Same as calling generic.IndirectObject(num,gen,self).get_object()

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1275 

def decode_permissions(
    self, permissions_code: int
) -> NoReturn:  # pragma: no cover
    """
    Deprecated way of decoding a permissions integer.

    The argument is not used: the call is only reported through
    ``deprecation_with_replacement``, which names
    ``user_access_permissions`` as the replacement.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

1285 

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """
    The user access permissions of an encrypted document.

    None is returned when the document is not encrypted.

    .. warning::

        For AES-256 encrypted documents (R=5/R=6), the returned
        permissions are derived from the ``/P`` field, which is
        only trustworthy if the ``/Perms`` integrity check passed.
        Check :attr:`are_permissions_valid` to verify.
    """
    encryption = self._encryption
    return None if encryption is None else UserAccessPermissions(encryption.P)

1302 

@property
def are_permissions_valid(self) -> Optional[bool]:
    """
    Result of the ``/Perms`` integrity check for this document.

    For AES-256 encrypted documents (R=5/R=6), the ``/Perms`` field
    is an encrypted copy of the permissions that can be verified
    independently. ``False`` means this check failed (the ``/P``
    permissions may have been tampered with).

    ``None`` is returned if the document is not encrypted or has not yet
    been decrypted via :meth:`decrypt()<pypdf.PdfReader.decrypt>`.
    ``True`` is returned for non-AES-256 encryption (no ``/Perms`` to check).
    """
    encryption = self._encryption
    if encryption is None or not encryption.is_decrypted():
        return None
    return encryption._are_permissions_valid

1322 

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only boolean property).

    If true, the value stays true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method has been called.

    Abstract: no implementation is provided here.
    """
    ...  # pragma: no cover

1333 

@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    Decompressed XFA entries of the document's ``/AcroForm``.

    The ``/XFA`` entry is read as a flat sequence of alternating tags
    and values. A value is only used when it is an indirect reference
    that resolves to a truthy object; its ``_data`` is then passed
    through ``_decompress_with_limit``. A trailing tag without a value
    is ignored.

    Returns:
        A dictionary mapping each tag to its decompressed data (empty
        when there is no ``/XFA`` entry), or None when the catalog has
        no ``/AcroForm`` or it is empty.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    retval: dict[str, Any] = {}
    tree = cast(TreeObject, catalog["/AcroForm"])

    if "/XFA" in tree:
        fields = cast(ArrayObject, tree["/XFA"])
        entries = iter(fields)
        # zip() over one shared iterator yields consecutive (tag, value)
        # pairs and simply stops on a dangling tag, where a bare next()
        # would let StopIteration escape from the property.
        for tag, value in zip(entries, entries):
            if isinstance(value, IndirectObject):
                field = cast(Optional[EncodedStreamObject], value.get_object())
                if field:
                    retval[tag] = _decompress_with_limit(field._data)
    return retval

1356 

@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """
    Mapping of attachment filenames to their content.

    Each name from ``_list_attachments()`` is stored together with the
    loader ``_get_attachment_list`` in a ``LazyDict``.
    """
    loaders = {}
    for name in self._list_attachments():
        loaders[name] = (self._get_attachment_list, name)
    return LazyDict(loaders)

1366 

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """
    Iterable of attachment objects.

    Yields whatever ``EmbeddedFile._load`` produces for the document
    catalog.
    """
    catalog = self.root_object
    yield from EmbeddedFile._load(catalog)

1371 

def _list_attachments(self) -> list[str]:
    """
    Collect the filenames of all file attachments.

    For every attachment the name is listed, followed by its
    alternative name when that is non-empty and differs from the name.

    Returns:
        list of filenames

    """
    names = []
    for entry in self.attachment_list:
        primary = entry.name
        names.append(primary)
        alternative = entry.alternative_name
        if alternative and alternative != primary:
            names.append(alternative)
    return names

1386 

def _get_attachment_list(self, name: str) -> list[bytes]:
    """
    Return the content stored under ``name`` as a list.

    The value found by ``_get_attachments(name)`` is returned unchanged
    when it already is a list, otherwise it is wrapped in one.
    """
    found = self._get_attachments(name)[name]
    return found if isinstance(found, list) else [found]

1392 

def _get_attachments(
    self, filename: Optional[str] = None
) -> dict[str, Union[bytes, list[bytes]]]:
    """
    Collect all or selected file attachments of the PDF, keyed by file name.

    Every attachment is registered under its name and its alternative
    name; names that are None are skipped.

    Args:
        filename: If filename is None, then a dictionary of all attachments
            will be returned, where the key is the filename and the value
            is the content. Otherwise, a dictionary with just a single key
            - the filename - and its content will be returned.

    Returns:
        dictionary of filename -> Union[bytestring or List[ByteString]]
        If the filename exists multiple times a list of the different versions will be provided.

    """
    attachments: dict[str, Union[bytes, list[bytes]]] = {}
    for entry in self.attachment_list:
        alternative_name = entry.alternative_name
        if filename is None:
            names = {entry.name, alternative_name}
        elif filename in {entry.name, alternative_name}:
            # keep only whichever of the entry's names matched the request
            names = {entry.name if filename == entry.name else alternative_name}
        else:
            continue

        for name in names:
            if name is None:
                continue
            if name not in attachments:
                attachments[name] = entry.content
                continue
            # second hit for this name: switch to a list of versions
            if not isinstance(attachments[name], list):
                attachments[name] = [attachments[name]]  # type:ignore
            attachments[name].append(entry.content)  # type:ignore
    return attachments

1434 

@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Hook for displaying the document in Jupyter Notebooks.

    The result is a dictionary that maps a mime-type to its
    representation. Abstract: no implementation is provided here.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    ...  # pragma: no cover

1452 

1453 

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on access.

    Each stored value is a ``(callable, argument)`` pair. Looking a key
    up calls ``callable(argument)`` and returns the result; nothing is
    cached, so every lookup calls again.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # accepts the same arguments as dict()
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return self._raw_dict.__iter__()

    def __len__(self) -> int:
        return self._raw_dict.__len__()

    def __str__(self) -> str:
        # only the keys are shown, so printing does not run any loader
        return f"LazyDict(keys={list(self.keys())})"

1469 return f"LazyDict(keys={list(self.keys())})"