Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

654 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# Copyright (c) 2024, Pubpub-ZZ 

4# 

5# All rights reserved. 

6# 

7# Redistribution and use in source and binary forms, with or without 

8# modification, are permitted provided that the following conditions are 

9# met: 

10# 

11# * Redistributions of source code must retain the above copyright notice, 

12# this list of conditions and the following disclaimer. 

13# * Redistributions in binary form must reproduce the above copyright notice, 

14# this list of conditions and the following disclaimer in the documentation 

15# and/or other materials provided with the distribution. 

16# * The name of the author may not be used to endorse or promote products 

17# derived from this software without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

29# POSSIBILITY OF SUCH DAMAGE. 

30 

31import struct 

32import zlib 

33from abc import abstractmethod 

34from collections.abc import Generator, Iterable, Iterator, Mapping 

35from datetime import datetime 

36from typing import ( 

37 Any, 

38 Optional, 

39 Union, 

40 cast, 

41) 

42 

43from ._encryption import Encryption 

44from ._page import PageObject, _VirtualList 

45from ._page_labels import index2label as page_index2page_label 

46from ._utils import ( 

47 deprecation_with_replacement, 

48 logger_warning, 

49 parse_iso8824_date, 

50) 

51from .constants import CatalogAttributes as CA 

52from .constants import CatalogDictionary as CD 

53from .constants import ( 

54 CheckboxRadioButtonAttributes, 

55 GoToActionArguments, 

56 PagesAttributes, 

57 UserAccessPermissions, 

58) 

59from .constants import Core as CO 

60from .constants import DocumentInformationAttributes as DI 

61from .constants import FieldDictionaryAttributes as FA 

62from .constants import PageAttributes as PG 

63from .errors import PdfReadError, PyPdfError 

64from .generic import ( 

65 ArrayObject, 

66 BooleanObject, 

67 ByteStringObject, 

68 Destination, 

69 DictionaryObject, 

70 EncodedStreamObject, 

71 Field, 

72 Fit, 

73 FloatObject, 

74 IndirectObject, 

75 NameObject, 

76 NullObject, 

77 NumberObject, 

78 PdfObject, 

79 TextStringObject, 

80 TreeObject, 

81 ViewerPreferences, 

82 create_string_object, 

83 is_null_or_none, 

84) 

85from .generic._files import EmbeddedFile 

86from .types import OutlineType, PagemodeType 

87from .xmp import XmpInformation 

88 

89 

def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode a big-endian byte string of at most 8 bytes as a signed integer.

    Args:
        d: The raw bytes to decode.
        size: The declared width in bytes; values above 8 are rejected.

    Returns:
        The trailing 8 bytes of the zero-padded input, interpreted as a
        signed 64-bit big-endian integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Left-pad with zero bytes up to 8 bytes; if the input is longer,
    # only its last 8 bytes are decoded.
    padded = d.rjust(8, b"\x00")[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

96 

97 

class DocumentInformation(DictionaryObject):
    """
    Dictionary holding the basic document metadata of a PDF file.

    Instances are handed out by
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every text entry is exposed twice, e.g. ``author`` and ``author_raw``.
    The plain property yields text: ``TextStringObject`` entries are returned
    as stored and ``ByteStringObject`` entries are converted with ``str()``.
    The ``*_raw`` property returns the stored entry untouched, which can be a
    ``ByteStringObject`` when pypdf was unable to decode the text encoding,
    so callers of the raw variants have to cope with that themselves.
    """

    def __init__(self) -> None:
        DictionaryObject.__init__(self)

    def _get_text(self, key: str) -> Optional[str]:
        """Return the entry stored under ``key`` as text, or ``None`` if it is not a string object."""
        entry = self.get(key, None)
        if isinstance(entry, ByteStringObject):
            return str(entry)
        return entry if isinstance(entry, TextStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Falls back to the resolved stored object when the entry is truthy but
        is not a string object; returns ``None`` when the entry is missing or
        falsy.
        """
        stored = self.get(DI.TITLE)
        if not stored:
            return None
        return self._get_text(DI.TITLE) or stored.get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Stored title entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """Read-only access to the document's author, or ``None`` if unavailable as text."""
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Stored author entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """Read-only access to the document's subject, or ``None`` if unavailable as text."""
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Stored subject entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For documents converted to PDF from another format this names the
        application (e.g. OpenOffice) that created the original document.
        ``None`` if unavailable as text.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Stored creator entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For documents converted to PDF from another format this names the
        application (for example, macOS Quartz) that performed the conversion.
        ``None`` if unavailable as text.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Stored producer entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Creation date, obtained by passing the text entry to ``parse_iso8824_date``."""
        return parse_iso8824_date(self._get_text(DI.CREATION_DATE))

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Stored creation date entry as-is; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Date and time the document was most recently modified.

        Obtained by passing the text entry to ``parse_iso8824_date``.
        """
        return parse_iso8824_date(self._get_text(DI.MOD_DATE))

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Stored modification date entry as-is; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """Read-only access to the document's keywords, or ``None`` if unavailable as text."""
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Stored keywords entry as-is; can be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

256 

257 

258class PdfDocCommon: 

259 """ 

260 Common functions from PdfWriter and PdfReader objects. 

261 

262 This root class is strongly abstracted. 

263 """ 

264 

265 strict: bool = False # default 

266 

267 flattened_pages: Optional[list[PageObject]] = None 

268 

269 _encryption: Optional[Encryption] = None 

270 

271 _readonly: bool = False 

272 

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
    """
    Return the dictionary that this class treats as the document catalog.

    Abstract: concrete subclasses must provide it. The other members of this
    class look up entries such as ``/Pages`` and ``/OpenAction`` in it.
    """
    ...  # pragma: no cover

277 

@property
@abstractmethod
def pdf_header(self) -> str:
    """
    Return a string describing the PDF header.

    Abstract: concrete subclasses must provide it.
    NOTE(review): presumably the ``%PDF-x.y`` header line of the file;
    confirm against the concrete implementations.
    """
    ...  # pragma: no cover

282 

@abstractmethod
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Return the object designated by ``indirect_reference``, or ``None``.

    Abstract: concrete subclasses must provide it.
    NOTE(review): an ``int`` argument is presumably an object number;
    confirm against the concrete implementations.
    """
    ...  # pragma: no cover

288 

@abstractmethod
def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Store ``obj`` in place of the object referenced by ``indirect``.

    Abstract: concrete subclasses must provide it. ``viewer_preferences``
    calls it with an object's own indirect reference after wrapping that
    object in a ``ViewerPreferences`` instance.
    """
    ...  # pragma: no cover

292 

@property
@abstractmethod
def _info(self) -> Optional[DictionaryObject]:
    """
    Return the dictionary backing ``metadata``, or ``None`` if there is none.

    Abstract: concrete subclasses must provide it. ``metadata`` copies the
    entries of the returned dictionary into a ``DocumentInformation``.
    """
    ...  # pragma: no cover

297 

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new ``DocumentInformation`` filled with the entries of ``_info``,
        or ``None`` when ``_info`` is ``None``.

    """
    # Read the property once: subclasses may compute it on every access.
    info = self._info
    if info is None:
        return None
    # Only build the result object when there is something to put in it.
    document_information = DocumentInformation()
    document_information.update(info)
    return document_information

312 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    Placeholder for XMP metadata access.

    The body is only an ellipsis, so this base implementation returns
    ``None``. NOTE(review): subclasses presumably override it; confirm.
    """
    ...  # pragma: no cover

316 

@property
def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the catalog's viewer preferences as a ``ViewerPreferences`` object.

    Returns ``None`` when the catalog has no such entry. If the stored
    object is not yet a ``ViewerPreferences``, it is wrapped in one and the
    wrapper is written back: through ``_replace_object`` when it carries an
    indirect reference, directly into the catalog otherwise.
    """
    prefs = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if prefs is None:
        return None
    prefs = prefs.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs

331 

def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    # Flattening does not work on an encrypted PDF, so the count stored in
    # the page tree root is reported in that case.
    if self.is_encrypted:
        return self.root_object["/Pages"]["/Count"]  # type: ignore
    # Otherwise count the flattened page list, building it on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None
    return len(pages)

352 

def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    # Build the flattened page list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]

370 

def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Locate the page tree node whose /Kids array directly holds a page.

    Returns the node together with the page's position inside its /Kids.
    If page_number is greater than the number of pages, it returns the
    top node, -1.
    """
    top = cast("DictionaryObject", self.root_object["/Pages"])

    def locate(
        node: DictionaryObject, first_index: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``first_index`` is the number of the first page below ``node``.
        # On a miss the second item is the first page number after ``node``.
        page_count = cast("int", node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":
            if page_number == first_index:
                return node, -1  # -1 tells the caller "this kid is the page"
            return None, first_index + 1
        if (page_number - first_index) >= page_count:  # not in nodes below
            if node == top:
                return top, -1
            return None, first_index + page_count
        for position, kid in enumerate(cast("ArrayObject", node["/Kids"])):
            child = cast("DictionaryObject", kid.get_object())
            found, next_index = locate(child, first_index)
            if found is not None:  # page has just been found ...
                # ... just below (negative index) or at lower levels
                return (node, position) if next_index < 0 else (found, next_index)
            first_index = next_index
        raise PyPdfError("Unexpectedly cannot find the node.")

    result_node, result_index = locate(top, 0)
    assert isinstance(result_node, DictionaryObject), "mypy"
    return result_node, result_index

407 

@property
def named_destinations(self) -> dict[str, Destination]:
    """
    A read-only dictionary which maps names to destinations.

    The mapping is rebuilt by ``_get_named_destinations`` on every access.
    """
    return self._get_named_destinations()

412 

def get_named_dest_root(self) -> ArrayObject:
    """
    Return the /Names array of the catalog's /Names -> /Dests dictionary.

    Missing levels are created and registered through ``_add_object`` when
    this object offers that method. Otherwise an unattached empty array is
    returned. An existing /Dests dictionary without a /Names array always
    gets a fresh empty array attached.
    """
    named_dest = ArrayObject()
    catalog = self.root_object
    can_add_objects = hasattr(self, "_add_object")

    names = catalog[CA.NAMES] if CA.NAMES in catalog else None
    if not isinstance(names, DictionaryObject):
        # No usable /Names dictionary: build /Names and /Dests from scratch.
        if can_add_objects:
            names = DictionaryObject()
            catalog[NameObject(CA.NAMES)] = self._add_object(names)
            dests = DictionaryObject()
            names[NameObject(CA.DESTS)] = self._add_object(dests)
            dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    dests = names[CA.DESTS] if CA.DESTS in names else None
    if isinstance(dests, DictionaryObject):
        # §3.6.3 Name Dictionary (PDF spec 1.7)
        if CA.NAMES in dests:
            # §7.9.6, entries in a name tree node dictionary
            return cast("ArrayObject", dests[CA.NAMES])
        dests[NameObject(CA.NAMES)] = named_dest
        return named_dest

    # /Names exists but has no usable /Dests dictionary.
    if can_add_objects:
        dests = DictionaryObject()
        names[NameObject(CA.DESTS)] = self._add_object(dests)
        dests[NameObject(CA.NAMES)] = named_dest
    return named_dest

445 

446 ## common 

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    A call without ``retval`` is the top-level call: it starts from the
    catalog's ``/Dests`` entry or, failing that, from ``/Names`` ->
    ``/Dests``. Nested calls pass the node to inspect and the shared result
    dictionary, which is filled in place.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        retval = {}
        catalog = self.root_object

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names:
                tree = cast(TreeObject, names[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
    # §7.9.6, entries in a name tree node dictionary
    elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        # The entry is walked as a flat sequence of key, value, key, value, ...
        names = cast(DictionaryObject, tree[CA.NAMES])
        i = 0
        while i < len(names):
            key = names[i].get_object()
            i += 1
            if not isinstance(key, (bytes, str)):
                # Not a usable key: advance by one item only, so the next
                # item is tried as a key.
                continue
            try:
                value = names[i].get_object()
            except IndexError:
                # A trailing key without a value ends the scan.
                break
            i += 1
            if isinstance(value, DictionaryObject):
                # A dictionary value carries the destination under /D;
                # dictionaries without /D are skipped.
                if "/D" in value:
                    value = value["/D"]
                else:
                    continue
            dest = self._build_destination(key, value)
            if dest is not None:
                # Registered under the destination's /Title ...
                retval[cast(str, dest["/Title"])] = dest
                # Remain backwards-compatible.
                # ... and under str(key).
                retval[str(key)] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
        # Every item of the dictionary is itself a name -> destination pair.
        for k__, v__ in tree.items():
            val = v__.get_object()
            if isinstance(val, DictionaryObject):
                if "/D" in val:
                    val = val["/D"].get_object()
                else:
                    continue
            dest = self._build_destination(k__, val)
            if dest is not None:
                retval[k__] = dest
    return retval

519 

520 # A select group of relevant field attributes. For the complete list, 

521 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification. 

522 

def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())

    if retval is None:
        # Top-level call: start from the catalog's AcroForm tree.
        catalog = self.root_object
        if CD.ACRO_FORM not in catalog:
            return None
        retval = {}
        stack = []
        tree = cast("Optional[TreeObject]", catalog[CD.ACRO_FORM])

    if tree is None:
        return retval
    assert stack is not None

    if "/Fields" in tree:
        for field_ref in cast("ArrayObject", tree["/Fields"]):
            self._build_field(
                field_ref.get_object(), retval, fileobj, field_attributes, stack
            )
    elif any(attr in tree for attr in field_attributes):
        # Tree is a field
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval

572 

573 def _get_qualified_field_name(self, parent: DictionaryObject) -> str: 

574 if "/TM" in parent: 

575 return cast(str, parent["/TM"]) 

576 if "/Parent" in parent: 

577 return ( 

578 self._get_qualified_field_name( 

579 cast(DictionaryObject, parent["/Parent"]) 

580 ) 

581 + "." 

582 + cast(str, parent.get("/T", "")) 

583 ) 

584 return cast(str, parent.get("/T", "")) 

585 

def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register one form field in ``retval`` and descend into its kids.

    Objects with neither ``/T`` nor ``/TM`` are ignored. For the others a
    ``Field`` is stored under the qualified field name, a report is written
    to ``fileobj`` when one is given, and a ``/_States_`` entry is added
    for choice fields and buttons.

    NOTE(review): the indentation of this function was reconstructed from a
    flattened listing; confirm the nesting of the ``NoToggleToOff`` check
    against the upstream file.
    """
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    obj = retval[key].indirect_reference.get_object()  # to get the full object
    if obj.get(FA.FT, "") == "/Ch" and obj.get(NameObject(FA.Opt)):
        # Choice field: the states are its options.
        retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
    if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
        # Checkbox
        # The states are the keys of the /N appearance dictionary, with
        # "/Off" appended when it is not among them.
        retval[key][NameObject("/_States_")] = ArrayObject(
            list(obj["/AP"]["/N"].keys())
        )
        if "/Off" not in retval[key]["/_States_"]:
            retval[key][NameObject("/_States_")].append(NameObject("/Off"))
    elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # Button with the Radio flag set: collect the distinct /N appearance
        # keys of all kids, in first-seen order.
        states: list[str] = []
        retval[key][NameObject("/_States_")] = ArrayObject(states)
        for k in obj.get(FA.Kids, {}):
            k = k.get_object()
            for s in list(k["/AP"]["/N"].keys()):
                if s not in states:
                    states.append(s)
            retval[key][NameObject("/_States_")] = ArrayObject(states)
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in retval[key]["/_States_"]
        ):
            # With the NoToggleToOff flag set, "/Off" is dropped from the states.
            del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)

627 

def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Recurse into the kids of ``tree`` unless it was already visited.

    ``stack`` records every object handled so far. Meeting one of them
    again logs a warning and stops the descent.
    """
    if tree in stack:
        logger_warning(
            f"{self._get_qualified_field_name(tree)} already parsed", __name__
        )
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for kid_ref in tree[PagesAttributes.KIDS]:  # type: ignore
        self.get_fields(kid_ref.get_object(), retval, fileobj, stack)

646 

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``label: value`` line to ``fileobj`` for each known attribute of ``field``.

    Kids and additional actions are skipped, the field type is written with
    a readable name, and a parent is written by name only. Attributes the
    field does not have produce no output.
    """
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (
        FA.Kids,
        FA.AA,
    )
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Make the field type value clearer
                field_type = field[attr]
                if field_type in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field_type]}\n")
            elif attr == FA.Parent:
                # Let's just write the name of the parent
                parent = field[attr]
                try:
                    name = parent[FA.TM]
                except KeyError:
                    name = parent[FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass

683 

def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: to get full name

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field.

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(k: str, fields: dict[Any, Any]) -> str:
        # First occurrence keeps the plain name; later ones get ".2", ".3", ...
        if k not in fields:
            return k
        prefix = k + "."
        already_suffixed = sum(1 for existing in fields if existing.startswith(prefix))
        return prefix + str(already_suffixed + 2)

    # Retrieve document form fields
    formfields = self.get_fields()
    if formfields is None:
        return {}

    ff = {}
    for qualified_name, field in formfields.items():
        if field.get("/FT") != "/Tx":
            continue
        if full_qualified_name:
            key = qualified_name
        else:
            key = indexed_key(cast("str", field["/T"]), ff)
        ff[key] = field.get("/V")
    return ff

721 

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If ``field`` cannot be resolved through its indirect
            reference, or if neither it nor any ancestor has ``/FT``.

    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look ``key`` up on the object itself, then along the /Parent chain.
        if key in obj:
            return obj[key]
        if "/Parent" in obj:
            return _get_inherited(
                cast("DictionaryObject", obj["/Parent"].get_object()), key
            )
        return None

    def _pages_of_widget(widget: Any) -> list[Any]:
        # A widget names its page through /P; without it, search the
        # annotations of every page for the widget's reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            p
            for p in self.pages
            # The default must be an empty non-string container: testing a
            # non-string for membership in "" raises TypeError on pages
            # that have no /Annots.
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast("DictionaryObject", field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_get_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    ret: list[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)
    # Anything that is not already a PageObject is mapped to the matching
    # entry of self.pages.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]

791 

@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object and an array entry as a
    ``Destination`` titled ``"OpenAction"``; any other entry yields ``None``.

    Raises:
        Exception: If a destination is invalid.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if not isinstance(oa, ArrayObject):
        return None
    try:
        page, typ, *array = oa
        fit = Fit(typ, tuple(array))
        return Destination("OpenAction", page, fit)
    except Exception as exc:
        # Chain explicitly so the original failure is recorded as the cause.
        raise Exception(f"Invalid Destination {oa}: {exc}") from exc


@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """Reject assignment: this class provides no setter for ``open_destination``."""
    raise NotImplementedError("No setter for open_destination")

825 

@property
def outline(self) -> OutlineType:
    """
    Read-only property for the outline present in the document
    (i.e., a collection of 'outline items' which are also known as
    'bookmarks').

    The outline is rebuilt by ``_get_outline`` on every access.
    """
    return self._get_outline()

834 

def _get_outline(
    self,
    node: Optional[DictionaryObject] = None,
    outline: Optional[Any] = None,
    visited: Optional[set[int]] = None,
) -> OutlineType:
    """
    Build the outline as a nested list of outline items.

    A call without ``outline`` is the top-level call: it starts from the
    ``/First`` entry of the catalog's outline dictionary and refreshes
    ``self._named_destinations``. Nested calls receive the first item of a
    sub-outline, the list to fill and the set of items already seen.

    Args:
        node: First outline item of the level to read.
        outline: List receiving the items of this level. A non-empty
            sub-outline is appended as a nested list after its parent item.
        visited: ``id()`` values of the items already processed, used to
            stop on cycles.

    Returns:
        The ``outline`` list.

    NOTE(review): the indentation of this function was reconstructed from a
    flattened listing; confirm where the ``_named_destinations`` assignment
    is nested against the upstream file.
    """
    if outline is None:
        outline = []
        catalog = self.root_object

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            lines = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(lines, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(lines) and "/First" in lines:
                node = cast(DictionaryObject, lines["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # see if there are any more outline items
    if visited is None:
        visited = set()
    while True:
        # Items are tracked by object identity; meeting one again means the
        # /Next chain loops back on itself.
        node_id = id(node)
        if node_id in visited:
            logger_warning(f"Detected cycle in outline structure for {node}", __name__)
            break
        visited.add(node_id)

        outline_obj = self._build_outline_item(node)
        if outline_obj:
            outline.append(outline_obj)

        # check for sub-outline
        if "/First" in node:
            sub_outline: list[Any] = []
            # Pass a copy to allow multiple outer entries to reference the same inner one.
            inner_visited = visited.copy()
            self._get_outline(
                node=cast(DictionaryObject, node["/First"]),
                outline=sub_outline,
                visited=inner_visited,
            )
            if sub_outline:
                outline.append(sub_outline)

        if "/Next" not in node:
            break
        node = cast(DictionaryObject, node["/Next"])

    return outline

892 

@property
def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    It is an array of dictionaries with "/F" (the first bead in the thread)
    and "/I" (a thread information dictionary containing information about
    the thread, such as its title, author, and creation date) properties or
    None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    if CO.THREADS not in catalog:
        return None
    return cast("ArrayObject", catalog[CO.THREADS])

913 

@abstractmethod
def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Return the page number matching ``indirect_reference``, or ``None``.

    Abstract: concrete subclasses must provide it. ``get_page_number`` and
    ``get_destination_page_number`` delegate to it, and
    ``get_pages_showing_field`` uses the result as an index into
    ``self.pages``.
    """
    ...  # pragma: no cover

919 

def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    # The lookup is keyed on the page's indirect reference.
    reference = page.indirect_reference
    return self._get_page_number_by_indirect(reference)

933 

def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    # The lookup is keyed on the destination's ``page`` attribute.
    target = destination.page
    return self._get_page_number_by_indirect(target)

946 

def _build_destination(
    self,
    title: Union[str, bytes],
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` named ``title`` from a destination array.

    The array is read as ``[page, fit type, *fit arguments]``. A missing
    array (``None``, a null object, a string, or an empty array) yields a
    destination on a null page. When the fit cannot be built, a warning is
    logged and, unless ``strict`` is set, the destination points to the
    first page instead.

    Raises:
        PdfReadError: If the fit is invalid and ``strict`` is set.

    """
    # handle outline items with missing or invalid destination
    has_no_target = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if has_no_target:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *fit_args = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=fit_args))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title!r} {fit_args}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        target = NullObject() if first_page_ref is None else first_page_ref
        return Destination(title, target, Fit.fit())

976 

def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Build the Destination that represents one outline item node.

    The target is read from the node's "/A" action (only "/GoTo" is
    handled) or, when there is no "/A", from "/Dest". A string target is
    looked up in ``self._named_destinations``. Afterwards "/C", "/F" and
    "/Count" are copied from the node when present, "/%is_open%" is set,
    and the node and its indirect reference (if any) are attached.

    Args:
        node: The outline item dictionary.

    Returns:
        The Destination built for the node.

    Raises:
        PdfReadError: In strict mode, when "/Title" is missing, a "/GoTo"
            action has no "/D" entry, or the destination has an
            unexpected type.

    """
    dest, title, outline_item = None, None, None

    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        # non-strict mode: fall back to an empty title
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if isinstance(dest, ArrayObject):
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[dest].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif dest is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, dest)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            f"Removed unexpected destination {dest!r} from destination",
            __name__,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
        if "/F" in node:
            # specifies style characteristics bold and/or italic
            # with 1=italic, 2=bold, 3=both
            outline_item[NameObject("/F")] = node["/F"]
        if "/Count" in node:
            # absolute value = num. visible children
            # with positive = open/unfolded, negative = closed/folded
            outline_item[NameObject("/Count")] = node["/Count"]
        # if count is 0 we will consider it as open (to have available is_open)
        outline_item[NameObject("/%is_open%")] = BooleanObject(
            node.get("/Count", 0) >= 0
        )
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        # node has no indirect_reference attribute; leave the item as is
        pass
    return outline_item

1053 

@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>`
    instances of the document, supporting access to a page or a range of pages.

    Note:
        For PdfWriter only: a page or a range of pages can be removed from
        the list with the del operator. Remember: Only the page entry is
        removed, as the objects beneath can be used elsewhere. A solution
        to completely remove them - if they are not used anywhere - is to
        write to a buffer/temporary file and then load it into a new
        PdfWriter.

    """
    count_pages = self.get_num_pages
    fetch_page = self.get_page
    return _VirtualList(count_pages, fetch_page)  # type: ignore

1070 

@property
def page_labels(self) -> list[str]:
    """
    The label of every page in this document.

    This property is read-only. Labels are listed in the same order as
    the pages appear in the document.
    """
    page_count = len(self.pages)
    labels = []
    for index in range(page_count):
        labels.append(page_index2page_label(self, index))
    return labels

1080 

@property
def page_layout(self) -> Optional[str]:
    """
    The page layout stored in the document catalog, or None if unset.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # no layout entry in the catalog
        return None
    return cast(NameObject, layout)

1108 

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    The page mode stored in the document catalog, or None if unset.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # no "/PageMode" entry in the catalog
        return None
    return mode  # type: ignore

1134 

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: When true, the PageObject entries appended to
            ``flattened_pages`` are not populated from the page dictionary.
        pages: Page tree node to process. None starts from the "/Pages"
            entry of the catalog and resets ``flattened_pages``.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is not modified.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If "/Pages" is not a dictionary, a node lists itself
            as its own kid, or the recursion limit is hit.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: attributes defined on this node must be
        # visible to its descendants only. Mutating the caller's mapping
        # would leak them into sibling subtrees processed afterwards.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        # a fresh object() never compares equal, so nodes without a
        # reference cannot trigger the cycle check below
        pages_reference = getattr(pages, "indirect_reference", object())
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            if getattr(page, "indirect_reference", object()) == pages_reference:
                raise PdfReadError("Detected cyclic page references.")

            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore

1219 

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove page from pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved or the page number is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    # resolve a reference to the page object it points at
    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        page = resolved

    # turn a page object into its position in the flattened list
    if not isinstance(page, int):
        try:
            page = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return

    if page < 0 or page >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    ind = self.pages[page].indirect_reference
    del self.pages[page]
    if ind is not None and clean:
        self._replace_object(ind, NullObject())

1263 

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper to fetch an object by its number and generation.

    This is equivalent to generic.IndirectObject(num,gen,self).get_object()

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1279 

def decode_permissions(
    self, permissions_code: int
) -> dict[str, bool]:  # pragma: no cover
    """
    Take the permissions as an integer, return the allowed access.

    Deprecated: emits a deprecation notice pointing at
    ``user_access_permissions``.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    flag_by_name = {
        "print": UserAccessPermissions.PRINT,
        "modify": UserAccessPermissions.MODIFY,
        "copy": UserAccessPermissions.EXTRACT,
        "annotations": UserAccessPermissions.ADD_OR_MODIFY,
        "forms": UserAccessPermissions.FILL_FORM_FIELDS,
        # Do not fix typo, as part of official, but deprecated API.
        "accessability": UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS,
        "assemble": UserAccessPermissions.ASSEMBLE_DOC,
        "print_high_quality": UserAccessPermissions.PRINT_TO_REPRESENTATION,
    }

    allowed: dict[str, bool] = {}
    for key, flag in flag_by_name.items():
        # a permission is granted when its flag bit is set in the code
        allowed[key] = (permissions_code & flag) != 0
    return allowed

1306 

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """Get the user access permissions for encrypted documents. Returns None if not encrypted."""
    encryption = self._encryption
    return None if encryption is None else UserAccessPermissions(encryption.P)

1313 

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    Abstract: subclasses provide the implementation.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
    """
    ...  # pragma: no cover

1324 

@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    Decompressed XFA packets found in the document's "/AcroForm".

    The "/XFA" entry is read as a flat sequence of alternating tags and
    stream references. A trailing tag without a following entry is
    ignored instead of raising StopIteration.

    Returns:
        None if the catalog has no (or an empty) "/AcroForm". Otherwise a
        dictionary mapping each tag to the zlib-decompressed data of the
        stream that follows it. Entries that are not indirect references,
        or that resolve to an empty object, are left out.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: dict[str, Any] = {}

    if "/XFA" in tree:
        # NOTE(review): assumes "/XFA" is an array of tag/stream pairs;
        # other shapes are not handled here.
        fields = cast(ArrayObject, tree["/XFA"])
        entries = iter(fields)
        # zip over the same iterator yields consecutive (tag, packet)
        # pairs and stops cleanly when the array has an odd length
        for tag, packet in zip(entries, entries):
            if isinstance(packet, IndirectObject):
                field = cast(Optional[EncodedStreamObject], packet.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval

1348 

@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """
    Mapping of attachment filenames to their content.

    The content for a filename is loaded when that key is accessed.
    """
    lazy_entries = {}
    for name in self._list_attachments():
        # stored as (loader, argument); LazyDict calls it on lookup
        lazy_entries[name] = (self._get_attachment_list, name)
    return LazyDict(lazy_entries)

1358 

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Iterable of attachment objects loaded from the document catalog."""
    embedded_files = EmbeddedFile._load(self.root_object)
    yield from embedded_files

1363 

1364 def _list_attachments(self) -> list[str]: 

1365 """ 

1366 Retrieves the list of filenames of file attachments. 

1367 

1368 Returns: 

1369 list of filenames 

1370 

1371 """ 

1372 names = [] 

1373 for entry in self.attachment_list: 

1374 names.append(entry.name) 

1375 if (name := entry.alternative_name) != entry.name and name: 

1376 names.append(name) 

1377 return names 

1378 

1379 def _get_attachment_list(self, name: str) -> list[bytes]: 

1380 out = self._get_attachments(name)[name] 

1381 if isinstance(out, list): 

1382 return out 

1383 return [out] 

1384 

1385 def _get_attachments( 

1386 self, filename: Optional[str] = None 

1387 ) -> dict[str, Union[bytes, list[bytes]]]: 

1388 """ 

1389 Retrieves all or selected file attachments of the PDF as a dictionary of file names 

1390 and the file data as a bytestring. 

1391 

1392 Args: 

1393 filename: If filename is None, then a dictionary of all attachments 

1394 will be returned, where the key is the filename and the value 

1395 is the content. Otherwise, a dictionary with just a single key 

1396 - the filename - and its content will be returned. 

1397 

1398 Returns: 

1399 dictionary of filename -> Union[bytestring or List[ByteString]] 

1400 If the filename exists multiple times a list of the different versions will be provided. 

1401 

1402 """ 

1403 attachments: dict[str, Union[bytes, list[bytes]]] = {} 

1404 for entry in self.attachment_list: 

1405 names = set() 

1406 alternative_name = entry.alternative_name 

1407 if filename is not None: 

1408 if filename in {entry.name, alternative_name}: 

1409 name = entry.name if filename == entry.name else alternative_name 

1410 names.add(name) 

1411 else: 

1412 continue 

1413 else: 

1414 names = {entry.name, alternative_name} 

1415 

1416 for name in names: 

1417 if name is None: 

1418 continue 

1419 if name in attachments: 

1420 if not isinstance(attachments[name], list): 

1421 attachments[name] = [attachments[name]] # type:ignore 

1422 attachments[name].append(entry.content) # type:ignore 

1423 else: 

1424 attachments[name] = entry.content 

1425 return attachments 

1426 

@abstractmethod
def _repr_mimebundle_(
    self,
    include: Union[None, Iterable[str]] = None,
    exclude: Union[None, Iterable[str]] = None,
) -> dict[str, Any]:
    """
    Integration into Jupyter Notebooks.

    This method returns a dictionary that maps a mime-type to its
    representation. Abstract: subclasses provide the implementation.

    Args:
        include: Presumably the mime-types to include; see the linked
            IPython documentation - TODO confirm.
        exclude: Presumably the mime-types to exclude; see the linked
            IPython documentation - TODO confirm.

    Returns:
        Dictionary mapping a mime-type to its representation.

    .. seealso::

        https://ipython.readthedocs.io/en/stable/config/integrating.html
    """
    ...  # pragma: no cover

1444 

1445 

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on lookup.

    Each stored value is a ``(function, argument)`` pair; reading a key
    calls ``function(argument)`` and returns the result.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Accept the same arguments as ``dict``; values must be (function, argument) pairs."""
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        """Evaluate and return the value for ``key``; raises KeyError if absent."""
        producer, argument = self._raw_dict[key]
        return producer(argument)

    def __iter__(self) -> Iterator[Any]:
        """Iterate over the keys without evaluating any value."""
        return self._raw_dict.__iter__()

    def __len__(self) -> int:
        """Number of keys."""
        return self._raw_dict.__len__()

    def __str__(self) -> str:
        """Show the keys only, so printing does not evaluate values."""
        return f"LazyDict(keys={list(self.keys())})"

1461 return f"LazyDict(keys={list(self.keys())})"