Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_reader.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

687 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import os 

31import re 

32from io import BytesIO, UnsupportedOperation 

33from pathlib import Path 

34from types import TracebackType 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 Callable, 

39 Dict, 

40 Iterable, 

41 List, 

42 Optional, 

43 Set, 

44 Tuple, 

45 Type, 

46 Union, 

47 cast, 

48) 

49 

50from ._doc_common import PdfDocCommon, convert_to_int 

51from ._encryption import Encryption, PasswordType 

52from ._utils import ( 

53 StrByteType, 

54 StreamType, 

55 logger_warning, 

56 read_non_whitespace, 

57 read_previous_line, 

58 read_until_whitespace, 

59 skip_over_comment, 

60 skip_over_whitespace, 

61) 

62from .constants import TrailerKeys as TK 

63from .errors import ( 

64 EmptyFileError, 

65 FileNotDecryptedError, 

66 PdfReadError, 

67 PdfStreamError, 

68 WrongPasswordError, 

69) 

70from .generic import ( 

71 ArrayObject, 

72 ContentStream, 

73 DecodedStreamObject, 

74 DictionaryObject, 

75 EncodedStreamObject, 

76 IndirectObject, 

77 NameObject, 

78 NullObject, 

79 NumberObject, 

80 PdfObject, 

81 StreamObject, 

82 TextStringObject, 

83 is_null_or_none, 

84 read_object, 

85) 

86from .xmp import XmpInformation 

87 

88if TYPE_CHECKING: 

89 from ._page import PageObject 

90 

91 

92class PdfReader(PdfDocCommon): 

93 """ 

94 Initialize a PdfReader object. 

95 

96 This operation can take some time, as the PDF stream's cross-reference 

97 tables are read into memory. 

98 

99 Args: 

100 stream: A File object or an object that supports the standard read 

101 and seek methods similar to a File object. Could also be a 

102 string representing a path to a PDF file. 

103 strict: Determines whether user should be warned of all 

104 problems and also causes some correctable problems to be fatal. 

105 Defaults to ``False``. 

106 password: Decrypt PDF file at initialization. If the 

107 password is None, the file will not be decrypted. 

108 Defaults to ``None``. 

109 

110 """ 

111 

def __init__(
    self,
    stream: Union[StrByteType, Path],
    strict: bool = False,
    password: Union[None, str, bytes] = None,
) -> None:
    """
    Set up the reader state, parse ``stream`` and handle encryption.

    Args:
        stream: File-like object or path; handed to ``_initialize_stream``.
        strict: Stored as ``self.strict`` and consulted by the parsing
            methods to decide between warning and raising.
        password: Passed to ``_handle_encryption`` when the document
            reports itself as encrypted.

    Raises:
        PdfReadError: If a password is given but the document is not
            encrypted.

    """
    self.strict = strict
    self.flattened_pages: Optional[List[PageObject]] = None

    #: Storage of parsed PDF objects.
    self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {}

    # Cross-reference bookkeeping; populated while the stream is parsed.
    self._startxref: int = 0
    self.xref_index = 0
    self.xref: Dict[int, Dict[Any, Any]] = {}
    self.xref_free_entry: Dict[int, Dict[Any, Any]] = {}
    self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
    self.trailer = DictionaryObject()

    # Map page indirect_reference number to page number
    self._page_id2num: Optional[Dict[Any, Any]] = None

    self._validated_root: Optional[DictionaryObject] = None

    # Parsing happens here; the attributes below are only set afterwards.
    self._initialize_stream(stream)
    self._known_objects: Set[Tuple[int, int]] = set()

    self._override_encryption = False
    self._encryption: Optional[Encryption] = None
    if not self.is_encrypted:
        if password is not None:
            raise PdfReadError("Not an encrypted file")
        return
    self._handle_encryption(password)

145 

def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None:
    """
    Normalise ``stream`` to a readable object, parse it and keep it.

    A ``str`` or ``Path`` is opened in binary mode and fully loaded into a
    ``BytesIO``; in that case ``self._stream_opened`` is set to ``True`` so
    that ``close()`` knows the reader owns the stream.

    Args:
        stream: File-like object, or a path to a PDF file.

    """
    if hasattr(stream, "mode"):
        if "b" not in stream.mode:
            logger_warning(
                "PdfReader stream/file object is not in binary mode. "
                "It may not be read correctly.",
                __name__,
            )
    given_as_path = isinstance(stream, (str, Path))
    self._stream_opened = False
    if given_as_path:
        with open(stream, "rb") as source_file:
            stream = BytesIO(source_file.read())
        self._stream_opened = True
    self.read(stream)
    # Note: self.stream is only assigned once read() has returned.
    self.stream = stream

160 

def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None:
    """
    Build the ``Encryption`` helper from the trailer and verify a password.

    ``self._override_encryption`` is switched on for the duration of the
    call and switched off again only when no error is raised.

    Args:
        password: Password to verify. When ``None``, the empty password
            ``b""`` is tried and a failure is not reported.

    Raises:
        WrongPasswordError: If an explicit password was given and
            verification reports ``PasswordType.NOT_DECRYPTED``.

    """
    self._override_encryption = True
    # Some documents may not have a /ID, use two empty
    # byte strings instead. Solves
    # https://github.com/py-pdf/pypdf/issues/608
    document_id = self.trailer.get(TK.ID)
    if document_id:
        first_id = document_id[0].get_object().original_bytes
    else:
        first_id = b""
    encrypt_dict = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
    self._encryption = Encryption.read(encrypt_dict, first_id)

    # try empty password if no password provided
    password_given = password is not None
    candidate = password if password_given else b""
    outcome = self._encryption.verify(candidate)
    if password_given and outcome == PasswordType.NOT_DECRYPTED:
        # raise if password provided
        raise WrongPasswordError("Wrong password")
    self._override_encryption = False

180 

def __enter__(self) -> "PdfReader":
    """Return the reader itself so it can be used in a ``with`` statement."""
    return self

183 

def __exit__(
    self,
    exc_type: Optional[Type[BaseException]],
    exc_val: Optional[BaseException],
    exc_tb: Optional[TracebackType],
) -> None:
    """
    Close the reader when the ``with`` block is left.

    The exception arguments are ignored and ``None`` is returned, so an
    exception raised inside the block is not suppressed.
    """
    self.close()

191 

def close(self) -> None:
    """
    Close the stream if the reader opened it itself, then drop parsed data.

    A stream supplied by the caller (``self._stream_opened`` is falsy) is
    left open. All cached structures are replaced by fresh empty ones.
    """
    if self._stream_opened:
        self.stream.close()
    # Replace every parsed structure with an empty one to release memory.
    self.xref, self.xref_free_entry, self.xref_objStm = {}, {}, {}
    self.resolved_objects = {}
    self.flattened_pages = []
    self.trailer = DictionaryObject()

202 

@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to "/Root". Standardized with PdfWriter.

    The result is memoised in ``self._validated_root``. Lookup order:

    1. The trailer's ``/Root`` entry, accepted when its ``/Type`` is
       ``/Catalog``.
    2. Otherwise, objects ``1 .. /Size`` are fetched one by one and the
       first dictionary whose ``/Type`` is ``/Catalog`` is used.
    3. Otherwise, the trailer's ``/Root`` entry is accepted anyway if it
       contains a ``/Pages`` key.

    Returns:
        The catalog dictionary.

    Raises:
        PdfReadError: If none of the three strategies yields a root.

    """
    if self._validated_root:
        return self._validated_root
    root = self.trailer.get(TK.ROOT)
    if is_null_or_none(root):
        logger_warning('Cannot find "/Root" key in trailer', __name__)
    elif (
        cast(DictionaryObject, cast(PdfObject, root).get_object()).get("/Type")
        == "/Catalog"
    ):
        self._validated_root = cast(
            DictionaryObject, cast(PdfObject, root).get_object()
        )
    else:
        logger_warning("Invalid Root object in trailer", __name__)
    if self._validated_root is None:
        # Fallback: scan every object number announced by the trailer's /Size.
        logger_warning('Searching object with "/Catalog" key', __name__)
        nb = cast(int, self.trailer.get("/Size", 0))
        for i in range(nb):
            try:
                o = self.get_object(i + 1)
            except Exception:  # to be sure to capture all errors
                o = None
            if isinstance(o, DictionaryObject) and o.get("/Type") == "/Catalog":
                self._validated_root = o
                logger_warning(f"Root found at {o.indirect_reference!r}", __name__)
                break
        if self._validated_root is None:
            # Last resort: accept a /Root that lacks /Type but has /Pages.
            if not is_null_or_none(root) and "/Pages" in cast(DictionaryObject, cast(PdfObject, root).get_object()):
                logger_warning(
                    f"Possible root found at {cast(PdfObject, root).indirect_reference!r}, but missing /Catalog key",
                    __name__
                )
                self._validated_root = cast(
                    DictionaryObject, cast(PdfObject, root).get_object()
                )
            else:
                raise PdfReadError("Cannot find Root object in pdf")
    return self._validated_root

244 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfWriter.

    Returns:
        The resolved /Info dictionary, or None when the trailer entry is
        missing or null.

    Raises:
        PdfReadError: If the entry resolves to something that is not a
            ``DictionaryObject``.

    """
    trailer_entry = self.trailer.get(TK.INFO, None)
    if is_null_or_none(trailer_entry):
        return None
    assert trailer_entry is not None, "mypy"
    resolved = trailer_entry.get_object()
    if isinstance(resolved, DictionaryObject):
        return resolved
    raise PdfReadError(
        "Trailer not found or does not point to a document information dictionary"
    )

264 

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to "/ID". Standardized with PdfWriter.

    Returns:
        The resolved /ID array, or None when the trailer entry is missing
        or null.

    """
    id_entry = self.trailer.get(TK.ID, None)
    if is_null_or_none(id_entry):
        return None
    assert id_entry is not None, "mypy"
    resolved = id_entry.get_object()
    return cast(ArrayObject, resolved)

279 

@property
def pdf_header(self) -> str:
    """
    Return the first 8 bytes of the stream, decoded as text.

    Usually this looks like ``'%PDF-1.6'``, which makes it suitable for
    checking that the file is a PDF and which version it declares. Bytes
    that are not valid UTF-8 are rendered as backslash escapes. The stream
    position is restored before returning.
    """
    # TODO: Make this return a bytes object for consistency
    #       but that needs a deprecation
    stream = self.stream
    saved_position = stream.tell()
    stream.seek(0, 0)
    raw_header = stream.read(8)
    header_text = raw_header.decode("utf-8", "backslashreplace")
    stream.seek(saved_position, 0)  # return to where it was
    return header_text

295 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    XMP (Extensible Metadata Platform) data.

    ``self._override_encryption`` is enabled while the value is read from
    the root object and is always reset to ``False`` afterwards, even when
    the lookup raises.
    """
    self._override_encryption = True
    try:
        metadata = self.root_object.xmp_metadata
        return cast(XmpInformation, metadata)
    finally:
        self._override_encryption = False

304 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Look up the index of the page that an indirect reference points to.

    The id-to-index map is built from ``self.pages`` on first use and then
    kept in ``self._page_id2num``.

    Args:
        indirect_reference: Object id as an int, an ``IndirectObject``, or
            a null/None value.

    Returns:
        The page index, or None for a null reference or an unknown id.

    """
    if self._page_id2num is None:
        id_to_index = {}
        for page_index, page in enumerate(self.pages):
            id_to_index[page.indirect_reference.idnum] = page_index  # type: ignore
        self._page_id2num = id_to_index

    if is_null_or_none(indirect_reference):
        return None
    assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"
    idnum = (
        indirect_reference
        if isinstance(indirect_reference, int)
        else indirect_reference.idnum
    )
    assert self._page_id2num is not None, "hint for mypy"
    return self._page_id2num.get(idnum, None)

332 

def _get_object_from_stream(
    self, indirect_reference: IndirectObject
) -> Union[int, PdfObject, str]:
    """
    Read an object that is stored inside an object stream (``/ObjStm``).

    ``self.xref_objStm`` maps the object id to ``(stream number, index)``.
    The stream is decoded, its ``/N`` pairs of (object number, offset) are
    scanned, and the object at ``/First + offset`` is parsed.

    Args:
        indirect_reference: Reference whose ``idnum`` is a key of
            ``self.xref_objStm``.

    Returns:
        The parsed object. In non-strict mode a ``NullObject`` is returned
        when the object cannot be parsed or is not listed in the stream.

    Raises:
        PdfReadError: In strict mode, when the object sits at a different
            index than recorded, cannot be parsed, or is not found.

    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, idx = self.xref_objStm[indirect_reference.idnum]
    obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object()  # type: ignore
    # This is an xref to a stream, so its type better be a stream
    assert cast(str, obj_stm["/Type"]) == "/ObjStm"
    stream_data = BytesIO(obj_stm.get_data())
    for i in range(obj_stm["/N"]):  # type: ignore
        # Each read_non_whitespace()/seek(-1, 1) pair presumably skips
        # whitespace and steps back onto the first significant byte.
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        objnum = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        offset = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        if objnum != indirect_reference.idnum:
            # We're only interested in one object
            continue
        if self.strict and idx != i:
            raise PdfReadError("Object is in wrong index.")
        stream_data.seek(int(obj_stm["/First"] + offset), 0)  # type: ignore

        # To cope with case where the 'pointer' is on a white space
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)

        try:
            obj = read_object(stream_data, self)
        except PdfStreamError as exc:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            logger_warning(
                f"Invalid stream (index {i}) within object "
                f"{indirect_reference.idnum} {indirect_reference.generation}: "
                f"{exc}",
                __name__,
            )

            if self.strict:  # pragma: no cover
                raise PdfReadError(
                    f"Cannot read object stream: {exc}"
                )  # pragma: no cover
            # Replace with null. Hopefully it's nothing important.
            obj = NullObject()  # pragma: no cover
        return obj

    # Reached only when no (object number, offset) pair matched.
    if self.strict:  # pragma: no cover
        raise PdfReadError(
            "This is a fatal error in strict mode."
        )  # pragma: no cover
    return NullObject()  # pragma: no cover

388 

def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Resolve an indirect reference to the object it designates.

    Lookup order: the cache of resolved objects, then object streams
    (``self.xref_objStm``), then the cross-reference table (``self.xref``),
    and finally a regular-expression scan of the whole file for
    ``<id> <generation> obj``. The result is stored in the cache.

    Args:
        indirect_reference: An ``IndirectObject``, or an object id as int
            (generation 0 is assumed).

    Returns:
        The resolved object; a ``NullObject`` for an entry flagged as free;
        ``None`` when the object cannot be located in non-strict mode.

    Raises:
        PdfReadError: In strict mode for a mismatching object header or a
            missing object; in any mode when resolving the object requires
            resolving itself (reference loop).
        FileNotDecryptedError: If the document is encrypted and has not
            been decrypted.

    """
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    retval = self.cache_get_indirect_object(
        indirect_reference.generation, indirect_reference.idnum
    )
    if retval is not None:
        return retval
    if (
        indirect_reference.generation == 0
        and indirect_reference.idnum in self.xref_objStm
    ):
        retval = self._get_object_from_stream(indirect_reference)  # type: ignore
    elif (
        indirect_reference.generation in self.xref
        and indirect_reference.idnum in self.xref[indirect_reference.generation]
    ):
        if self.xref_free_entry.get(indirect_reference.generation, {}).get(
            indirect_reference.idnum, False
        ):
            return NullObject()
        start = self.xref[indirect_reference.generation][indirect_reference.idnum]
        self.stream.seek(start, 0)
        try:
            idnum, generation = self.read_object_header(self.stream)
            if (
                idnum != indirect_reference.idnum
                or generation != indirect_reference.generation
            ):
                raise PdfReadError("Not matching, we parse the file for it")
        except Exception:
            # The xref offset is wrong: search the raw bytes for the header.
            if hasattr(self.stream, "getbuffer"):
                buf = bytes(self.stream.getbuffer())
            else:
                p = self.stream.tell()
                self.stream.seek(0, 0)
                buf = self.stream.read(-1)
                self.stream.seek(p, 0)
            m = re.search(
                rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
                buf,
            )
            if m is not None:
                logger_warning(
                    f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired",
                    __name__,
                )
                # +1 skips the whitespace byte matched by the leading \s.
                self.xref[indirect_reference.generation][
                    indirect_reference.idnum
                ] = (m.start(0) + 1)
                self.stream.seek(m.start(0) + 1)
                idnum, generation = self.read_object_header(self.stream)
            else:
                idnum = -1
                generation = -1  # exception will be raised below
        if idnum != indirect_reference.idnum and self.xref_index:
            # xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise PdfReadError(
                    f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                    f"does not match actual ({idnum} {generation}); "
                    "xref table not zero-indexed."
                )
            # xref table is corrected in non-strict mode
        elif idnum != indirect_reference.idnum and self.strict:
            # some other problem
            raise PdfReadError(
                f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                f"does not match actual ({idnum} {generation})."
            )
        if self.strict:
            assert generation == indirect_reference.generation

        current_object = (indirect_reference.idnum, indirect_reference.generation)
        if current_object in self._known_objects:
            raise PdfReadError(f"Detected loop with self reference for {indirect_reference!r}.")
        self._known_objects.add(current_object)
        try:
            retval = read_object(self.stream, self)  # type: ignore
        finally:
            # Always unmark the object: if parsing fails, a stale entry
            # would make every later lookup of this object report a
            # reference loop that does not exist.
            self._known_objects.discard(current_object)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self._encryption is not None:
            # if we don't have the encryption key:
            if not self._encryption.is_decrypted():
                raise FileNotDecryptedError("File has not been decrypted")
            # otherwise, decrypt here...
            retval = cast(PdfObject, retval)
            retval = self._encryption.decrypt_object(
                retval, indirect_reference.idnum, indirect_reference.generation
            )
    else:
        # Not referenced by any xref structure: scan the raw bytes.
        if hasattr(self.stream, "getbuffer"):
            buf = bytes(self.stream.getbuffer())
        else:
            p = self.stream.tell()
            self.stream.seek(0, 0)
            buf = self.stream.read(-1)
            self.stream.seek(p, 0)
        m = re.search(
            rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
            buf,
        )
        if m is not None:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} found",
                __name__,
            )
            if indirect_reference.generation not in self.xref:
                self.xref[indirect_reference.generation] = {}
            self.xref[indirect_reference.generation][indirect_reference.idnum] = (
                m.start(0) + 1
            )
            self.stream.seek(m.end(0) + 1)
            skip_over_whitespace(self.stream)
            self.stream.seek(-1, 1)
            retval = read_object(self.stream, self)  # type: ignore

            # override encryption is used for the /Encrypt dictionary
            if not self._override_encryption and self._encryption is not None:
                # if we don't have the encryption key:
                if not self._encryption.is_decrypted():
                    raise FileNotDecryptedError("File has not been decrypted")
                # otherwise, decrypt here...
                retval = cast(PdfObject, retval)
                retval = self._encryption.decrypt_object(
                    retval, indirect_reference.idnum, indirect_reference.generation
                )
        else:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.",
                __name__,
            )
            if self.strict:
                raise PdfReadError("Could not find object.")
    self.cache_indirect_object(
        indirect_reference.generation, indirect_reference.idnum, retval
    )
    return retval

529 

def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
    """
    Parse an ``<id> <generation> obj`` header at the current position.

    Args:
        stream: Stream positioned at, or shortly before, the header.

    Returns:
        Tuple ``(object id, generation)``.

    Raises:
        ValueError: If either header field is not an integer literal.

    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality some files have stupid cross-reference
    # tables that are off by whitespace bytes.
    skip_over_comment(stream)
    had_extra_whitespace = skip_over_whitespace(stream)
    stream.seek(-1, 1)

    header_fields = []
    for _ in range(2):  # object id first, then generation
        header_fields.append(read_until_whitespace(stream))
        had_extra_whitespace = had_extra_whitespace | skip_over_whitespace(stream)
        stream.seek(-1, 1)
    raw_idnum, raw_generation = header_fields

    # although it's not used, it might still be necessary to read
    stream.read(3)

    read_non_whitespace(stream)
    stream.seek(-1, 1)
    if had_extra_whitespace and self.strict:
        logger_warning(
            f"Superfluous whitespace found in object header {raw_idnum} {raw_generation}",  # type: ignore
            __name__,
        )
    return int(raw_idnum), int(raw_generation)

556 

def cache_get_indirect_object(
    self, generation: int, idnum: int
) -> Optional[PdfObject]:
    """
    Return the cached object for ``(generation, idnum)``, if any.

    Args:
        generation: Generation number of the object.
        idnum: Object id.

    Returns:
        The cached object, or None when nothing is cached under that key.

    Raises:
        PdfReadError: If the lookup hits the interpreter recursion limit.

    """
    cache_key = (generation, idnum)
    try:
        cached = self.resolved_objects.get(cache_key)
    except RecursionError:
        raise PdfReadError("Maximum recursion depth reached.")
    return cached

564 

def cache_indirect_object(
    self, generation: int, idnum: int, obj: Optional[PdfObject]
) -> Optional[PdfObject]:
    """
    Store ``obj`` in the cache under ``(generation, idnum)``.

    A non-None object also gets its ``indirect_reference`` attribute set to
    a new ``IndirectObject`` pointing back at this reader.

    Args:
        generation: Generation number of the object.
        idnum: Object id.
        obj: Object to cache; may be None.

    Returns:
        ``obj`` itself.

    Raises:
        PdfReadError: In strict mode, if the key is already cached. In
            non-strict mode a warning is logged and the entry is replaced.

    """
    cache_key = (generation, idnum)
    if cache_key in self.resolved_objects:
        msg = f"Overwriting cache for {generation} {idnum}"
        if self.strict:
            raise PdfReadError(msg)
        logger_warning(msg, __name__)
    self.resolved_objects[cache_key] = obj
    if obj is None:
        return obj
    obj.indirect_reference = IndirectObject(idnum, generation, self)
    return obj

577 

def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Swap the cached object behind ``indirect`` for ``obj``.

    Args:
        indirect: Reference that must belong to this reader and must
            already be present in the cache.
        obj: Replacement object; its ``indirect_reference`` is set to
            ``indirect``.

    Returns:
        ``obj`` itself.

    Raises:
        ValueError: If ``indirect`` belongs to another document or is not
            cached yet.

    """
    # function reserved for future development
    if indirect.pdf != self:
        raise ValueError("Cannot update PdfReader with external object")
    cache_key = (indirect.generation, indirect.idnum)
    if cache_key not in self.resolved_objects:
        raise ValueError("Cannot find referenced object")
    self.resolved_objects[cache_key] = obj
    obj.indirect_reference = indirect
    return obj

587 

def read(self, stream: StreamType) -> None:
    """
    Read and process the PDF stream, extracting necessary data.

    Steps: validate the header, locate the EOF marker and the ``startxref``
    offset, read every cross-reference table/stream with its trailer and,
    in non-strict mode, repair or drop cross-reference entries that do not
    point at a readable object header.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: In strict mode, if the cross-reference check reports
            an issue.

    """
    self._basic_validation(stream)
    self._find_eof_marker(stream)
    startxref = self._find_startxref_pos(stream)
    self._startxref = startxref

    # check and eventually correct the startxref only if not strict
    xref_issue_nr = self._get_xref_issues(stream, startxref)
    if xref_issue_nr != 0:
        if self.strict and xref_issue_nr:
            raise PdfReadError("Broken xref table")
        logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__)

    # read all cross-reference tables and their trailers
    self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr)

    # if not zero-indexed, verify that the table is correct; change it if necessary
    if self.xref_index and not self.strict:
        loc = stream.tell()
        # NOTE(review): _rebuild_xref_table() is called while iterating
        # self.xref.items(); confirm it does not resize that same dict.
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            xref_k = sorted(
                xref_entry.keys()
            )  # ensure ascending to prevent damage
            for id in xref_k:
                stream.seek(xref_entry[id], 0)
                try:
                    pid, _pgen = self.read_object_header(stream)
                except ValueError:
                    self._rebuild_xref_table(stream)
                    break
                if pid == id - self.xref_index:
                    # fixing index item per item is required for revised PDF.
                    self.xref[gen][pid] = self.xref[gen][id]
                    del self.xref[gen][id]
                # if not, then either it's just plain wrong, or the
                # non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was

    # remove wrong objects (not pointing to correct structures) - cf #2326
    if not self.strict:
        loc = stream.tell()
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            # Iterate over a copy of the keys because entries are deleted below.
            ids = list(xref_entry.keys())
            for id in ids:
                stream.seek(xref_entry[id], 0)
                try:
                    self.read_object_header(stream)
                except ValueError:
                    logger_warning(
                        f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})",
                        __name__,
                    )
                    del xref_entry[id]  # we can delete the id, we are parsing ids
        stream.seek(loc, 0)  # return to where it was

653 

def _basic_validation(self, stream: StreamType) -> None:
    """
    Ensure the stream is valid and not empty.

    The first five bytes are compared with ``%PDF-``. On return the stream
    is positioned at its end, ready for the backward search for the EOF
    marker.

    Args:
        stream: The PDF file stream.

    Raises:
        UnsupportedOperation: If reading the header raises
            ``UnicodeDecodeError``.
        EmptyFileError: If the stream yields no bytes at all.
        PdfReadError: In strict mode, if the header is not ``%PDF-``.

    """
    stream.seek(0, os.SEEK_SET)
    try:
        header_byte = stream.read(5)
    except UnicodeDecodeError:
        raise UnsupportedOperation("cannot read header")
    if header_byte == b"":
        raise EmptyFileError("Cannot read an empty file")
    if header_byte != b"%PDF-":
        if self.strict:
            # A bogus header is often not valid UTF-8. Escape undecodable
            # bytes so that the intended PdfReadError is raised instead of
            # a UnicodeDecodeError from building the message.
            shown_header = header_byte.decode("utf8", "backslashreplace")
            raise PdfReadError(
                f"PDF starts with '{shown_header}', "
                "but '%PDF-' expected"
            )
        logger_warning(f"invalid pdf header: {header_byte}", __name__)
    stream.seek(0, os.SEEK_END)

671 

def _find_eof_marker(self, stream: StreamType) -> None:
    """
    Jump to the %%EOF marker.

    According to the specs, the %%EOF marker should be at the very end of
    the file. Hence for standard-compliant PDF documents this function will
    read only the last part (DEFAULT_BUFFER_SIZE).

    Lines are read backwards from the current position until one starts
    with ``%%EOF``. If the first non-empty line ends with a prefix of that
    marker, the marker is treated as truncated and the search stops there.

    Args:
        stream: The PDF file stream, positioned where the backward search
            should begin.

    Raises:
        PdfReadError: In strict mode, if the search gets within the first
            8 bytes of the file without finding the marker.

    """
    HEADER_SIZE = 8  # to parse whole file, Header is e.g. '%PDF-1.6'
    line = b""
    # True until the first non-empty line has been inspected.
    first = True
    while not line.startswith(b"%%EOF"):
        if line != b"" and first:
            if any(
                line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
            ):
                # Consider the file as truncated while
                # having enough confidence to carry on.
                logger_warning("EOF marker seems truncated", __name__)
                break
            first = False
            if b"startxref" in line:
                logger_warning(
                    "CAUTION: startxref found while searching for %%EOF. "
                    "The file might be truncated and some data might not be read.",
                    __name__,
                )
        if stream.tell() < HEADER_SIZE:
            if self.strict:
                raise PdfReadError("EOF marker not found")
            logger_warning("EOF marker not found", __name__)
        line = read_previous_line(stream)

704 

def _find_startxref_pos(self, stream: StreamType) -> int:
    """
    Find startxref entry - the location of the xref table.

    The line preceding the current position is expected to hold the offset
    and the line before it the ``startxref`` keyword. Both on one line
    (``startxref 1234``) is tolerated with a warning.

    Args:
        stream: The PDF file stream, positioned just after the offset line.

    Returns:
        The bytes offset

    Raises:
        PdfReadError: If the ``startxref`` keyword is not where expected.

    """
    offset_line = read_previous_line(stream)
    try:
        position = int(offset_line)
    except ValueError:
        # 'startxref' may be on the same line as the location
        if offset_line.startswith(b"startxref"):
            position = int(offset_line[9:].strip())
            logger_warning("startxref on same line as offset", __name__)
            return position
        raise PdfReadError("startxref not found")
    keyword_line = read_previous_line(stream)
    if keyword_line.startswith(b"startxref"):
        return position
    raise PdfReadError("startxref not found")

730 

def _read_standard_xref_table(self, stream: StreamType) -> None:
    """
    Parse a classic cross-reference table into ``self.xref``.

    The stream must be positioned just after the leading ``x`` of the
    ``xref`` keyword. Every subsection (``<first id> <count>`` followed by
    ``count`` entries) is read until the ``trailer`` keyword has been
    consumed. Entries already known are left untouched.

    An entry that cannot be parsed is recovered by searching the file for
    ``<id> <generation> obj``. If the object is found and the entry type
    could not be read either, the entry is recorded as in use at the found
    position; if it is not found, it is recorded as free under generation
    65535.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: If the keyword is not ``xref`` or the table ends
            before all announced entries were read.

    """
    # standard cross-reference table
    ref = stream.read(3)
    if ref != b"ref":
        raise PdfReadError("xref table read error")
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    first_time = True  # check if the first time looking at the xref table
    while True:
        num = cast(int, read_object(stream, self))
        if first_time and num != 0:
            self.xref_index = num
            if self.strict:
                logger_warning(
                    "Xref table not zero-indexed. ID numbers for objects will be corrected.",
                    __name__,
                )
                # if table not zero indexed, could be due to error from when PDF was created
                # which will lead to mismatched indices later on, only warned and corrected if self.strict==True
        first_time = False
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        size = cast(int, read_object(stream, self))
        if not isinstance(size, int):
            logger_warning(
                "Invalid/Truncated xref table. Rebuilding it.",
                __name__,
            )
            self._rebuild_xref_table(stream)
            stream.read()
            return
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        cnt = 0
        while cnt < size:
            line = stream.read(20)
            if not line:
                raise PdfReadError("Unexpected empty line in Xref table.")

            # It's very clear in section 3.4.3 of the PDF spec
            # that all cross-reference table lines are a fixed
            # 20 bytes (as of PDF 1.7). However, some files have
            # 21-byte entries (or more) due to the use of \r\n
            # (CRLF) EOL's. Detect that case, and adjust the line
            # until it does not begin with a \r (CR) or \n (LF).
            while line[0] in b"\x0D\x0A":
                stream.seek(-20 + 1, 1)
                line = stream.read(20)

            # On the other hand, some malformed PDF files
            # use a single character EOL without a preceding
            # space. Detect that case, and seek the stream
            # back one character (0-9 means we've bled into
            # the next xref entry, t means we've bled into the
            # text "trailer"):
            if line[-1] in b"0123456789t":
                stream.seek(-1, 1)

            # Reset for every entry so that a failure while splitting the
            # line can neither leave the name unbound (first entry) nor
            # silently reuse the type of the previous entry.
            entry_type_b: Optional[bytes] = None
            try:
                offset_b, generation_b = line[:16].split(b" ")
                entry_type_b = line[17:18]

                offset, generation = int(offset_b), int(generation_b)
            except Exception:
                if hasattr(stream, "getbuffer"):
                    buf = bytes(stream.getbuffer())
                else:
                    p = stream.tell()
                    stream.seek(0, 0)
                    buf = stream.read(-1)
                    stream.seek(p)

                f = re.search(rf"{num}\s+(\d+)\s+obj".encode(), buf)
                if f is None:
                    logger_warning(
                        f"entry {num} in Xref table invalid; object not found",
                        __name__,
                    )
                    generation = 65535
                    offset = -1
                    if entry_type_b is None:
                        # Nothing usable in the line and no object in the
                        # file: treat the entry as free.
                        entry_type_b = b"f"
                else:
                    logger_warning(
                        f"entry {num} in Xref table invalid but object found",
                        __name__,
                    )
                    generation = int(f.group(1))
                    offset = f.start()
                    if entry_type_b is None:
                        # The object exists in the file, so it is in use.
                        entry_type_b = b"n"

            if generation not in self.xref:
                self.xref[generation] = {}
                self.xref_free_entry[generation] = {}
            if num in self.xref[generation]:
                # It really seems like we should allow the last
                # xref table in the file to override previous
                # ones. Since we read the file backwards, assume
                # any existing key is already set correctly.
                pass
            else:
                if entry_type_b == b"n":
                    self.xref[generation][num] = offset
                # Best effort: the free-entry maps may lack the key.
                try:
                    self.xref_free_entry[generation][num] = entry_type_b == b"f"
                except Exception:
                    pass
                try:
                    self.xref_free_entry[65535][num] = entry_type_b == b"f"
                except Exception:
                    pass
            cnt += 1
            num += 1
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        trailer_tag = stream.read(7)
        if trailer_tag != b"trailer":
            # more xrefs!
            stream.seek(-7, 1)
        else:
            break

849 

def _read_xref_tables_and_trailers(
    self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
    """
    Read the cross-reference tables and trailers in the PDF stream.

    Starting at ``startxref``, each cross-reference section is read and the
    chain of ``/Prev`` offsets is followed until there is none left. The
    parsed state (``xref``, ``xref_free_entry``, ``xref_objStm``,
    ``trailer``) is reset first.

    Args:
        stream: The PDF file stream.
        startxref: Offset of the most recent cross-reference section.
        xref_issue_nr: Non-zero when the ``startxref`` check reported a
            problem; a rebuild of the table is then attempted.

    Raises:
        PdfReadError: If a cross-reference stream cannot be read and no
            ``/Root`` has been collected from a later trailer.

    """
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while startxref is not None:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        # Tolerate an offset pointing at the line break before the section.
        if x in b"\r\n":
            x = stream.read(1)
        if x == b"x":
            # classic "xref" table
            startxref = self._read_xref(stream)
        elif xref_issue_nr:
            try:
                self._rebuild_xref_table(stream)
                break
            except Exception:
                # Rebuild failed: clear the flag so that the next pass
                # falls through to the remaining branches.
                xref_issue_nr = 0
        elif x.isdigit():
            # A digit here is taken as the start of a cross-reference stream object.
            try:
                xrefstream = self._read_pdf15_xref_stream(stream)
            except Exception as e:
                if TK.ROOT in self.trailer:
                    logger_warning(
                        f"Previous trailer cannot be read: {e.args}", __name__
                    )
                    break
                raise PdfReadError(f"Trailer cannot be read: {e!s}")
            self._process_xref_stream(xrefstream)
            if "/Prev" in xrefstream:
                startxref = cast(int, xrefstream["/Prev"])
            else:
                break
        else:
            startxref = self._read_xref_other_error(stream, startxref)

889 

def _process_xref_stream(self, xrefstream: DictionaryObject) -> None:
    """
    Merge trailer information carried by a cross-reference stream.

    The keys /Root, /Encrypt, /Info, /ID and /Size are copied (unresolved)
    into ``self.trailer`` unless already present. If the dictionary has an
    ``/XRefStm`` entry, the cross-reference stream it points to is read as
    well and the stream position is restored afterwards.

    Args:
        xrefstream: Dictionary of the cross-reference stream.

    """
    for trailer_key in (TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE):
        if trailer_key not in xrefstream or trailer_key in self.trailer:
            continue
        self.trailer[NameObject(trailer_key)] = xrefstream.raw_get(trailer_key)
    if "/XRefStm" not in xrefstream:
        return
    # NOTE(review): this relies on self.stream, which _initialize_stream
    # assigns only after read() returns - confirm it is set on this path.
    saved_position = self.stream.tell()
    extra_xref_offset = cast(int, xrefstream["/XRefStm"])
    self.stream.seek(extra_xref_offset + 1, 0)
    self._read_pdf15_xref_stream(self.stream)
    self.stream.seek(saved_position, 0)

901 

902 def _read_xref(self, stream: StreamType) -> Optional[int]: 

903 self._read_standard_xref_table(stream) 

904 if stream.read(1) == b"": 

905 return None 

906 stream.seek(-1, 1) 

907 read_non_whitespace(stream) 

908 stream.seek(-1, 1) 

909 new_trailer = cast(Dict[str, Any], read_object(stream, self)) 

910 for key, value in new_trailer.items(): 

911 if key not in self.trailer: 

912 self.trailer[key] = value 

913 if "/XRefStm" in new_trailer: 

914 p = stream.tell() 

915 stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) 

916 try: 

917 self._read_pdf15_xref_stream(stream) 

918 except Exception: 

919 logger_warning( 

920 f"XRef object at {new_trailer['/XRefStm']} can not be read, some object may be missing", 

921 __name__, 

922 ) 

923 stream.seek(p, 0) 

924 if "/Prev" in new_trailer: 

925 return new_trailer["/Prev"] 

926 return None 

927 

928 def _read_xref_other_error( 

929 self, stream: StreamType, startxref: int 

930 ) -> Optional[int]: 

931 # some PDFs have /Prev=0 in the trailer, instead of no /Prev 

932 if startxref == 0: 

933 if self.strict: 

934 raise PdfReadError( 

935 "/Prev=0 in the trailer (try opening with strict=False)" 

936 ) 

937 logger_warning( 

938 "/Prev=0 in the trailer - assuming there is no previous xref table", 

939 __name__, 

940 ) 

941 return None 

942 # bad xref character at startxref. Let's see if we can find 

943 # the xref table nearby, as we've observed this error with an 

944 # off-by-one before. 

945 stream.seek(-11, 1) 

946 tmp = stream.read(20) 

947 xref_loc = tmp.find(b"xref") 

948 if xref_loc != -1: 

949 startxref -= 10 - xref_loc 

950 return startxref 

951 # No explicit xref table, try finding a cross-reference stream. 

952 stream.seek(startxref, 0) 

953 for look in range(25): # value extended to cope with more linearized files 

954 if stream.read(1).isdigit(): 

955 # This is not a standard PDF, consider adding a warning 

956 startxref += look 

957 return startxref 

958 # no xref table found at specified location 

959 if "/Root" in self.trailer and not self.strict: 

960 # if Root has been already found, just raise warning 

961 logger_warning("Invalid parent xref., rebuild xref", __name__) 

962 try: 

963 self._rebuild_xref_table(stream) 

964 return None 

965 except Exception: 

966 raise PdfReadError("Cannot rebuild xref") 

967 raise PdfReadError("Could not find xref table at specified location") 

968 

def _read_pdf15_xref_stream(
    self, stream: StreamType
) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]:
    """
    Read the cross-reference stream for PDF 1.5+.

    The stream is moved back by one byte before the object header is read.
    The stream object is cached as an indirect object and its entries are
    handed to ``_read_xref_subsections``.

    Args:
        stream: The stream, positioned one byte after the start of the
            object header.

    Returns:
        The cross-reference stream object.

    Raises:
        PdfReadError: The object is not of type ``/XRef``, its ``/W`` entry
            is missing or has fewer than three elements, or (strict mode
            only) ``/W`` has more than three elements.

    """
    stream.seek(-1, 1)
    idnum, generation = self.read_object_header(stream)
    xrefstream = cast(ContentStream, read_object(stream, self))
    if cast(str, xrefstream["/Type"]) != "/XRef":
        raise PdfReadError(f"Unexpected type {xrefstream['/Type']!r}")
    self.cache_indirect_object(generation, idnum, xrefstream)
    stream_data = BytesIO(xrefstream.get_data())
    # Index pairs specify the subsections in the dictionary.
    # If none, create one subsection that spans everything.
    idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
    entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W"))
    # The value comes from the file, so validate it explicitly instead of
    # using an assert (which is removed when Python runs with -O).
    if entry_sizes is None or len(entry_sizes) < 3:
        raise PdfReadError(f"Invalid /W entry in xref stream: {entry_sizes!r}")
    if self.strict and len(entry_sizes) > 3:
        raise PdfReadError(f"Too many entry sizes: {entry_sizes}")

    def get_entry(i: int) -> Union[int, Tuple[int, ...]]:
        # Reads the correct number of bytes for each entry. See the
        # discussion of the W parameter in PDF spec table 17.
        if entry_sizes[i] > 0:
            d = stream_data.read(entry_sizes[i])
            return convert_to_int(d, entry_sizes[i])

        # PDF Spec Table 17: A value of zero for an element in the
        # W array indicates...the default value shall be used
        if i == 0:
            return 1  # First value defaults to 1
        return 0

    def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool:
        # We move backwards through the xrefs, don't replace any.
        return num in self.xref.get(generation, []) or num in self.xref_objStm  # type: ignore

    # Iterate through each subsection
    self._read_xref_subsections(idx_pairs, get_entry, used_before)
    return xrefstream

1008 

1009 @staticmethod 

1010 def _get_xref_issues(stream: StreamType, startxref: int) -> int: 

1011 """ 

1012 Return an int which indicates an issue. 0 means there is no issue. 

1013 

1014 Args: 

1015 stream: 

1016 startxref: 

1017 

1018 Returns: 

1019 0 means no issue, other values represent specific issues. 

1020 

1021 """ 

1022 if startxref == 0: 

1023 return 4 

1024 

1025 stream.seek(startxref - 1, 0) # -1 to check character before 

1026 line = stream.read(1) 

1027 if line == b"j": 

1028 line = stream.read(1) 

1029 if line not in b"\r\n \t": 

1030 return 1 

1031 line = stream.read(4) 

1032 if line != b"xref": 

1033 # not a xref so check if it is an XREF object 

1034 line = b"" 

1035 while line in b"0123456789 \t": 

1036 line = stream.read(1) 

1037 if line == b"": 

1038 return 2 

1039 line += stream.read(2) # 1 char already read, +2 to check "obj" 

1040 if line.lower() != b"obj": 

1041 return 3 

1042 return 0 

1043 

def _rebuild_xref_table(self, stream: StreamType) -> None:
    """
    Rebuild the cross-reference data by scanning the complete stream.

    Three passes are made over the content: every ``<id> <gen> obj``
    header is recorded in ``self.xref``, every recorded object of type
    ``/ObjStm`` is read to fill ``self.xref_objStm``, and every
    ``trailer`` dictionary is merged into ``self.trailer``.

    Args:
        stream: The stream to scan; it is read from the beginning.

    """
    self.xref = {}
    stream.seek(0, 0)
    raw_pdf = stream.read(-1)

    # Pass 1: offsets of all object headers.
    header_pattern = rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"
    for header in re.finditer(header_pattern, raw_pdf):
        obj_id = int(header.group(1))
        obj_gen = int(header.group(2))
        self.xref.setdefault(obj_gen, {})[obj_id] = header.start(1)

    # Pass 2: number pairs found at the start of object streams.
    logger_warning("parsing for Object Streams", __name__)
    for gen in self.xref:
        for container_id in self.xref[gen]:
            # get_object in manual
            stream.seek(self.xref[gen][container_id], 0)
            try:
                self.read_object_header(stream)
                candidate = cast(StreamObject, read_object(stream, self))
                if candidate.get("/Type", "") != "/ObjStm":
                    continue
                payload = BytesIO(candidate.get_data())
                found = 0
                while True:
                    token = read_until_whitespace(payload)
                    if not token.isdigit():
                        break
                    first_number = int(token)
                    skip_over_whitespace(payload)
                    payload.seek(-1, 1)
                    token = read_until_whitespace(payload)
                    if not token.isdigit():  # pragma: no cover
                        break  # pragma: no cover
                    second_number = int(token)
                    self.xref_objStm[first_number] = (container_id, second_number)
                    found += 1
                if found != candidate.get("/N"):  # pragma: no cover
                    logger_warning(  # pragma: no cover
                        f"found {found} objects within Object({container_id},{gen})"
                        f" whereas {candidate.get('/N')} expected",
                        __name__,
                    )
            except Exception:  # could be multiple causes
                # Best effort: an unreadable object is skipped.
                pass

    # Pass 3: trailers.
    stream.seek(0, 0)
    trailer_pattern = rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)"
    for trailer_match in re.finditer(trailer_pattern, raw_pdf):
        stream.seek(trailer_match.start(1), 0)
        recovered = cast(Dict[Any, Any], read_object(stream, self))
        # The file is parsed from start to end here, so new data has to
        # replace the existing entries.
        for key, value in list(recovered.items()):
            self.trailer[key] = value

1097 

1098 def _read_xref_subsections( 

1099 self, 

1100 idx_pairs: List[int], 

1101 get_entry: Callable[[int], Union[int, Tuple[int, ...]]], 

1102 used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], 

1103 ) -> None: 

1104 """Read and process the subsections of the xref.""" 

1105 for start, size in self._pairs(idx_pairs): 

1106 # The subsections must increase 

1107 for num in range(start, start + size): 

1108 # The first entry is the type 

1109 xref_type = get_entry(0) 

1110 # The rest of the elements depend on the xref_type 

1111 if xref_type == 0: 

1112 # linked list of free objects 

1113 next_free_object = get_entry(1) # noqa: F841 

1114 next_generation = get_entry(2) # noqa: F841 

1115 elif xref_type == 1: 

1116 # objects that are in use but are not compressed 

1117 byte_offset = get_entry(1) 

1118 generation = get_entry(2) 

1119 if generation not in self.xref: 

1120 self.xref[generation] = {} # type: ignore 

1121 if not used_before(num, generation): 

1122 self.xref[generation][num] = byte_offset # type: ignore 

1123 elif xref_type == 2: 

1124 # compressed objects 

1125 objstr_num = get_entry(1) 

1126 obstr_idx = get_entry(2) 

1127 generation = 0 # PDF spec table 18, generation is 0 

1128 if not used_before(num, generation): 

1129 self.xref_objStm[num] = (objstr_num, obstr_idx) 

1130 elif self.strict: 

1131 raise PdfReadError(f"Unknown xref type: {xref_type}") 

1132 

1133 def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: 

1134 """Iterate over pairs in the array.""" 

1135 i = 0 

1136 while i + 1 < len(array): 

1137 yield array[i], array[i + 1] 

1138 i += 2 

1139 

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
    """
    Decrypt a file that uses the PDF Standard encryption handler.

    The password is checked against the user password and the owner
    password of the document. If either one matches, the resulting
    decryption key is stored.

    Which of the two passwords matched is irrelevant for further use: both
    give the correct decryption key for working with the document in this
    library.

    Args:
        password: The password to match.

    Returns:
        An indicator if the document was decrypted and whether it was the
        owner password or the user password.

    Raises:
        PdfReadError: The file is not encrypted.

    """
    if self._encryption:
        # TODO: raise Exception for wrong password
        return self._encryption.verify(password)
    raise PdfReadError("Not encrypted file")

1164 

@property
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only).

    The value is derived from the trailer only, so it stays true after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    encrypt_key = TK.ENCRYPT
    return encrypt_key in self.trailer

1174 

def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Insert a new top level form field above all existing form fields.

    The existing ``/Fields`` array becomes the ``/Kids`` of the new field,
    the new field becomes the only entry of ``/Fields``, and every former
    top level field gets the new field as its ``/Parent``.

    Args:
        name: text string of the "/T" Attribute of the created object

    Returns:
        The created object. ``None`` means no object was created, which
        happens when the catalog has no ``/AcroForm`` dictionary or the
        form has no ``/Fields``.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog:
        return None
    if not isinstance(catalog["/AcroForm"], DictionaryObject):
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        # TODO: No error but this may be extended for XFA Forms
        return None

    top_field = DictionaryObject()
    top_field[NameObject("/T")] = TextStringObject(name)
    top_field[NameObject("/Kids")] = acroform[NameObject("/Fields")]

    # Register the new field one number above the highest generation 0
    # object resolved so far.
    next_id = max(i for (g, i) in self.resolved_objects if g == 0) + 1
    self.cache_indirect_object(0, next_id, top_field)

    new_fields = ArrayObject()
    new_fields.append(top_field.indirect_reference)
    acroform[NameObject("/Fields")] = new_fields

    for kid in cast(ArrayObject, top_field["/Kids"]):
        field = kid.get_object()
        if "/Parent" in field:
            logger_warning(
                f"Top Level Form Field {field.indirect_reference} have a non-expected parent",
                __name__,
            )
        field[NameObject("/Parent")] = top_field.indirect_reference
    return top_field

1217 

def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Rename the top level form field that groups all form fields below it.

    The first entry of the ``/Fields`` array of the ``/AcroForm`` is
    treated as the top level field and receives the new ``/T`` value.

    Args:
        name: text string of the "/T" field of the modified object

    Returns:
        The modified object. ``None`` means no object was modified, which
        happens when the catalog has no ``/AcroForm`` dictionary, the form
        has no ``/Fields``, or ``/Fields`` is empty.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not isinstance(
        catalog["/AcroForm"], DictionaryObject
    ):
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        return None

    fields = cast(ArrayObject, acroform[NameObject("/Fields")])
    if not fields:
        # An empty array has no top level field to rename; indexing it
        # would raise IndexError instead of reporting "nothing modified".
        return None

    interim = cast(DictionaryObject, fields[0].get_object())
    interim[NameObject("/T")] = TextStringObject(name)
    return interim

1245 

1246 def _repr_mimebundle_( 

1247 self, 

1248 include: Union[None, Iterable[str]] = None, 

1249 exclude: Union[None, Iterable[str]] = None, 

1250 ) -> Dict[str, Any]: 

1251 """ 

1252 Integration into Jupyter Notebooks. 

1253 

1254 This method returns a dictionary that maps a mime-type to its 

1255 representation. 

1256 

1257 .. seealso:: 

1258 

1259 https://ipython.readthedocs.io/en/stable/config/integrating.html 

1260 """ 

1261 self.stream.seek(0) 

1262 pdf_data = self.stream.read() 

1263 data = { 

1264 "application/pdf": pdf_data, 

1265 } 

1266 

1267 if include is not None: 

1268 # Filter representations based on include list 

1269 data = {k: v for k, v in data.items() if k in include} 

1270 

1271 if exclude is not None: 

1272 # Remove representations based on exclude list 

1273 data = {k: v for k, v in data.items() if k not in exclude} 

1274 

1275 return data