Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_reader.py: 36%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

690 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import os 

31import re 

32from collections.abc import Iterable 

33from io import BytesIO, UnsupportedOperation 

34from pathlib import Path 

35from types import TracebackType 

36from typing import ( 

37 TYPE_CHECKING, 

38 Any, 

39 Callable, 

40 Optional, 

41 Union, 

42 cast, 

43) 

44 

45from ._doc_common import PdfDocCommon, convert_to_int 

46from ._encryption import Encryption, PasswordType 

47from ._utils import ( 

48 StrByteType, 

49 StreamType, 

50 logger_warning, 

51 read_non_whitespace, 

52 read_previous_line, 

53 read_until_whitespace, 

54 skip_over_comment, 

55 skip_over_whitespace, 

56) 

57from .constants import TrailerKeys as TK 

58from .errors import ( 

59 EmptyFileError, 

60 FileNotDecryptedError, 

61 PdfReadError, 

62 PdfStreamError, 

63 WrongPasswordError, 

64) 

65from .generic import ( 

66 ArrayObject, 

67 ContentStream, 

68 DecodedStreamObject, 

69 DictionaryObject, 

70 EncodedStreamObject, 

71 IndirectObject, 

72 NameObject, 

73 NullObject, 

74 NumberObject, 

75 PdfObject, 

76 StreamObject, 

77 TextStringObject, 

78 is_null_or_none, 

79 read_object, 

80) 

81from .xmp import XmpInformation 

82 

83if TYPE_CHECKING: 

84 from ._page import PageObject 

85 

86 

87class PdfReader(PdfDocCommon): 

88 """ 

89 Initialize a PdfReader object. 

90 

91 This operation can take some time, as the PDF stream's cross-reference 

92 tables are read into memory. 

93 

94 Args: 

95 stream: A File object or an object that supports the standard read 

96 and seek methods similar to a File object. Could also be a 

97 string representing a path to a PDF file. 

98 strict: Determines whether user should be warned of all 

99 problems and also causes some correctable problems to be fatal. 

100 Defaults to ``False``. 

101 password: Decrypt PDF file at initialization. If the 

102 password is None, the file will not be decrypted. 

103 Defaults to ``None``. 

104 

105 """ 

106 

def __init__(
    self,
    stream: Union[StrByteType, Path],
    strict: bool = False,
    password: Union[None, str, bytes] = None,
) -> None:
    """
    Set up the reader state, parse ``stream`` and handle encryption.

    Args:
        stream: File-like object or path; handed to ``_initialize_stream``.
        strict: Stored as ``self.strict`` and consulted by the parsing
            methods to decide between warning and raising.
        password: Password used to decrypt the document. ``None`` means
            no password was supplied.

    Raises:
        PdfReadError: If a password is supplied but the file is not
            encrypted.

    """
    self.strict = strict
    self.flattened_pages: Optional[list[PageObject]] = None

    #: Storage of parsed PDF objects.
    self.resolved_objects: dict[tuple[Any, Any], Optional[PdfObject]] = {}

    self._startxref: int = 0
    self.xref_index = 0
    self.xref: dict[int, dict[Any, Any]] = {}
    self.xref_free_entry: dict[int, dict[Any, Any]] = {}
    self.xref_objStm: dict[int, tuple[Any, Any]] = {}
    self.trailer = DictionaryObject()

    # Map page indirect_reference number to page number
    self._page_id2num: Optional[dict[Any, Any]] = None

    self._validated_root: Optional[DictionaryObject] = None

    # get_object() reads these three attributes. They are assigned before
    # the stream is parsed so that any object lookup triggered while the
    # cross-reference data is being read finds them defined.
    self._known_objects: set[tuple[int, int]] = set()
    self._override_encryption = False
    self._encryption: Optional[Encryption] = None

    self._initialize_stream(stream)

    if self.is_encrypted:
        self._handle_encryption(password)
    elif password is not None:
        raise PdfReadError("Not an encrypted file")

140 

def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None:
    """
    Parse ``stream`` and keep it as ``self.stream``.

    A ``str`` or ``Path`` is opened in binary mode and fully loaded into an
    in-memory ``BytesIO``; in that case ``_stream_opened`` is set so that
    ``close()`` knows the reader owns the buffer.

    Args:
        stream: Path-like value or an already opened file-like object.

    """
    opened_in_text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_in_text_mode:
        logger_warning(
            "PdfReader stream/file object is not in binary mode. "
            "It may not be read correctly.",
            __name__,
        )

    self._stream_opened = False
    given_as_path = isinstance(stream, (str, Path))
    if given_as_path:
        with open(stream, "rb") as source_file:
            stream = BytesIO(source_file.read())
        # Only flagged once the file content has been read successfully.
        self._stream_opened = True

    self.read(stream)
    self.stream = stream

155 

def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None:
    """
    Build ``self._encryption`` from the trailer and verify the password.

    ``_override_encryption`` is switched on while the /Encrypt dictionary
    is read and is always switched off again, even when an exception
    escapes.

    Args:
        password: Password to verify. When ``None`` the empty password is
            tried and a failed verification is not treated as an error.

    Raises:
        WrongPasswordError: If a password was supplied and verification
            reports ``PasswordType.NOT_DECRYPTED``.

    """
    self._override_encryption = True
    try:
        # Some documents may not have a /ID, use two empty
        # byte strings instead. Solves
        # https://github.com/py-pdf/pypdf/issues/608
        id_entry = self.trailer.get(TK.ID)
        id1_entry = id_entry[0].get_object().original_bytes if id_entry else b""
        encrypt_entry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
        self._encryption = Encryption.read(encrypt_entry, id1_entry)

        # try empty password if no password provided
        pwd = password if password is not None else b""
        if (
            self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED
            and password is not None
        ):
            # raise if password provided
            raise WrongPasswordError("Wrong password")
    finally:
        # Never leave the override flag set: get_object() skips decryption
        # while it is True.
        self._override_encryption = False

175 

176 def __enter__(self) -> "PdfReader": 

177 return self 

178 

179 def __exit__( 

180 self, 

181 exc_type: Optional[type[BaseException]], 

182 exc_val: Optional[BaseException], 

183 exc_tb: Optional[TracebackType], 

184 ) -> None: 

185 self.close() 

186 

def close(self) -> None:
    """Close the stream if opened in __init__ and clear memory."""
    if self._stream_opened:
        self.stream.close()

    # Reset the parsed-document containers to empty values.
    self.flattened_pages = []
    self.resolved_objects = {}
    self.trailer = DictionaryObject()
    self.xref, self.xref_free_entry, self.xref_objStm = {}, {}, {}

197 

@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to "/Root". Standardized with PdfWriter.

    Lookup order:

    1. the trailer's "/Root" entry, if it resolves to a dictionary whose
       "/Type" is "/Catalog";
    2. the first object among ids 1.."/Size" that is a dictionary with
       "/Type" equal to "/Catalog";
    3. the trailer's "/Root" entry if it at least contains "/Pages".

    The result is cached in ``_validated_root``.

    Raises:
        PdfReadError: If none of the lookups yields a root object.

    """
    if self._validated_root:
        return self._validated_root

    trailer_root = self.trailer.get(TK.ROOT)
    if is_null_or_none(trailer_root):
        logger_warning('Cannot find "/Root" key in trailer', __name__)
    else:
        declared_type = cast(
            DictionaryObject, cast(PdfObject, trailer_root).get_object()
        ).get("/Type")
        if declared_type == "/Catalog":
            self._validated_root = cast(
                DictionaryObject, cast(PdfObject, trailer_root).get_object()
            )
        else:
            logger_warning("Invalid Root object in trailer", __name__)

    if self._validated_root is None:
        # Fallback: scan the object ids for something typed as a catalog.
        logger_warning('Searching object with "/Catalog" key', __name__)
        object_count = cast(int, self.trailer.get("/Size", 0))
        for index in range(object_count):
            try:
                candidate = self.get_object(index + 1)
            except Exception:  # to be sure to capture all errors
                candidate = None
            if (
                isinstance(candidate, DictionaryObject)
                and candidate.get("/Type") == "/Catalog"
            ):
                self._validated_root = candidate
                logger_warning(
                    f"Root found at {candidate.indirect_reference!r}", __name__
                )
                break

    if self._validated_root is None:
        # Last resort: accept an untyped trailer root that has "/Pages".
        usable_without_type = not is_null_or_none(
            trailer_root
        ) and "/Pages" in cast(
            DictionaryObject, cast(PdfObject, trailer_root).get_object()
        )
        if not usable_without_type:
            raise PdfReadError("Cannot find Root object in pdf")
        logger_warning(
            f"Possible root found at {cast(PdfObject, trailer_root).indirect_reference!r}, but missing /Catalog key",
            __name__
        )
        self._validated_root = cast(
            DictionaryObject, cast(PdfObject, trailer_root).get_object()
        )
    return self._validated_root

239 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfWriter.

    Returns:
        /Info Dictionary; None if the entry does not exist

    Raises:
        PdfReadError: If the entry resolves to something that is not a
            ``DictionaryObject``.

    """
    trailer_entry = self.trailer.get(TK.INFO, None)
    if is_null_or_none(trailer_entry):
        return None
    assert trailer_entry is not None, "mypy"

    resolved = trailer_entry.get_object()
    if isinstance(resolved, DictionaryObject):
        return resolved
    raise PdfReadError(
        "Trailer not found or does not point to a document information dictionary"
    )

259 

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to "/ID". Standardized with PdfWriter.

    Returns:
        /ID array; None if the entry does not exist

    """
    id_entry = self.trailer.get(TK.ID, None)
    if is_null_or_none(id_entry):
        return None
    assert id_entry is not None, "mypy"
    resolved = id_entry.get_object()
    return cast(ArrayObject, resolved)

274 

@property
def pdf_header(self) -> str:
    """
    The first 8 bytes of the file.

    This is typically something like ``'%PDF-1.6'`` and can be used to
    detect if the file is actually a PDF file and which version it is.

    The stream position is restored before returning. Bytes that are not
    valid UTF-8 are rendered as backslash escapes.
    """
    # TODO: Make this return a bytes object for consistency
    #       but that needs a deprecation
    stream = self.stream
    saved_position = stream.tell()
    stream.seek(0, os.SEEK_SET)
    raw_header = stream.read(8)
    stream.seek(saved_position, os.SEEK_SET)  # return to where it was
    return raw_header.decode("utf-8", "backslashreplace")

290 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    XMP (Extensible Metadata Platform) data.

    ``_override_encryption`` is set while the value is read from the root
    object and cleared again afterwards, whether or not the read succeeds.
    """
    self._override_encryption = True
    try:
        metadata = self.root_object.xmp_metadata
    finally:
        self._override_encryption = False
    return cast(XmpInformation, metadata)

299 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Retrieve the page number from an indirect reference.

    The id-to-index map is built from ``self.pages`` on first use and
    cached in ``_page_id2num``.

    Args:
        indirect_reference: The indirect reference to locate.

    Returns:
        Page number or None.

    """
    if self._page_id2num is None:
        id_to_index = {}
        for position, page in enumerate(self.pages):
            id_to_index[page.indirect_reference.idnum] = position  # type: ignore
        self._page_id2num = id_to_index

    if is_null_or_none(indirect_reference):
        return None
    assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"

    wanted_id = (
        indirect_reference
        if isinstance(indirect_reference, int)
        else indirect_reference.idnum
    )
    assert self._page_id2num is not None, "hint for mypy"
    return self._page_id2num.get(wanted_id, None)

327 

def _get_object_from_stream(
    self, indirect_reference: IndirectObject
) -> Union[int, PdfObject, str]:
    """
    Read one object out of the object stream that ``xref_objStm`` maps it to.

    The object stream's data is decoded into memory, its leading
    (object number, offset) pairs are scanned until the pair for
    ``indirect_reference.idnum`` is found, and the object is parsed at
    ``/First`` + offset.

    Args:
        indirect_reference: Reference whose ``idnum`` is a key of
            ``self.xref_objStm``.

    Returns:
        The parsed object, or a ``NullObject`` in non-strict mode when the
        object cannot be read or is not listed in the stream.

    Raises:
        PdfReadError: In strict mode, when the object sits at a different
            index than recorded, cannot be read, or is not listed.

    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, idx = self.xref_objStm[indirect_reference.idnum]
    obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object()  # type: ignore
    # This is an xref to a stream, so its type better be a stream
    assert cast(str, obj_stm["/Type"]) == "/ObjStm"
    stream_data = BytesIO(obj_stm.get_data())
    for i in range(obj_stm["/N"]):  # type: ignore
        # Each read_non_whitespace()/seek(-1, 1) pair positions the buffer
        # on the next non-whitespace byte without consuming it.
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        objnum = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        offset = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        if objnum != indirect_reference.idnum:
            # We're only interested in one object
            continue
        if self.strict and idx != i:
            raise PdfReadError("Object is in wrong index.")
        stream_data.seek(int(obj_stm["/First"] + offset), 0)  # type: ignore

        # To cope with case where the 'pointer' is on a white space
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)

        try:
            obj = read_object(stream_data, self)
        except PdfStreamError as exc:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            logger_warning(
                f"Invalid stream (index {i}) within object "
                f"{indirect_reference.idnum} {indirect_reference.generation}: "
                f"{exc}",
                __name__,
            )

            if self.strict:  # pragma: no cover
                raise PdfReadError(
                    f"Cannot read object stream: {exc}"
                )  # pragma: no cover
            # Replace with null. Hopefully it's nothing important.
            obj = NullObject()  # pragma: no cover
        return obj

    # Reached only when no header pair matched the requested object number.
    if self.strict:  # pragma: no cover
        raise PdfReadError(
            "This is a fatal error in strict mode."
        )  # pragma: no cover
    return NullObject()  # pragma: no cover

383 

def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Resolve an indirect reference to the object it points to.

    Lookup order: the ``resolved_objects`` cache, then object streams
    (``xref_objStm``), then the offset recorded in ``xref``, and finally a
    regex scan of the whole file for ``"<id> <gen> obj"``. The result is
    stored in the cache before it is returned.

    Args:
        indirect_reference: Reference to resolve. A plain ``int`` is
            treated as object id with generation 0.

    Returns:
        The resolved object; a ``NullObject`` for entries flagged as free;
        ``None`` when the object cannot be found in non-strict mode.

    Raises:
        PdfReadError: On a self-referencing object, or in strict mode when
            the object header does not match or the object is missing.
        FileNotDecryptedError: If the document is encrypted and has not
            been decrypted.

    """
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    retval = self.cache_get_indirect_object(
        indirect_reference.generation, indirect_reference.idnum
    )
    if retval is not None:
        return retval
    if (
        indirect_reference.generation == 0
        and indirect_reference.idnum in self.xref_objStm
    ):
        retval = self._get_object_from_stream(indirect_reference)  # type: ignore
    elif (
        indirect_reference.generation in self.xref
        and indirect_reference.idnum in self.xref[indirect_reference.generation]
    ):
        if self.xref_free_entry.get(indirect_reference.generation, {}).get(
            indirect_reference.idnum, False
        ):
            return NullObject()
        start = self.xref[indirect_reference.generation][indirect_reference.idnum]
        self.stream.seek(start, 0)
        try:
            idnum, generation = self.read_object_header(self.stream)
            if (
                idnum != indirect_reference.idnum
                or generation != indirect_reference.generation
            ):
                raise PdfReadError("Not matching, we parse the file for it")
        except Exception:
            # The recorded offset is unusable: scan the whole file for the
            # object header and repair the xref entry when it is found.
            if hasattr(self.stream, "getbuffer"):
                buf = bytes(self.stream.getbuffer())
            else:
                p = self.stream.tell()
                self.stream.seek(0, 0)
                buf = self.stream.read(-1)
                self.stream.seek(p, 0)
            m = re.search(
                rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
                buf,
            )
            if m is not None:
                logger_warning(
                    f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired",
                    __name__,
                )
                # +1 skips the leading whitespace matched by ``\s``.
                self.xref[indirect_reference.generation][
                    indirect_reference.idnum
                ] = (m.start(0) + 1)
                self.stream.seek(m.start(0) + 1)
                idnum, generation = self.read_object_header(self.stream)
            else:
                idnum = -1
                generation = -1  # exception will be raised below
        if idnum != indirect_reference.idnum and self.xref_index:
            # xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise PdfReadError(
                    f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                    f"does not match actual ({idnum} {generation}); "
                    "xref table not zero-indexed."
                )
            # xref table is corrected in non-strict mode
        elif idnum != indirect_reference.idnum and self.strict:
            # some other problem
            raise PdfReadError(
                f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                f"does not match actual ({idnum} {generation})."
            )
        if self.strict:
            assert generation == indirect_reference.generation

        current_object = (indirect_reference.idnum, indirect_reference.generation)
        if current_object in self._known_objects:
            raise PdfReadError(f"Detected loop with self reference for {indirect_reference!r}.")
        self._known_objects.add(current_object)
        try:
            retval = read_object(self.stream, self)  # type: ignore
        finally:
            # Always unregister the object, even when parsing fails.
            # Otherwise a later lookup of the same id would be reported as
            # a reference loop although none exists.
            self._known_objects.discard(current_object)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self._encryption is not None:
            # if we don't have the encryption key:
            if not self._encryption.is_decrypted():
                raise FileNotDecryptedError("File has not been decrypted")
            # otherwise, decrypt here...
            retval = cast(PdfObject, retval)
            retval = self._encryption.decrypt_object(
                retval, indirect_reference.idnum, indirect_reference.generation
            )
    else:
        # The object is not in any xref structure: scan the file for it.
        if hasattr(self.stream, "getbuffer"):
            buf = bytes(self.stream.getbuffer())
        else:
            p = self.stream.tell()
            self.stream.seek(0, 0)
            buf = self.stream.read(-1)
            self.stream.seek(p, 0)
        m = re.search(
            rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
            buf,
        )
        if m is not None:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} found",
                __name__,
            )
            if indirect_reference.generation not in self.xref:
                self.xref[indirect_reference.generation] = {}
            self.xref[indirect_reference.generation][indirect_reference.idnum] = (
                m.start(0) + 1
            )
            self.stream.seek(m.end(0) + 1)
            skip_over_whitespace(self.stream)
            self.stream.seek(-1, 1)
            retval = read_object(self.stream, self)  # type: ignore

            # override encryption is used for the /Encrypt dictionary
            if not self._override_encryption and self._encryption is not None:
                # if we don't have the encryption key:
                if not self._encryption.is_decrypted():
                    raise FileNotDecryptedError("File has not been decrypted")
                # otherwise, decrypt here...
                retval = cast(PdfObject, retval)
                retval = self._encryption.decrypt_object(
                    retval, indirect_reference.idnum, indirect_reference.generation
                )
        else:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.",
                __name__,
            )
            if self.strict:
                raise PdfReadError("Could not find object.")
    self.cache_indirect_object(
        indirect_reference.generation, indirect_reference.idnum, retval
    )
    return retval

524 

def read_object_header(self, stream: StreamType) -> tuple[int, int]:
    """
    Parse an ``<id> <generation> obj`` header at the current position.

    Args:
        stream: Stream positioned at (or just before) the object header.

    Returns:
        ``(idnum, generation)`` converted to ``int``.

    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality some files have stupid cross-reference
    # tables that are off by whitespace bytes.
    skip_over_comment(stream)
    superfluous = skip_over_whitespace(stream)
    stream.seek(-1, os.SEEK_CUR)

    raw_idnum = read_until_whitespace(stream)
    superfluous = superfluous | skip_over_whitespace(stream)
    stream.seek(-1, os.SEEK_CUR)

    raw_generation = read_until_whitespace(stream)
    superfluous = superfluous | skip_over_whitespace(stream)
    stream.seek(-1, os.SEEK_CUR)

    # although it's not used, it might still be necessary to read
    stream.read(3)

    read_non_whitespace(stream)
    stream.seek(-1, os.SEEK_CUR)

    if superfluous and self.strict:
        logger_warning(
            f"Superfluous whitespace found in object header {raw_idnum} {raw_generation}",  # type: ignore
            __name__,
        )
    return int(raw_idnum), int(raw_generation)

551 

def cache_get_indirect_object(
    self, generation: int, idnum: int
) -> Optional[PdfObject]:
    """
    Look up ``(generation, idnum)`` in ``resolved_objects``.

    Returns:
        The cached object, or ``None`` when nothing is cached for the key.

    Raises:
        PdfReadError: If the lookup raises ``RecursionError``.

    """
    try:
        cached = self.resolved_objects.get((generation, idnum))
    except RecursionError:
        raise PdfReadError("Maximum recursion depth reached.")
    return cached

559 

def cache_indirect_object(
    self, generation: int, idnum: int, obj: Optional[PdfObject]
) -> Optional[PdfObject]:
    """
    Store ``obj`` in ``resolved_objects`` under ``(generation, idnum)``.

    A non-``None`` object also gets its ``indirect_reference`` set to a
    new ``IndirectObject`` for that id/generation.

    Returns:
        ``obj``, unchanged apart from ``indirect_reference``.

    Raises:
        PdfReadError: In strict mode, if the key is already cached.

    """
    cache_key = (generation, idnum)
    if cache_key in self.resolved_objects:
        msg = f"Overwriting cache for {generation} {idnum}"
        if self.strict:
            raise PdfReadError(msg)
        logger_warning(msg, __name__)

    self.resolved_objects[cache_key] = obj
    if obj is None:
        return obj
    obj.indirect_reference = IndirectObject(idnum, generation, self)
    return obj

572 

def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Replace an already cached object with ``obj``.

    Raises:
        ValueError: If ``indirect`` belongs to another document or is not
            present in ``resolved_objects``.

    """
    # function reserved for future development
    if indirect.pdf != self:
        raise ValueError("Cannot update PdfReader with external object")

    cache_key = (indirect.generation, indirect.idnum)
    if cache_key not in self.resolved_objects:
        raise ValueError("Cannot find referenced object")

    self.resolved_objects[cache_key] = obj
    obj.indirect_reference = indirect
    return obj

582 

def read(self, stream: StreamType) -> None:
    """
    Read and process the PDF stream, extracting necessary data.

    Validates the header, locates the EOF marker and ``startxref``, loads
    every cross-reference table/trailer and, in non-strict mode, repairs
    or drops xref entries that do not point at an object header.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: In strict mode, if the xref table is reported broken.

    """
    self._basic_validation(stream)
    self._find_eof_marker(stream)
    startxref = self._find_startxref_pos(stream)
    self._startxref = startxref

    # check and eventually correct the startxref only if not strict
    xref_issue_nr = self._get_xref_issues(stream, startxref)
    if xref_issue_nr != 0:
        if self.strict and xref_issue_nr:
            raise PdfReadError("Broken xref table")
        logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__)

    # read all cross-reference tables and their trailers
    self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr)

    # if not zero-indexed, verify that the table is correct; change it if necessary
    if self.xref_index and not self.strict:
        saved_position = stream.tell()
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            # ensure ascending to prevent damage
            ascending_ids = sorted(xref_entry)
            for obj_id in ascending_ids:
                stream.seek(xref_entry[obj_id], 0)
                try:
                    pid, _ = self.read_object_header(stream)
                except ValueError:
                    self._rebuild_xref_table(stream)
                    break
                if pid == obj_id - self.xref_index:
                    # fixing index item per item is required for revised PDF.
                    self.xref[gen][pid] = self.xref[gen][obj_id]
                    del self.xref[gen][obj_id]
                # if not, then either it's just plain wrong, or the
                # non-zero-index is actually correct
        stream.seek(saved_position, 0)  # return to where it was

    # remove wrong objects (not pointing to correct structures) - cf #2326
    if not self.strict:
        saved_position = stream.tell()
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            # Iterate over a snapshot: entries are deleted while looping.
            for obj_id in list(xref_entry):
                stream.seek(xref_entry[obj_id], 0)
                try:
                    self.read_object_header(stream)
                except ValueError:
                    logger_warning(
                        f"Ignoring wrong pointing object {obj_id} {gen} (offset {xref_entry[obj_id]})",
                        __name__,
                    )
                    del xref_entry[obj_id]  # we can delete the id, we are parsing ids
        stream.seek(saved_position, 0)  # return to where it was

648 

def _basic_validation(self, stream: StreamType) -> None:
    """
    Ensure the stream is valid and not empty.

    Reads the first five bytes, checks them against ``b"%PDF-"`` and leaves
    the stream positioned at its end.

    Args:
        stream: The PDF file stream.

    Raises:
        UnsupportedOperation: If reading the header raises
            ``UnicodeDecodeError``.
        EmptyFileError: If the stream yields no data.
        PdfReadError: In strict mode, if the header is not ``%PDF-``.

    """
    stream.seek(0, os.SEEK_SET)
    try:
        header_byte = stream.read(5)
    except UnicodeDecodeError:
        raise UnsupportedOperation("cannot read header")
    if header_byte == b"":
        raise EmptyFileError("Cannot read an empty file")
    if header_byte != b"%PDF-":
        if self.strict:
            # A bad header may contain arbitrary bytes. Decode leniently so
            # the caller gets the intended PdfReadError, not a
            # UnicodeDecodeError raised while building the message.
            shown_header = header_byte.decode("utf8", "backslashreplace")
            raise PdfReadError(
                f"PDF starts with '{shown_header}', "
                "but '%PDF-' expected"
            )
        logger_warning(f"invalid pdf header: {header_byte}", __name__)
    stream.seek(0, os.SEEK_END)

666 

def _find_eof_marker(self, stream: StreamType) -> None:
    """
    Jump to the %%EOF marker.

    According to the specs, the %%EOF marker should be at the very end of
    the file. Hence for standard-compliant PDF documents this function will
    read only the last part (DEFAULT_BUFFER_SIZE).

    Raises:
        PdfReadError: In strict mode, if the search reaches the header
            region without finding the marker.
    """
    HEADER_SIZE = 8  # to parse whole file, Header is e.g. '%PDF-1.6'
    truncated_markers = (b"%%EO", b"%%E", b"%%", b"%")

    current_line = b""
    awaiting_first_line = True
    while not current_line.startswith(b"%%EOF"):
        if awaiting_first_line and current_line != b"":
            if current_line.strip().endswith(truncated_markers):
                # Consider the file as truncated while
                # having enough confidence to carry on.
                logger_warning("EOF marker seems truncated", __name__)
                break
            awaiting_first_line = False
            if b"startxref" in current_line:
                logger_warning(
                    "CAUTION: startxref found while searching for %%EOF. "
                    "The file might be truncated and some data might not be read.",
                    __name__,
                )
        if stream.tell() < HEADER_SIZE:
            if self.strict:
                raise PdfReadError("EOF marker not found")
            logger_warning("EOF marker not found", __name__)
        current_line = read_previous_line(stream)

699 

def _find_startxref_pos(self, stream: StreamType) -> int:
    """
    Find startxref entry - the location of the xref table.

    Reads backwards from the current position: first the offset line, then
    the ``startxref`` keyword line. An offset on the same line as the
    keyword is accepted with a warning.

    Args:
        stream: The PDF file stream, positioned after the offset line.

    Returns:
        The bytes offset

    Raises:
        PdfReadError: If the ``startxref`` keyword is not found.

    """
    offset_line = read_previous_line(stream)
    try:
        xref_offset = int(offset_line)
    except ValueError:
        # 'startxref' may be on the same line as the location
        if not offset_line.startswith(b"startxref"):
            raise PdfReadError("startxref not found")
        xref_offset = int(offset_line[9:].strip())
        logger_warning("startxref on same line as offset", __name__)
    else:
        keyword_line = read_previous_line(stream)
        if not keyword_line.startswith(b"startxref"):
            raise PdfReadError("startxref not found")
    return xref_offset

725 

def _read_standard_xref_table(self, stream: StreamType) -> None:
    """
    Read a classic cross-reference table into ``xref`` / ``xref_free_entry``.

    The stream must be positioned just after the leading ``x`` of the
    ``xref`` keyword. All subsections are read until the ``trailer``
    keyword has been consumed.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: If the keyword is not ``xref`` or an entry line is
            empty.

    """
    # standard cross-reference table
    ref = stream.read(3)
    if ref != b"ref":
        raise PdfReadError("xref table read error")
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    first_time = True  # check if the first time looking at the xref table
    while True:
        num = cast(int, read_object(stream, self))
        if first_time and num != 0:
            self.xref_index = num
            if self.strict:
                logger_warning(
                    "Xref table not zero-indexed. ID numbers for objects will be corrected.",
                    __name__,
                )
                # if table not zero indexed, could be due to error from when PDF was created
                # which will lead to mismatched indices later on, only warned and corrected if self.strict==True
        first_time = False
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        size = cast(int, read_object(stream, self))
        if not isinstance(size, int):
            logger_warning(
                "Invalid/Truncated xref table. Rebuilding it.",
                __name__,
            )
            self._rebuild_xref_table(stream)
            stream.read()
            return
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        cnt = 0
        while cnt < size:
            line = stream.read(20)
            if not line:
                raise PdfReadError("Unexpected empty line in Xref table.")

            # It's very clear in section 3.4.3 of the PDF spec
            # that all cross-reference table lines are a fixed
            # 20 bytes (as of PDF 1.7). However, some files have
            # 21-byte entries (or more) due to the use of \r\n
            # (CRLF) EOL's. Detect that case, and adjust the line
            # until it does not begin with a \r (CR) or \n (LF).
            while line[0] in b"\x0D\x0A":
                stream.seek(-20 + 1, 1)
                line = stream.read(20)

            # On the other hand, some malformed PDF files
            # use a single character EOL without a preceding
            # space. Detect that case, and seek the stream
            # back one character (0-9 means we've bled into
            # the next xref entry, t means we've bled into the
            # text "trailer"):
            if line[-1] in b"0123456789t":
                stream.seek(-1, 1)

            # Taken before the fallible parsing below so that a malformed
            # entry can neither leave this name unbound nor silently reuse
            # the type flag of the previous row.
            entry_type_b = line[17:18]
            try:
                offset_b, generation_b = line[:16].split(b" ")

                offset, generation = int(offset_b), int(generation_b)
            except Exception:
                # Malformed entry: look for the object header in the file.
                if hasattr(stream, "getbuffer"):
                    buf = bytes(stream.getbuffer())
                else:
                    p = stream.tell()
                    stream.seek(0, 0)
                    buf = stream.read(-1)
                    stream.seek(p)

                f = re.search(rf"{num}\s+(\d+)\s+obj".encode(), buf)
                if f is None:
                    logger_warning(
                        f"entry {num} in Xref table invalid; object not found",
                        __name__,
                    )
                    generation = 65535
                    offset = -1
                else:
                    logger_warning(
                        f"entry {num} in Xref table invalid but object found",
                        __name__,
                    )
                    generation = int(f.group(1))
                    offset = f.start()

            if generation not in self.xref:
                self.xref[generation] = {}
                self.xref_free_entry[generation] = {}
            if num in self.xref[generation]:
                # It really seems like we should allow the last
                # xref table in the file to override previous
                # ones. Since we read the file backwards, assume
                # any existing key is already set correctly.
                pass
            else:
                if entry_type_b == b"n":
                    self.xref[generation][num] = offset
                # Best effort: the per-generation maps may be missing.
                try:
                    self.xref_free_entry[generation][num] = entry_type_b == b"f"
                except Exception:
                    pass
                try:
                    self.xref_free_entry[65535][num] = entry_type_b == b"f"
                except Exception:
                    pass
            cnt += 1
            num += 1
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        trailer_tag = stream.read(7)
        if trailer_tag != b"trailer":
            # more xrefs!
            stream.seek(-7, 1)
        else:
            break

844 

def _read_xref_tables_and_trailers(
    self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
    """
    Read the cross-reference tables and trailers in the PDF stream.

    Resets ``xref``, ``xref_free_entry``, ``xref_objStm`` and ``trailer``,
    then loops from ``startxref``, moving to the next offset returned by
    the section reader, until one returns ``None`` or a branch breaks out.

    Args:
        stream: The PDF file stream.
        startxref: Offset of the first cross-reference section to read.
        xref_issue_nr: Non-zero when a problem with ``startxref`` was
            detected; triggers a rebuild of the xref table.

    Raises:
        PdfReadError: If a cross-reference stream cannot be read and no
            "/Root" has been collected in the trailer yet.

    """
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while startxref is not None:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        # Tolerate one leading end-of-line byte at the offset.
        if x in b"\r\n":
            x = stream.read(1)
        if x == b"x":
            # classic "xref" table
            startxref = self._read_xref(stream)
        elif xref_issue_nr:
            try:
                self._rebuild_xref_table(stream)
                break
            except Exception:
                # Rebuild failed: clear the flag so the next iteration
                # takes one of the other branches.
                xref_issue_nr = 0
        elif x.isdigit():
            # An object header here is read as a cross-reference stream.
            try:
                xrefstream = self._read_pdf15_xref_stream(stream)
            except Exception as e:
                if TK.ROOT in self.trailer:
                    logger_warning(
                        f"Previous trailer cannot be read: {e.args}", __name__
                    )
                    break
                raise PdfReadError(f"Trailer cannot be read: {e!s}")
            self._process_xref_stream(xrefstream)
            if "/Prev" in xrefstream:
                startxref = cast(int, xrefstream["/Prev"])
            else:
                break
        else:
            startxref = self._read_xref_other_error(stream, startxref)

884 

def _process_xref_stream(self, xrefstream: DictionaryObject) -> None:
    """
    Process and handle the xref stream.

    Copies the trailer keys that ``self.trailer`` does not have yet and,
    when "/XRefStm" is present, reads that additional cross-reference
    stream while preserving the position of ``self.stream``.
    """
    for key in (TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE):
        if key not in xrefstream or key in self.trailer:
            continue
        self.trailer[NameObject(key)] = xrefstream.raw_get(key)

    if "/XRefStm" not in xrefstream:
        return
    saved_position = self.stream.tell()
    self.stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0)
    self._read_pdf15_xref_stream(self.stream)
    self.stream.seek(saved_position, 0)

896 

def _read_xref(self, stream: StreamType) -> Optional[int]:
    """
    Read a classic cross-reference table and the trailer that follows it.

    Trailer entries are added to ``self.trailer`` unless the key is already
    present. If the trailer has an ``/XRefStm`` entry, the cross-reference
    stream at that offset is read as well; a failure there is only logged.

    Args:
        stream: The stream to read from.

    Returns:
        The ``/Prev`` value of the trailer, or ``None`` if there is none or
        the stream ends right after the table.

    """
    self._read_standard_xref_table(stream)
    at_end_of_stream = stream.read(1) == b""
    if at_end_of_stream:
        return None
    stream.seek(-1, 1)
    # Skip the whitespace before the trailer dictionary, then step back onto
    # its first character.
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    trailer_dict = cast(dict[str, Any], read_object(stream, self))
    for name, entry in trailer_dict.items():
        if name not in self.trailer:
            self.trailer[name] = entry

    if "/XRefStm" in trailer_dict:
        resume_at = stream.tell()
        # One byte past the offset: _read_pdf15_xref_stream starts by
        # stepping back one byte.
        stream.seek(cast(int, trailer_dict["/XRefStm"]) + 1, 0)
        try:
            self._read_pdf15_xref_stream(stream)
        except Exception:
            logger_warning(
                f"XRef object at {trailer_dict['/XRefStm']} can not be read, some object may be missing",
                __name__,
            )
        stream.seek(resume_at, 0)

    return trailer_dict["/Prev"] if "/Prev" in trailer_dict else None

922 

923 def _read_xref_other_error( 

924 self, stream: StreamType, startxref: int 

925 ) -> Optional[int]: 

926 # some PDFs have /Prev=0 in the trailer, instead of no /Prev 

927 if startxref == 0: 

928 if self.strict: 

929 raise PdfReadError( 

930 "/Prev=0 in the trailer (try opening with strict=False)" 

931 ) 

932 logger_warning( 

933 "/Prev=0 in the trailer - assuming there is no previous xref table", 

934 __name__, 

935 ) 

936 return None 

937 # bad xref character at startxref. Let's see if we can find 

938 # the xref table nearby, as we've observed this error with an 

939 # off-by-one before. 

940 stream.seek(-11, 1) 

941 tmp = stream.read(20) 

942 xref_loc = tmp.find(b"xref") 

943 if xref_loc != -1: 

944 startxref -= 10 - xref_loc 

945 return startxref 

946 # No explicit xref table, try finding a cross-reference stream. 

947 stream.seek(startxref, 0) 

948 for look in range(25): # value extended to cope with more linearized files 

949 if stream.read(1).isdigit(): 

950 # This is not a standard PDF, consider adding a warning 

951 startxref += look 

952 return startxref 

953 # no xref table found at specified location 

954 if "/Root" in self.trailer and not self.strict: 

955 # if Root has been already found, just raise warning 

956 logger_warning("Invalid parent xref., rebuild xref", __name__) 

957 try: 

958 self._rebuild_xref_table(stream) 

959 return None 

960 except Exception: 

961 raise PdfReadError("Cannot rebuild xref") 

962 raise PdfReadError("Could not find xref table at specified location") 

963 

def _read_pdf15_xref_stream(
    self, stream: StreamType
) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]:
    """
    Read the cross-reference stream for PDF 1.5+.

    The stream must be positioned one byte past the start of the object
    header; this method steps back one byte before reading it.

    Args:
        stream: The stream to read from.

    Returns:
        The cross-reference stream object that was read.

    Raises:
        PdfReadError: If the object is not of type ``/XRef``, if ``/Size`` is
            missing, if ``/W`` is missing or has fewer than three entries, or
            (strict mode only) if ``/W`` has more than three entries.

    """
    stream.seek(-1, 1)
    idnum, generation = self.read_object_header(stream)
    xrefstream = cast(ContentStream, read_object(stream, self))
    if cast(str, xrefstream["/Type"]) != "/XRef":
        raise PdfReadError(f"Unexpected type {xrefstream['/Type']!r}")
    self.cache_indirect_object(generation, idnum, xrefstream)

    # Index pairs specify the subsections in the dictionary.
    # If none, create one subsection that spans everything.
    if "/Size" not in xrefstream:
        # According to table 17 of the PDF 2.0 specification, this key is required.
        raise PdfReadError(f"Size missing from XRef stream {xrefstream!r}!")
    idx_pairs = xrefstream.get("/Index", [0, xrefstream["/Size"]])

    entry_sizes = cast(dict[Any, Any], xrefstream.get("/W"))
    # Validate explicitly: an ``assert`` is removed under ``python -O`` and a
    # missing /W would otherwise surface as a TypeError from ``len(None)``.
    if entry_sizes is None or len(entry_sizes) < 3:
        raise PdfReadError(f"Invalid /W entry in XRef stream: {entry_sizes!r}")
    if self.strict and len(entry_sizes) > 3:
        raise PdfReadError(f"Too many entry sizes: {entry_sizes}")

    stream_data = BytesIO(xrefstream.get_data())

    def get_entry(i: int) -> Union[int, tuple[int, ...]]:
        # Reads the correct number of bytes for each entry. See the
        # discussion of the W parameter in PDF spec table 17.
        if entry_sizes[i] > 0:
            d = stream_data.read(entry_sizes[i])
            return convert_to_int(d, entry_sizes[i])

        # PDF Spec Table 17: A value of zero for an element in the
        # W array indicates...the default value shall be used
        if i == 0:
            return 1  # First value defaults to 1
        return 0

    def used_before(num: int, generation: Union[int, tuple[int, ...]]) -> bool:
        # We move backwards through the xrefs, don't replace any.
        return num in self.xref.get(generation, []) or num in self.xref_objStm  # type: ignore

    # Iterate through each subsection
    self._read_xref_subsections(idx_pairs, get_entry, used_before)
    return xrefstream

1009 

1010 @staticmethod 

1011 def _get_xref_issues(stream: StreamType, startxref: int) -> int: 

1012 """ 

1013 Return an int which indicates an issue. 0 means there is no issue. 

1014 

1015 Args: 

1016 stream: 

1017 startxref: 

1018 

1019 Returns: 

1020 0 means no issue, other values represent specific issues. 

1021 

1022 """ 

1023 if startxref == 0: 

1024 return 4 

1025 

1026 stream.seek(startxref - 1, 0) # -1 to check character before 

1027 line = stream.read(1) 

1028 if line == b"j": 

1029 line = stream.read(1) 

1030 if line not in b"\r\n \t": 

1031 return 1 

1032 line = stream.read(4) 

1033 if line != b"xref": 

1034 # not a xref so check if it is an XREF object 

1035 line = b"" 

1036 while line in b"0123456789 \t": 

1037 line = stream.read(1) 

1038 if line == b"": 

1039 return 2 

1040 line += stream.read(2) # 1 char already read, +2 to check "obj" 

1041 if line.lower() != b"obj": 

1042 return 3 

1043 return 0 

1044 

def _rebuild_xref_table(self, stream: StreamType) -> None:
    """
    Rebuild the xref data by scanning the whole stream.

    Three passes are made: object headers (``<id> <gen> obj``) are located
    to fill ``self.xref``, each located object is parsed to index the
    content of object streams into ``self.xref_objStm``, and every
    ``trailer`` dictionary is merged into ``self.trailer``.

    Args:
        stream: The stream to scan.

    """
    self.xref = {}
    stream.seek(0, 0)
    raw = stream.read(-1)

    header_pattern = rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"
    for match in re.finditer(header_pattern, raw):
        idnum = int(match.group(1))
        generation = int(match.group(2))
        self.xref.setdefault(generation, {})[idnum] = match.start(1)

    logger_warning("parsing for Object Streams", __name__)
    for generation, offsets in self.xref.items():
        for idnum, offset in offsets.items():
            # get_object in manual
            stream.seek(offset, 0)
            try:
                _ = self.read_object_header(stream)
                obj = cast(StreamObject, read_object(stream, self))
                if obj.get("/Type", "") != "/ObjStm":
                    continue
                objstm_data = BytesIO(obj.get_data())
                found = 0
                # The data starts with "<object number> <offset>" pairs;
                # stop at the first token that is not a number.
                while True:
                    token = read_until_whitespace(objstm_data)
                    if not token.isdigit():
                        break
                    contained_idnum = int(token)
                    skip_over_whitespace(objstm_data)
                    objstm_data.seek(-1, 1)
                    token = read_until_whitespace(objstm_data)
                    if not token.isdigit():  # pragma: no cover
                        break  # pragma: no cover
                    contained_offset = int(token)
                    self.xref_objStm[contained_idnum] = (idnum, contained_offset)
                    found += 1
                if found != obj.get("/N"):  # pragma: no cover
                    logger_warning(  # pragma: no cover
                        f"found {found} objects within Object({idnum},{generation})"
                        f" whereas {obj.get('/N')} expected",
                        __name__,
                    )
            except Exception:  # could be multiple causes
                pass

    stream.seek(0, 0)
    trailer_pattern = rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)"
    for match in re.finditer(trailer_pattern, raw):
        stream.seek(match.start(1), 0)
        trailer_dict = cast(dict[Any, Any], read_object(stream, self))
        # Here, we are parsing the file from start to end, the new data have to erase the existing.
        for name, entry in list(trailer_dict.items()):
            self.trailer[name] = entry

1098 

1099 def _read_xref_subsections( 

1100 self, 

1101 idx_pairs: list[int], 

1102 get_entry: Callable[[int], Union[int, tuple[int, ...]]], 

1103 used_before: Callable[[int, Union[int, tuple[int, ...]]], bool], 

1104 ) -> None: 

1105 """Read and process the subsections of the xref.""" 

1106 for start, size in self._pairs(idx_pairs): 

1107 # The subsections must increase 

1108 for num in range(start, start + size): 

1109 # The first entry is the type 

1110 xref_type = get_entry(0) 

1111 # The rest of the elements depend on the xref_type 

1112 if xref_type == 0: 

1113 # linked list of free objects 

1114 next_free_object = get_entry(1) # noqa: F841 

1115 next_generation = get_entry(2) # noqa: F841 

1116 elif xref_type == 1: 

1117 # objects that are in use but are not compressed 

1118 byte_offset = get_entry(1) 

1119 generation = get_entry(2) 

1120 if generation not in self.xref: 

1121 self.xref[generation] = {} # type: ignore 

1122 if not used_before(num, generation): 

1123 self.xref[generation][num] = byte_offset # type: ignore 

1124 elif xref_type == 2: 

1125 # compressed objects 

1126 objstr_num = get_entry(1) 

1127 obstr_idx = get_entry(2) 

1128 generation = 0 # PDF spec table 18, generation is 0 

1129 if not used_before(num, generation): 

1130 self.xref_objStm[num] = (objstr_num, obstr_idx) 

1131 elif self.strict: 

1132 raise PdfReadError(f"Unknown xref type: {xref_type}") 

1133 

1134 def _pairs(self, array: list[int]) -> Iterable[tuple[int, int]]: 

1135 """Iterate over pairs in the array.""" 

1136 i = 0 

1137 while i + 1 < len(array): 

1138 yield array[i], array[i + 1] 

1139 i += 2 

1140 

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
    """
    Decrypt a file secured with the PDF Standard encryption handler.

    The given password is checked against the document's user password and
    owner password; the resulting decryption key is stored if either of
    them matches.

    It does not matter which password was matched. Both passwords provide
    the correct decryption key that will allow the document to be used with
    this library.

    Args:
        password: The password to match.

    Returns:
        An indicator if the document was decrypted and whether it was the
        owner password or the user password.

    Raises:
        PdfReadError: If the file is not encrypted.

    """
    encryption = self._encryption
    if encryption:
        # TODO: raise Exception for wrong password
        return encryption.verify(password)
    raise PdfReadError("Not encrypted file")

1165 

@property
def is_encrypted(self) -> bool:
    """
    Read-only boolean property showing whether this PDF file is encrypted.

    The value reflects the presence of the encryption entry in the trailer.

    Note that this property, if true, will remain true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
    """
    has_encrypt_entry = TK.ENCRYPT in self.trailer
    return has_encrypt_entry

1175 

def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Add a top level form that groups all form fields below it.

    The existing ``/Fields`` array becomes the ``/Kids`` of the new field,
    and every kid gets the new field as its ``/Parent``.

    Args:
        name: text string of the "/T" Attribute of the created object

    Returns:
        The created object. ``None`` means no object was created.

    """
    root = self.root_object

    if "/AcroForm" not in root:
        return None
    if not isinstance(root["/AcroForm"], DictionaryObject):
        return None
    form = cast(DictionaryObject, root[NameObject("/AcroForm")])
    if "/Fields" not in form:
        # TODO: No error but this may be extended for XFA Forms
        return None

    top_field = DictionaryObject()
    top_field[NameObject("/T")] = TextStringObject(name)
    top_field[NameObject("/Kids")] = form[NameObject("/Fields")]
    # Register the new field under the next id after the highest
    # generation-0 object resolved so far.
    next_idnum = max(i for (g, i) in self.resolved_objects if g == 0) + 1
    self.cache_indirect_object(0, next_idnum, top_field)

    fields = ArrayObject()
    fields.append(top_field.indirect_reference)
    form[NameObject("/Fields")] = fields

    for kid_reference in cast(ArrayObject, top_field["/Kids"]):
        kid = kid_reference.get_object()
        if "/Parent" in kid:
            logger_warning(
                f"Top Level Form Field {kid.indirect_reference} have a non-expected parent",
                __name__,
            )
        kid[NameObject("/Parent")] = top_field.indirect_reference
    return top_field

1218 

def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Rename the top level form field that groups all form fields below it.

    The first entry of the ``/Fields`` array is the one that gets renamed.

    Args:
        name: text string of the "/T" field of the renamed object

    Returns:
        The modified object. ``None`` means no object was modified.

    """
    root = self.root_object

    if "/AcroForm" not in root:
        return None
    if not isinstance(root["/AcroForm"], DictionaryObject):
        return None
    form = cast(DictionaryObject, root[NameObject("/AcroForm")])
    if "/Fields" not in form:
        return None

    fields = cast(ArrayObject, form[NameObject("/Fields")])
    top_field = cast(DictionaryObject, fields[0].get_object())
    top_field[NameObject("/T")] = TextStringObject(name)
    return top_field

1246 

1247 def _repr_mimebundle_( 

1248 self, 

1249 include: Union[None, Iterable[str]] = None, 

1250 exclude: Union[None, Iterable[str]] = None, 

1251 ) -> dict[str, Any]: 

1252 """ 

1253 Integration into Jupyter Notebooks. 

1254 

1255 This method returns a dictionary that maps a mime-type to its 

1256 representation. 

1257 

1258 .. seealso:: 

1259 

1260 https://ipython.readthedocs.io/en/stable/config/integrating.html 

1261 """ 

1262 self.stream.seek(0) 

1263 pdf_data = self.stream.read() 

1264 data = { 

1265 "application/pdf": pdf_data, 

1266 } 

1267 

1268 if include is not None: 

1269 # Filter representations based on include list 

1270 data = {k: v for k, v in data.items() if k in include} 

1271 

1272 if exclude is not None: 

1273 # Remove representations based on exclude list 

1274 data = {k: v for k, v in data.items() if k not in exclude} 

1275 

1276 return data