Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_reader.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

789 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import os 

31import re 

32import sys 

33from collections.abc import Iterable 

34from io import BytesIO, UnsupportedOperation 

35from pathlib import Path 

36from types import TracebackType 

37from typing import ( 

38 TYPE_CHECKING, 

39 Any, 

40 Callable, 

41 Optional, 

42 Union, 

43 cast, 

44) 

45 

46if sys.version_info >= (3, 11): 

47 from typing import Self 

48else: 

49 from typing_extensions import Self 

50 

51from ._doc_common import PdfDocCommon, convert_to_int 

52from ._encryption import Encryption, PasswordType 

53from ._utils import ( 

54 WHITESPACES_AS_BYTES, 

55 StrByteType, 

56 StreamType, 

57 logger_warning, 

58 read_non_whitespace, 

59 read_previous_line, 

60 read_until_whitespace, 

61 skip_over_comment, 

62 skip_over_whitespace, 

63) 

64from .constants import TrailerKeys as TK 

65from .errors import ( 

66 EmptyFileError, 

67 FileNotDecryptedError, 

68 LimitReachedError, 

69 PdfReadError, 

70 PdfStreamError, 

71 WrongPasswordError, 

72) 

73from .generic import ( 

74 ArrayObject, 

75 ContentStream, 

76 DecodedStreamObject, 

77 DictionaryObject, 

78 EncodedStreamObject, 

79 IndirectObject, 

80 NameObject, 

81 NullObject, 

82 NumberObject, 

83 PdfObject, 

84 StreamObject, 

85 TextStringObject, 

86 is_null_or_none, 

87 read_object, 

88) 

89from .xmp import XmpInformation 

90 

91if TYPE_CHECKING: 

92 from ._page import PageObject 

93 

94 

95class PdfReader(PdfDocCommon): 

96 """ 

97 Initialize a PdfReader object. 

98 

99 This operation can take some time, as the PDF stream's cross-reference 

100 tables are read into memory. 

101 

102 Args: 

103 stream: A File object or an object that supports the standard read 

104 and seek methods similar to a File object. Could also be a 

105 string representing a path to a PDF file. 

106 strict: Determines whether user should be warned of all 

107 problems and also causes some correctable problems to be fatal. 

108 Defaults to ``False``. 

109 password: Decrypt PDF file at initialization. If the 

110 password is None, the file will not be decrypted. 

111 Defaults to ``None``. 

112 root_object_recovery_limit: The maximum number of objects to query 

113 for recovering the Root object in non-strict mode. To disable 

114 this security measure, pass ``None``. 

115 

116 """ 

117 

def __init__(
    self,
    stream: Union[StrByteType, Path],
    strict: bool = False,
    password: Union[None, str, bytes] = None,
    *,
    root_object_recovery_limit: Optional[int] = 10_000,
) -> None:
    """
    Set up the reader state, parse ``stream`` and handle encryption.

    Args:
        stream: File-like object or path of the PDF document.
        strict: Whether correctable problems are treated as fatal.
        password: Password used to decrypt the document; ``None`` means
            no decryption is attempted with a user-supplied password.
        root_object_recovery_limit: Maximum number of objects inspected
            when searching for the Root object; ``None`` disables the limit.

    Raises:
        PdfReadError: If a password is given for a file that is not encrypted.

    """
    self.strict = strict
    self.flattened_pages: Optional[list[PageObject]] = None

    #: Storage of parsed PDF objects.
    self.resolved_objects: dict[tuple[Any, Any], Optional[PdfObject]] = {}

    self._startxref: int = 0
    self.xref_index = 0
    self.xref: dict[int, dict[Any, Any]] = {}
    self.xref_free_entry: dict[int, dict[Any, Any]] = {}
    self.xref_objStm: dict[int, tuple[Any, Any]] = {}
    self.trailer = DictionaryObject()

    # Security parameters.
    self._root_object_recovery_limit = (
        root_object_recovery_limit if isinstance(root_object_recovery_limit, int) else sys.maxsize
    )

    # Map page indirect_reference number to page number
    self._page_id2num: Optional[dict[Any, Any]] = None

    self._validated_root: Optional[DictionaryObject] = None

    # get_object() reads these three attributes. They are created before
    # the stream is parsed so that any object resolution triggered while
    # parsing finds them in place instead of raising AttributeError.
    self._known_objects: set[tuple[int, int]] = set()
    self._override_encryption = False
    self._encryption: Optional[Encryption] = None

    self._initialize_stream(stream)

    if self.is_encrypted:
        self._handle_encryption(password)
    elif password is not None:
        raise PdfReadError("Not an encrypted file")

158 

def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None:
    """
    Parse the document found in ``stream`` and keep the stream on the reader.

    A ``str`` or ``Path`` is read completely into an in-memory buffer, and
    ``_stream_opened`` records that the reader owns that buffer.

    Args:
        stream: File-like object or path of the PDF document.

    """
    opened_in_text_mode = hasattr(stream, "mode") and "b" not in stream.mode
    if opened_in_text_mode:
        logger_warning(
            "PdfReader stream/file object is not in binary mode. "
            "It may not be read correctly.",
            __name__,
        )

    self._stream_opened = False
    if isinstance(stream, (str, Path)):
        with open(stream, "rb") as file_handle:
            content = file_handle.read()
        stream = BytesIO(content)
        self._stream_opened = True

    self.read(stream)
    self.stream = stream

173 

def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None:
    """
    Read the /Encrypt dictionary and try to unlock the document.

    Args:
        password: Password to verify. When ``None``, the empty password is
            tried and a failed verification is not reported.

    Raises:
        WrongPasswordError: If an explicit password does not decrypt the file.

    """
    # Encryption data itself must be readable without decryption.
    self._override_encryption = True

    # Some documents may not have a /ID, use two empty
    # byte strings instead. Solves
    # https://github.com/py-pdf/pypdf/issues/608
    id_entry = self.trailer.get(TK.ID)
    if id_entry:
        first_id = id_entry[0].get_object().original_bytes
    else:
        first_id = b""
    encrypt_dict = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
    self._encryption = Encryption.read(encrypt_dict, first_id)

    # Fall back to the empty password when none was provided.
    password_given = password is not None
    candidate = password if password_given else b""
    outcome = self._encryption.verify(candidate)
    if outcome == PasswordType.NOT_DECRYPTED and password_given:
        # Only an explicitly supplied password is reported as wrong.
        raise WrongPasswordError("Wrong password")
    self._override_encryption = False

193 

def __enter__(self) -> Self:
    """Return the reader itself for use in a ``with`` statement."""
    return self

196 

def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc_val: Optional[BaseException],
    exc_tb: Optional[TracebackType],
) -> None:
    """Call :meth:`close` when the ``with`` block is left; exceptions are not suppressed."""
    self.close()

204 

def close(self) -> None:
    """Close the stream if opened in __init__ and clear memory."""
    # Only a stream created by the reader itself is closed here.
    if self._stream_opened:
        self.stream.close()

    # Drop the parsed state so the memory can be reclaimed.
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.resolved_objects = {}
    self.flattened_pages = []
    self.trailer = DictionaryObject()

215 

@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to "/Root". Standardized with PdfWriter.

    The trailer's /Root entry is used when it is a /Catalog. Otherwise the
    objects ``1 .. /Size`` are scanned for a /Catalog, and as a last resort
    a trailer /Root that at least contains /Pages is accepted.

    Raises:
        LimitReachedError: If the scan exceeds the recovery limit.
        PdfReadError: If no usable Root object can be found.

    """
    if self._validated_root:
        return self._validated_root

    root = self.trailer.get(TK.ROOT)
    if is_null_or_none(root):
        logger_warning('Cannot find "/Root" key in trailer', __name__)
    else:
        declared_type = cast(
            DictionaryObject, cast(PdfObject, root).get_object()
        ).get("/Type")
        if declared_type == "/Catalog":
            self._validated_root = cast(
                DictionaryObject, cast(PdfObject, root).get_object()
            )
        else:
            logger_warning("Invalid Root object in trailer", __name__)

    if self._validated_root is None:
        # Recovery: look through the numbered objects for a catalog.
        logger_warning('Searching object with "/Catalog" key', __name__)
        object_count = cast(int, self.trailer.get("/Size", 0))
        for index in range(object_count):
            if index >= self._root_object_recovery_limit:
                raise LimitReachedError("Maximum Root object recovery limit reached.")
            try:
                candidate = self.get_object(index + 1)
            except Exception:  # to be sure to capture all errors
                candidate = None
            if isinstance(candidate, DictionaryObject) and candidate.get("/Type") == "/Catalog":
                self._validated_root = candidate
                logger_warning(f"Root found at {candidate.indirect_reference!r}", __name__)
                break

    if self._validated_root is None:
        # Last resort: accept a trailer root that lacks /Type but has /Pages.
        if not is_null_or_none(root) and "/Pages" in cast(DictionaryObject, cast(PdfObject, root).get_object()):
            logger_warning(
                f"Possible root found at {cast(PdfObject, root).indirect_reference!r}, but missing /Catalog key",
                __name__
            )
            self._validated_root = cast(
                DictionaryObject, cast(PdfObject, root).get_object()
            )
        else:
            raise PdfReadError("Cannot find Root object in pdf")
    return self._validated_root

259 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfWriter.

    Returns:
        /Info Dictionary; None if the entry does not exist

    Raises:
        PdfReadError: If the entry does not resolve to a dictionary.

    """
    info_entry = self.trailer.get(TK.INFO, None)
    if is_null_or_none(info_entry):
        return None
    assert info_entry is not None, "mypy"
    resolved = info_entry.get_object()
    if isinstance(resolved, DictionaryObject):
        return resolved
    raise PdfReadError(
        "Trailer not found or does not point to a document information dictionary"
    )

279 

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to "/ID". Standardized with PdfWriter.

    Returns:
        /ID array; None if the entry does not exist

    """
    id_entry = self.trailer.get(TK.ID, None)
    if is_null_or_none(id_entry):
        return None
    assert id_entry is not None, "mypy"
    resolved = id_entry.get_object()
    return cast(ArrayObject, resolved)

294 

@property
def pdf_header(self) -> str:
    """
    The first 8 bytes of the file.

    This is typically something like ``'%PDF-1.6'`` and can be used to
    detect if the file is actually a PDF file and which version it is.
    Bytes that are not valid UTF-8 are rendered as backslash escapes, and
    the stream position is restored afterwards.
    """
    # TODO: Make this return a bytes object for consistency
    #       but that needs a deprecation
    stream = self.stream
    saved_position = stream.tell()
    stream.seek(0, 0)
    header_text = stream.read(8).decode("utf-8", "backslashreplace")
    stream.seek(saved_position, 0)  # return to where it was
    return header_text

310 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    XMP (Extensible Metadata Platform) data.

    The encryption override is switched on while the metadata is read and
    switched off again afterwards, even if reading fails.
    """
    self._override_encryption = True
    try:
        metadata = self.root_object.xmp_metadata
        return cast(XmpInformation, metadata)
    finally:
        self._override_encryption = False

319 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Retrieve the page number from an indirect reference.

    The mapping from object number to page index is built from
    ``self.pages`` on first use and then reused.

    Args:
        indirect_reference: The indirect reference to locate.

    Returns:
        Page number or None.

    """
    if self._page_id2num is None:
        lookup = {}
        for page_number, page in enumerate(self.pages):
            lookup[page.indirect_reference.idnum] = page_number  # type: ignore
        self._page_id2num = lookup

    if is_null_or_none(indirect_reference):
        return None
    assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"
    idnum = (
        indirect_reference
        if isinstance(indirect_reference, int)
        else indirect_reference.idnum
    )
    assert self._page_id2num is not None, "hint for mypy"
    return self._page_id2num.get(idnum, None)

347 

def _get_object_from_stream(
    self, indirect_reference: IndirectObject
) -> Union[int, PdfObject, str]:
    """
    Resolve an object that is stored inside an object stream (/ObjStm).

    The whole stream is parsed in one pass and every member for which this
    stream is the authoritative source is cached. This avoids O(N²)
    behaviour when many objects of the same stream are resolved one by one.

    Args:
        indirect_reference: Reference to the wanted object.

    Returns:
        The wanted object, or a ``NullObject`` if the stream index does not
        list it (non-strict mode).

    Raises:
        LimitReachedError: In strict mode, if /N is larger than the data allows.
        PdfReadError: In strict mode, if a member cannot be read or the
            wanted object is not part of the stream.

    """
    wanted_id = indirect_reference.idnum
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, _idx = self.xref_objStm[wanted_id]
    obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object()  # type: ignore
    # This is an xref to a stream, so its type better be a stream
    assert cast(str, obj_stm["/Type"]) == "/ObjStm"
    stream_data = BytesIO(obj_stm.get_data())
    n = int(obj_stm["/N"])  # type: ignore[call-overload]
    first_offset = int(obj_stm["/First"])  # type: ignore[call-overload]

    # ObjStm header format: "objnum offset objnum offset ..."
    # smallest possible entry: "0 0" = 3 bytes (1 digit + 1 space + 1 digit)
    # using // 4 would reject a valid 3-byte single entry (3 // 4 = 0)
    max_n = stream_data.getbuffer().nbytes // 3
    stream_data.seek(0)
    if n > max_n:
        if self.strict:
            raise LimitReachedError(f"Value /N {n} for object {stmnum} exceeds maximum allowed value {max_n}.")
        logger_warning(
            f"Value /N {n} for object {stmnum} exceeds maximum allowed value {max_n}. Limiting to {max_n}.",
            src=__name__
        )
        n = max_n

    def _rewind_to_token() -> None:
        # Skip whitespace, then step back onto the first non-whitespace byte.
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)

    # Phase 1: Read the index (objnum, offset) pairs from the header.
    obj_index: list[tuple[int, int]] = []
    for _ in range(n):
        _rewind_to_token()
        objnum = NumberObject.read_from_stream(stream_data)
        _rewind_to_token()
        offset = NumberObject.read_from_stream(stream_data)
        _rewind_to_token()
        obj_index.append((int(objnum), int(offset)))

    # Phase 2: Parse each object and cache it.
    target_obj: Union[int, PdfObject, str] = NullObject()
    found = False
    for position, (member_id, member_offset) in enumerate(obj_index):
        is_target = member_id == wanted_id

        # Skip objects already in the cache.
        cached = self.cache_get_indirect_object(0, member_id)
        if cached is not None:
            if is_target:
                target_obj = cached
                found = True
            continue

        stream_data.seek(first_offset + member_offset, 0)

        # To cope with case where the 'pointer' is on a white space
        _rewind_to_token()

        try:
            member = read_object(stream_data, self)
        except PdfStreamError as exc:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            logger_warning(
                f"Invalid stream (index {position}) within object "
                f"{member_id} 0: {exc}",
                __name__,
            )
            if self.strict:  # pragma: no cover
                raise PdfReadError(
                    f"Cannot read object stream: {exc}"
                )  # pragma: no cover
            member = NullObject()  # pragma: no cover

        # Only cache if this stream is the authoritative source for the object.
        # Incremental updates may override objects originally in the stream;
        # caching those stale versions would shadow the newer xref entry.
        authoritative_stm, _idx = self.xref_objStm.get(member_id, (None, None))
        if authoritative_stm == stmnum:
            self.cache_indirect_object(0, member_id, member)  # type: ignore[arg-type]

        if is_target:
            target_obj = member
            found = True

    if not found and self.strict:  # pragma: no cover
        raise PdfReadError(
            "This is a fatal error in strict mode."
        )  # pragma: no cover
    return target_obj

441 

def _find_object_header_in_file(
    self, idnum: int, generation: int
) -> "Optional[re.Match[bytes]]":
    """Search the whole file for ``<idnum> <generation> obj``; the stream position is preserved."""
    if hasattr(self.stream, "getbuffer"):
        buf = bytes(self.stream.getbuffer())
    else:
        position = self.stream.tell()
        self.stream.seek(0, 0)
        buf = self.stream.read(-1)
        self.stream.seek(position, 0)
    return re.search(rf"\s{idnum}\s+{generation}\s+obj".encode(), buf)

def _decrypt_resolved_object(
    self, obj: Optional[PdfObject], idnum: int, generation: int
) -> Optional[PdfObject]:
    """Decrypt a freshly parsed object unless encryption is absent or overridden."""
    # override encryption is used for the /Encrypt dictionary
    if self._override_encryption or self._encryption is None:
        return obj
    # if we don't have the encryption key:
    if not self._encryption.is_decrypted():
        raise FileNotDecryptedError("File has not been decrypted")
    # otherwise, decrypt here...
    return self._encryption.decrypt_object(
        cast(PdfObject, obj), idnum, generation,
        strict=self.strict,
    )

def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Resolve an indirect reference to the object it points to.

    Lookup order: the object cache, object streams, the cross-reference
    table (repairing the offset through a full-file search when the header
    found there does not match), and finally a full-file search for objects
    that are missing from the cross-reference table.

    Args:
        indirect_reference: Reference to resolve. A plain ``int`` is
            treated as object number with generation 0.

    Returns:
        The resolved object, a ``NullObject`` for free entries, or ``None``
        if the object is not defined (non-strict mode).

    Raises:
        PdfReadError: In strict mode, if the object header does not match
            the reference or the object cannot be found.
        LimitReachedError: If the object is requested again while it is
            still being parsed (self reference).
        FileNotDecryptedError: If the file is encrypted and not decrypted.

    """
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    ref_id = indirect_reference.idnum
    ref_gen = indirect_reference.generation

    retval = self.cache_get_indirect_object(ref_gen, ref_id)
    if retval is not None:
        return retval

    if ref_gen == 0 and ref_id in self.xref_objStm:
        retval = self._get_object_from_stream(indirect_reference)  # type: ignore
    elif ref_gen in self.xref and ref_id in self.xref[ref_gen]:
        if self.xref_free_entry.get(ref_gen, {}).get(ref_id, False):
            return NullObject()
        start = self.xref[ref_gen][ref_id]
        self.stream.seek(start, 0)
        try:
            idnum, generation = self.read_object_header(self.stream)
            if idnum != ref_id or generation != ref_gen:
                raise PdfReadError("Not matching, we parse the file for it")
        except Exception:
            # The xref offset is wrong: look for the header anywhere in the file.
            m = self._find_object_header_in_file(ref_id, ref_gen)
            if m is not None:
                logger_warning(
                    f"Object ID {ref_id},{ref_gen} ref repaired",
                    __name__,
                )
                # +1 skips the whitespace byte matched in front of the number.
                self.xref[ref_gen][ref_id] = m.start(0) + 1
                self.stream.seek(m.start(0) + 1)
                idnum, generation = self.read_object_header(self.stream)
            else:
                idnum = -1
                generation = -1  # exception will be raised below
        if idnum != ref_id and self.xref_index:
            # xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise PdfReadError(
                    f"Expected object ID ({ref_id} {ref_gen}) "
                    f"does not match actual ({idnum} {generation}); "
                    "xref table not zero-indexed."
                )
            # xref table is corrected in non-strict mode
        elif idnum != ref_id and self.strict:
            # some other problem
            raise PdfReadError(
                f"Expected object ID ({ref_id} {ref_gen}) "
                f"does not match actual ({idnum} {generation})."
            )
        if self.strict:
            assert generation == ref_gen

        current_object = (ref_id, ref_gen)
        if current_object in self._known_objects:
            raise LimitReachedError(f"Detected loop with self reference for {indirect_reference!r}.")
        self._known_objects.add(current_object)
        try:
            retval = read_object(self.stream, self)  # type: ignore
        finally:
            # Always release the loop guard. A failed parse must not make a
            # later lookup of the same object look like a self reference.
            self._known_objects.discard(current_object)

        retval = self._decrypt_resolved_object(retval, ref_id, ref_gen)
    else:
        # Not listed in any xref: try to find the object in the raw file.
        m = self._find_object_header_in_file(ref_id, ref_gen)
        if m is not None:
            logger_warning(
                f"Object {ref_id} {ref_gen} found",
                __name__,
            )
            if ref_gen not in self.xref:
                self.xref[ref_gen] = {}
            self.xref[ref_gen][ref_id] = m.start(0) + 1
            self.stream.seek(m.end(0) + 1)
            skip_over_whitespace(self.stream)
            self.stream.seek(-1, 1)
            retval = read_object(self.stream, self)  # type: ignore

            retval = self._decrypt_resolved_object(retval, ref_id, ref_gen)
        else:
            logger_warning(
                f"Object {ref_id} {ref_gen} not defined.",
                __name__,
            )
            if self.strict:
                raise PdfReadError("Could not find object.")
    # For ObjStm objects, _get_object_from_stream already cached
    # the result during batch parsing; skip the redundant cache write
    # to avoid "Overwriting cache" warnings. For non-ObjStm objects
    # (including encrypted ones that need decrypted values cached),
    # always write.
    if not (ref_gen == 0 and ref_id in self.xref_objStm):
        self.cache_indirect_object(ref_gen, ref_id, retval)
    return retval

593 

def read_object_header(self, stream: StreamType) -> tuple[int, int]:
    """
    Read an ``<idnum> <generation> obj`` header at the current position.

    Args:
        stream: Stream positioned at (or shortly before) the object header.

    Returns:
        The object number and the generation number.

    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality some files have stupid cross-reference
    # tables that are off by whitespace bytes.
    skip_over_comment(stream)
    superfluous = skip_over_whitespace(stream)
    stream.seek(-1, 1)
    raw_idnum = read_until_whitespace(stream)
    superfluous |= skip_over_whitespace(stream)
    stream.seek(-1, 1)
    raw_generation = read_until_whitespace(stream)
    superfluous |= skip_over_whitespace(stream)
    stream.seek(-1, 1)

    # although it's not used, it might still be necessary to read
    stream.read(3)

    read_non_whitespace(stream)
    stream.seek(-1, 1)
    if superfluous and self.strict:
        logger_warning(
            f"Superfluous whitespace found in object header {raw_idnum} {raw_generation}",  # type: ignore
            __name__,
        )
    return int(raw_idnum), int(raw_generation)

620 

def cache_get_indirect_object(
    self, generation: int, idnum: int
) -> Optional[PdfObject]:
    """
    Look up an already resolved object.

    Args:
        generation: Generation number of the object.
        idnum: Object number.

    Returns:
        The cached object, or ``None`` if nothing is cached for the key.

    Raises:
        PdfReadError: If the lookup hits the interpreter's recursion limit.

    """
    cache_key = (generation, idnum)
    try:
        cached = self.resolved_objects.get(cache_key)
    except RecursionError:
        raise PdfReadError("Maximum recursion depth reached.")
    return cached

628 

def cache_indirect_object(
    self, generation: int, idnum: int, obj: Optional[PdfObject]
) -> Optional[PdfObject]:
    """
    Store a resolved object in the cache and tag it with its reference.

    Args:
        generation: Generation number of the object.
        idnum: Object number.
        obj: The resolved object; ``None`` is stored as is.

    Returns:
        ``obj``, unchanged apart from its ``indirect_reference``.

    Raises:
        PdfReadError: In strict mode, if the key is already cached.

    """
    cache_key = (generation, idnum)
    if cache_key in self.resolved_objects:
        msg = f"Overwriting cache for {generation} {idnum}"
        if self.strict:
            raise PdfReadError(msg)
        logger_warning(msg, __name__)
    self.resolved_objects[cache_key] = obj
    if obj is not None:
        obj.indirect_reference = IndirectObject(idnum, generation, self)
    return obj

641 

def _replace_object(self, indirect_reference: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Replace an already cached object with ``obj``.

    Args:
        indirect_reference: Reference of the cached object; it must belong
            to this reader.
        obj: The replacement object.

    Returns:
        ``obj`` with its ``indirect_reference`` set.

    Raises:
        ValueError: If the reference belongs to another document or is not
            present in the cache.

    """
    # function reserved for future development
    if indirect_reference.pdf != self:
        raise ValueError("Cannot update PdfReader with external object")
    cache_key = (indirect_reference.generation, indirect_reference.idnum)
    if cache_key not in self.resolved_objects:
        raise ValueError("Cannot find referenced object")
    self.resolved_objects[cache_key] = obj
    obj.indirect_reference = indirect_reference
    return obj

651 

def read(self, stream: StreamType) -> None:
    """
    Read and process the PDF stream, extracting necessary data.

    After the cross-reference data has been loaded, two repair passes run
    in non-strict mode: re-indexing of tables that do not start at zero,
    and removal of entries whose offset does not lead to an object header.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: In strict mode, if the xref table is broken.

    """
    self._basic_validation(stream)
    self._find_eof_marker(stream)
    startxref = self._find_startxref_pos(stream)
    self._startxref = startxref

    # check and eventually correct the startxref only if not strict
    xref_issue_nr = self._get_xref_issues(stream, startxref)
    if xref_issue_nr != 0:
        if self.strict:
            raise PdfReadError("Broken xref table")
        logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__)

    # read all cross-reference tables and their trailers
    self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr)

    # if not zero-indexed, verify that the table is correct; change it if necessary
    if self.xref_index and not self.strict:
        saved_position = stream.tell()
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            # ensure ascending to prevent damage
            for obj_id in sorted(xref_entry):
                stream.seek(xref_entry[obj_id], 0)
                try:
                    pid, _pgen = self.read_object_header(stream)
                except ValueError:
                    self._rebuild_xref_table(stream)
                    break
                if pid == obj_id - self.xref_index:
                    # fixing index item per item is required for revised PDF.
                    self.xref[gen][pid] = self.xref[gen][obj_id]
                    del self.xref[gen][obj_id]
                # if not, then either it's just plain wrong, or the
                # non-zero-index is actually correct
        stream.seek(saved_position, 0)  # return to where it was

    # remove wrong objects (not pointing to correct structures) - cf #2326
    if not self.strict:
        saved_position = stream.tell()
        for gen, xref_entry in self.xref.items():
            if gen == 65535:
                continue
            # iterate over a snapshot, entries are deleted on the way
            for obj_id in list(xref_entry):
                stream.seek(xref_entry[obj_id], 0)
                try:
                    self.read_object_header(stream)
                except ValueError:
                    logger_warning(
                        f"Ignoring wrong pointing object {obj_id} {gen} (offset {xref_entry[obj_id]})",
                        __name__,
                    )
                    del xref_entry[obj_id]  # we can delete the id, we are parsing ids
        stream.seek(saved_position, 0)  # return to where it was

717 

def _basic_validation(self, stream: StreamType) -> None:
    """
    Ensure the stream is valid and not empty.

    The stream is left positioned at its end.

    Args:
        stream: The PDF file stream.

    Raises:
        UnsupportedOperation: If reading the header fails with a decoding error.
        EmptyFileError: If the stream is empty.
        PdfReadError: In strict mode, if the stream does not start with ``%PDF-``.

    """
    stream.seek(0, os.SEEK_SET)
    try:
        header_byte = stream.read(5)
    except UnicodeDecodeError as exc:
        raise UnsupportedOperation("cannot read header") from exc
    if header_byte == b"":
        raise EmptyFileError("Cannot read an empty file")
    if header_byte != b"%PDF-":
        if self.strict:
            # Decode leniently: arbitrary binary data must still end up in a
            # PdfReadError, not in a UnicodeDecodeError raised while the
            # message is being built.
            if isinstance(header_byte, bytes):
                shown = header_byte.decode("utf8", "backslashreplace")
            else:
                shown = str(header_byte)
            raise PdfReadError(
                f"PDF starts with '{shown}', "
                "but '%PDF-' expected"
            )
        logger_warning(f"invalid pdf header: {header_byte!r}", __name__)
    stream.seek(0, os.SEEK_END)

735 

def _find_eof_marker(self, stream: StreamType) -> None:
    """
    Jump to the %%EOF marker.

    According to the specs, the %%EOF marker should be at the very end of
    the file. Hence for standard-compliant PDF documents this function will
    read only the last part (DEFAULT_BUFFER_SIZE).

    Raises:
        PdfReadError: In strict mode, if the start of the file is reached
            without finding the marker.

    """
    HEADER_SIZE = 8  # to parse whole file, Header is e.g. '%PDF-1.6'
    truncated_markers = (b"%%EO", b"%%E", b"%%", b"%")
    line = b""
    first = True
    while not line.startswith(b"%%EOF"):
        if first and line != b"":
            if line.strip().endswith(truncated_markers):
                # Consider the file as truncated while
                # having enough confidence to carry on.
                logger_warning("EOF marker seems truncated", __name__)
                break
            first = False
            if b"startxref" in line:
                logger_warning(
                    "CAUTION: startxref found while searching for %%EOF. "
                    "The file might be truncated and some data might not be read.",
                    __name__,
                )
        if stream.tell() < HEADER_SIZE:
            if self.strict:
                raise PdfReadError("EOF marker not found")
            logger_warning("EOF marker not found", __name__)
        line = read_previous_line(stream)

768 

def _find_startxref_pos(self, stream: StreamType) -> int:
    """
    Find startxref entry - the location of the xref table.

    Args:
        stream: Stream positioned just after the line holding the offset.

    Returns:
        The bytes offset

    Raises:
        PdfReadError: If the ``startxref`` keyword cannot be found.

    """
    offset_line = read_previous_line(stream)
    try:
        startxref = int(offset_line)
    except ValueError:
        # 'startxref' may be on the same line as the location
        if not offset_line.startswith(b"startxref"):
            raise PdfReadError("startxref not found")
        startxref = int(offset_line[9:].strip())
        logger_warning("startxref on same line as offset", __name__)
        return startxref

    # Regular layout: the keyword is on the line before the offset.
    keyword_line = read_previous_line(stream)
    if not keyword_line.startswith(b"startxref"):
        raise PdfReadError("startxref not found")
    return startxref

794 

def _read_standard_xref_table(self, stream: StreamType) -> None:
    """
    Parse a classic (non-stream) cross-reference table.

    All subsections are read until the ``trailer`` keyword is reached.
    Entries that are already known are kept, so tables read earlier take
    precedence. An entry that cannot be parsed is looked up in the raw
    file by its object number.

    Args:
        stream: Stream positioned on ``ref``; presumably the leading ``x``
            of ``xref`` has been consumed by the caller.

    Raises:
        PdfReadError: If the keyword is wrong or the table ends prematurely.

    """
    # standard cross-reference table
    ref = stream.read(3)
    if ref != b"ref":
        raise PdfReadError("xref table read error")
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    first_time = True  # check if the first time looking at the xref table
    while True:
        num = cast(int, read_object(stream, self))
        if first_time and num != 0:
            self.xref_index = num
            if self.strict:
                logger_warning(
                    "Xref table not zero-indexed. ID numbers for objects will be corrected.",
                    __name__,
                )
                # if table not zero indexed, could be due to error from when PDF was created
                # which will lead to mismatched indices later on, only warned and corrected if self.strict==True
        first_time = False
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        size = cast(int, read_object(stream, self))
        if not isinstance(size, int):
            logger_warning(
                "Invalid/Truncated xref table. Rebuilding it.",
                __name__,
            )
            self._rebuild_xref_table(stream)
            stream.read()
            return
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        cnt = 0
        while cnt < size:
            line = stream.read(20)
            if not line:
                raise PdfReadError("Unexpected empty line in Xref table.")

            # It's very clear in section 3.4.3 of the PDF spec
            # that all cross-reference table lines are a fixed
            # 20 bytes (as of PDF 1.7). However, some files have
            # 21-byte entries (or more) due to the use of \r\n
            # (CRLF) EOL's. Detect that case, and adjust the line
            # until it does not begin with a \r (CR) or \n (LF).
            while line[0] in b"\x0D\x0A":
                stream.seek(-20 + 1, 1)
                line = stream.read(20)

            # On the other hand, some malformed PDF files
            # use a single character EOL without a preceding
            # space. Detect that case, and seek the stream
            # back one character (0-9 means we've bled into
            # the next xref entry, t means we've bled into the
            # text "trailer"):
            if line[-1] in b"0123456789t":
                stream.seek(-1, 1)

            # Take the type byte before the fields are parsed. Slicing
            # cannot fail, so the recovery path below never works with an
            # unbound name or with the type of the previous entry.
            entry_type_b = line[17:18]
            try:
                offset_b, generation_b = line[:16].split(b" ")
                offset, generation = int(offset_b), int(generation_b)
            except Exception:
                if hasattr(stream, "getbuffer"):
                    buf = bytes(stream.getbuffer())
                else:
                    p = stream.tell()
                    stream.seek(0, 0)
                    buf = stream.read(-1)
                    stream.seek(p)

                # The lookbehind keeps e.g. object 2 from matching inside
                # "12 0 obj"; the match still starts at the object number.
                f = re.search(rf"(?<!\d){num}\s+(\d+)\s+obj".encode(), buf)
                if f is None:
                    logger_warning(
                        f"entry {num} in Xref table invalid; object not found",
                        __name__,
                    )
                    generation = 65535
                    offset = -1
                    entry_type_b = b"f"
                else:
                    logger_warning(
                        f"entry {num} in Xref table invalid but object found",
                        __name__,
                    )
                    generation = int(f.group(1))
                    offset = f.start()

            if generation not in self.xref:
                self.xref[generation] = {}
                self.xref_free_entry[generation] = {}
            if num in self.xref[generation]:
                # It really seems like we should allow the last
                # xref table in the file to override previous
                # ones. Since we read the file backwards, assume
                # any existing key is already set correctly.
                pass
            else:
                if entry_type_b == b"n":
                    self.xref[generation][num] = offset
                # Recording the free flags is best effort: the per-generation
                # dictionaries may be missing, which is deliberately ignored.
                try:
                    self.xref_free_entry[generation][num] = entry_type_b == b"f"
                except Exception:
                    pass
                try:
                    self.xref_free_entry[65535][num] = entry_type_b == b"f"
                except Exception:
                    pass
            cnt += 1
            num += 1
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        # Skip any PDF comments between xref entries and the trailer
        # keyword. Some PDF producers (e.g. Vectorizer.AI) insert
        # comments here which are legal per the PDF spec (§7.2.3).
        while stream.read(1) == b"%":
            stream.seek(-1, 1)
            skip_over_comment(stream)
            read_non_whitespace(stream)
            stream.seek(-1, 1)
        # undo the one-byte look-ahead of the loop condition
        stream.seek(-1, 1)
        trailer_tag = stream.read(7)
        if trailer_tag != b"trailer":
            # more xrefs!
            stream.seek(-7, 1)
        else:
            break

923 

def _read_xref_tables_and_trailers(
    self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
    """
    Read the cross-reference tables and trailers in the PDF stream.

    Starting at ``startxref``, one cross-reference section after the other
    is read. The offset of the next section is taken from the result of the
    section reader (or from the ``/Prev`` entry of an xref stream) until no
    further offset is available.

    Args:
        stream: The stream to read from.
        startxref: Offset of the first cross-reference section to read.
            ``None`` means that there is nothing to read.
        xref_issue_nr: Non-zero if an issue with the xref position has been
            detected. In this case, rebuilding the table is attempted (at
            most once) as soon as a section does not start with ``x``.

    Raises:
        PdfReadError: If an xref stream cannot be read while no ``/Root``
            entry has been collected in the trailer yet.

    """
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    visited_xref_offsets: set[int] = set()
    while startxref is not None:
        # Detect circular /Prev references in the xref chain
        if startxref in visited_xref_offsets:
            logger_warning(
                f"Circular xref chain detected at offset {startxref}, stopping",
                __name__,
            )
            break
        visited_xref_offsets.add(startxref)
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x in b"\r\n":
            x = stream.read(1)
        if x == b"x":
            startxref = self._read_xref(stream)
        elif xref_issue_nr:
            try:
                self._rebuild_xref_table(stream)
                break
            except Exception:
                # Rebuilding failed: retry the very same offset with the
                # regular readers. The offset has to be forgotten again, as
                # the retry would otherwise be mistaken for a circular chain
                # and abort the loop without reading anything.
                xref_issue_nr = 0
                visited_xref_offsets.discard(startxref)
        elif x.isdigit():
            try:
                xrefstream = self._read_pdf15_xref_stream(stream)
            except Exception as e:
                if TK.ROOT in self.trailer:
                    # A usable trailer is known already, thus only warn.
                    logger_warning(
                        f"Previous trailer cannot be read: {e.args}", __name__
                    )
                    break
                raise PdfReadError(f"Trailer cannot be read: {e!s}")
            self._process_xref_stream(xrefstream)
            if "/Prev" in xrefstream:
                startxref = cast(int, xrefstream["/Prev"])
            else:
                break
        else:
            startxref = self._read_xref_other_error(stream, startxref)

972 

def _process_xref_stream(self, xrefstream: DictionaryObject) -> None:
    """
    Merge the trailer information of an xref stream into the trailer.

    Args:
        xrefstream: The cross-reference stream to take the entries from.

    """
    # Entries which are part of the trailer already are kept untouched.
    for trailer_key in (TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE):
        if trailer_key not in xrefstream or trailer_key in self.trailer:
            continue
        self.trailer[NameObject(trailer_key)] = xrefstream.raw_get(trailer_key)

    if "/XRefStm" not in xrefstream:
        return

    # Read the referenced xref stream as well and return to the previous
    # position afterwards. The offset is increased by one, as reading the
    # xref stream starts with stepping back by one byte.
    pdf_stream = self.stream
    previous_position = pdf_stream.tell()
    pdf_stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0)
    self._read_pdf15_xref_stream(pdf_stream)
    pdf_stream.seek(previous_position, 0)

984 

def _read_xref(self, stream: StreamType) -> Optional[int]:
    """
    Read a classic xref table together with the trailer following it.

    Args:
        stream: The stream to read from.

    Returns:
        The value of the ``/Prev`` entry of the trailer which has been read.
        ``None`` if the stream ends right after the table or if the trailer
        does not have a ``/Prev`` entry.

    """
    self._read_standard_xref_table(stream)
    reached_end = stream.read(1) == b""
    if reached_end:
        return None
    stream.seek(-1, 1)
    read_non_whitespace(stream)
    stream.seek(-1, 1)

    parsed_trailer = cast(dict[str, Any], read_object(stream, self))
    # Entries which are known already take precedence over the new ones.
    for name, raw_value in parsed_trailer.items():
        if name in self.trailer:
            continue
        self.trailer[name] = raw_value

    if "/XRefStm" in parsed_trailer:
        previous_position = stream.tell()
        # One byte further, as reading the xref stream starts with stepping
        # back by one byte.
        stream.seek(cast(int, parsed_trailer["/XRefStm"]) + 1, 0)
        try:
            self._read_pdf15_xref_stream(stream)
        except Exception:
            logger_warning(
                f"XRef object at {parsed_trailer['/XRefStm']} can not be read, some object may be missing",
                __name__,
            )
        stream.seek(previous_position, 0)

    return cast(int, parsed_trailer["/Prev"]) if "/Prev" in parsed_trailer else None

1010 

1011 def _read_xref_other_error( 

1012 self, stream: StreamType, startxref: int 

1013 ) -> Optional[int]: 

1014 # some PDFs have /Prev=0 in the trailer, instead of no /Prev 

1015 if startxref == 0: 

1016 if self.strict: 

1017 raise PdfReadError( 

1018 "/Prev=0 in the trailer (try opening with strict=False)" 

1019 ) 

1020 logger_warning( 

1021 "/Prev=0 in the trailer - assuming there is no previous xref table", 

1022 __name__, 

1023 ) 

1024 return None 

1025 # bad xref character at startxref. Let's see if we can find 

1026 # the xref table nearby, as we've observed this error with an 

1027 # off-by-one before. 

1028 stream.seek(-11, 1) 

1029 tmp = stream.read(20) 

1030 xref_loc = tmp.find(b"xref") 

1031 if xref_loc != -1: 

1032 startxref -= 10 - xref_loc 

1033 return startxref 

1034 # No explicit xref table, try finding a cross-reference stream. 

1035 stream.seek(startxref, 0) 

1036 for look in range(25): # value extended to cope with more linearized files 

1037 if stream.read(1).isdigit(): 

1038 # This is not a standard PDF, consider adding a warning 

1039 startxref += look 

1040 return startxref 

1041 # no xref table found at specified location 

1042 if "/Root" in self.trailer and not self.strict: 

1043 # if Root has been already found, just raise warning 

1044 logger_warning("Invalid parent xref., rebuild xref", __name__) 

1045 try: 

1046 self._rebuild_xref_table(stream) 

1047 return None 

1048 except Exception: 

1049 raise PdfReadError("Cannot rebuild xref") 

1050 raise PdfReadError("Could not find xref table at specified location") 

1051 

1052 def _sanitize_pdf15_xref_stream_index_pairs( 

1053 self, index_pairs: list[int], entry_sizes: list[int], xref_stream: ContentStream 

1054 ) -> list[int]: 

1055 # `entry_sizes` holds the byte widths for the entries. Summing determines the total number of bytes per entry. 

1056 # We expect up to 3 values, clamping to at least 1 avoids ZeroDivisionError in next step. 

1057 # `min_entry_bytes` will be the smallest plausible size of one xref entry. 

1058 min_entry_bytes = max(sum(int(entry_sizes[i]) for i in range(min(len(entry_sizes), 3))), 1) 

1059 # maximum number of entries that could physically fit 

1060 max_entries = len(xref_stream.get_data()) // min_entry_bytes + 1 

1061 

1062 result = [] 

1063 total = 0 

1064 

1065 for index, pair_value in enumerate(index_pairs): 

1066 pair_value_int = int(pair_value) 

1067 

1068 # `index_pairs` has the format `[start0, count0, start1, count1, ...]` 

1069 # Only modify the counts here, but keep the start values. 

1070 if index % 2 == 1: 

1071 if total + pair_value_int > max_entries: 

1072 if self.strict: 

1073 raise LimitReachedError( 

1074 f"Total XRef entries {total + pair_value_int} exceed maximum allowed value {max_entries}." 

1075 ) 

1076 new_v = max(0, max_entries - total) 

1077 logger_warning( 

1078 f"Clamping XRef count from {pair_value_int} to {new_v} to fit stream size.", 

1079 src=__name__ 

1080 ) 

1081 pair_value_int = new_v 

1082 

1083 total += pair_value_int 

1084 

1085 result.append(pair_value_int) 

1086 

1087 return result 

1088 

def _read_pdf15_xref_stream(
    self, stream: StreamType
) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]:
    """
    Read the cross-reference stream for PDF 1.5+.

    Args:
        stream: The stream to read from. Reading starts one byte before the
            current position.

    Returns:
        The xref stream object which has been read.

    Raises:
        PdfReadError: If the object is not of type ``/XRef``, if ``/Size``
            is missing or, in strict mode, if ``/W`` has more than three
            values.

    """
    stream.seek(-1, 1)
    stream_idnum, stream_generation = self.read_object_header(stream)
    xref_stream = cast(ContentStream, read_object(stream, self))
    declared_type = cast(str, xref_stream["/Type"])
    if declared_type != "/XRef":
        raise PdfReadError(f"Unexpected type {xref_stream['/Type']!r}")
    self.cache_indirect_object(stream_generation, stream_idnum, xref_stream)

    if "/Size" not in xref_stream:
        # According to table 17 of the PDF 2.0 specification, this key is required.
        raise PdfReadError(f"Size missing from XRef stream {xref_stream!r}!")
    # Index pairs specify the subsections in the dictionary.
    # If none, create one subsection that spans everything.
    index_pairs = xref_stream.get("/Index", [0, xref_stream["/Size"]])

    entry_sizes = cast(list[int], xref_stream.get("/W"))
    assert len(entry_sizes) >= 3
    if self.strict and len(entry_sizes) > 3:
        raise PdfReadError(f"Too many entry sizes: {entry_sizes}")
    index_pairs = self._sanitize_pdf15_xref_stream_index_pairs(
        index_pairs=index_pairs, entry_sizes=entry_sizes, xref_stream=xref_stream
    )

    stream_data = BytesIO(xref_stream.get_data())

    def get_entry(i: int) -> Union[int, tuple[int, ...]]:
        # Reads the correct number of bytes for each entry. See the
        # discussion of the W parameter in PDF spec table 17.
        width = entry_sizes[i]
        if width <= 0:
            # PDF Spec Table 17: A value of zero for an element in the
            # W array indicates...the default value shall be used.
            # The first value defaults to 1, the other ones to 0.
            return 1 if i == 0 else 0
        return convert_to_int(stream_data.read(width), width)

    def used_before(num: int, generation: Union[int, tuple[int, ...]]) -> bool:
        # We move backwards through the xrefs, don't replace any.
        if num in self.xref.get(generation, []):  # type: ignore
            return True
        return num in self.xref_objStm

    # Iterate through each subsection
    self._read_xref_subsections(index_pairs, get_entry, used_before)
    return xref_stream

1137 

1138 @staticmethod 

1139 def _get_xref_issues(stream: StreamType, startxref: int) -> int: 

1140 """ 

1141 Return an int which indicates an issue. 0 means there is no issue. 

1142 

1143 Args: 

1144 stream: 

1145 startxref: 

1146 

1147 Returns: 

1148 0 means no issue, other values represent specific issues. 

1149 

1150 """ 

1151 if startxref == 0: 

1152 return 4 

1153 

1154 stream.seek(startxref - 1, 0) # -1 to check character before 

1155 line = stream.read(1) 

1156 if line == b"j": 

1157 line = stream.read(1) 

1158 if line not in b"\r\n \t": 

1159 return 1 

1160 line = stream.read(4) 

1161 if line != b"xref": 

1162 # not a xref so check if it is an XREF object 

1163 line = b"" 

1164 while line in b"0123456789 \t": 

1165 line = stream.read(1) 

1166 if line == b"": 

1167 return 2 

1168 line += stream.read(2) # 1 char already read, +2 to check "obj" 

1169 if line.lower() != b"obj": 

1170 return 3 

1171 return 0 

1172 

@classmethod
def _find_pdf_objects(cls, data: bytes) -> Iterable[tuple[int, int, int]]:
    """
    Find everything which looks like the header of an indirect object.

    The data is searched for the marker `` obj``. From each match, the
    generation number and the object number are read backwards, each of
    them optionally preceded by whitespace. Matches lacking one of the two
    numbers are skipped.

    Args:
        data: The raw bytes to search.

    Yields:
        Tuples of object number, generation number and the offset of the
        first digit of the object number.

    """
    marker = b" obj"
    digits = b"0123456789"
    search_from = 0
    while (marker_at := data.find(marker, search_from)) != -1:
        cursor = marker_at - 1

        # Whitespace in front of the marker
        while cursor >= 0 and data[cursor] in WHITESPACES_AS_BYTES:
            cursor -= 1

        # Generation number
        generation_stop = cursor + 1
        while cursor >= 0 and data[cursor] in digits:
            cursor -= 1
        generation_begin = cursor + 1

        # Whitespace between both numbers
        while cursor >= 0 and data[cursor] in WHITESPACES_AS_BYTES:
            cursor -= 1

        # Object number
        number_stop = cursor + 1
        while cursor >= 0 and data[cursor] in digits:
            cursor -= 1
        number_begin = cursor + 1

        # Both numbers need at least one digit.
        if number_begin < number_stop and generation_begin < generation_stop:
            yield (
                int(data[number_begin:number_stop]),
                int(data[generation_begin:generation_stop]),
                number_begin,
            )

        search_from = marker_at + len(marker)

1213 

@classmethod
def _find_pdf_trailers(cls, data: bytes) -> Iterable[int]:
    """
    Find the trailer dictionaries within the data.

    A match is the keyword ``trailer``, optionally followed by whitespace,
    followed by the start of a dictionary (``<<``).

    Args:
        data: The raw bytes to search.

    Yields:
        The offset of the ``<<`` of each trailer dictionary.

    """
    keyword = b"trailer"
    keyword_length = len(keyword)
    total_length = len(data)

    keyword_at = data.find(keyword)
    while keyword_at != -1:
        cursor = keyword_at + keyword_length

        # Skip whitespace
        while cursor < total_length and data[cursor] in WHITESPACES_AS_BYTES:
            cursor += 1

        # Must be dictionary start
        if data.startswith(b"<<", cursor):
            yield cursor  # offset of '<<'

        keyword_at = data.find(keyword, keyword_at + keyword_length)

1234 

def _rebuild_xref_table(self, stream: StreamType) -> None:
    """
    Rebuild the xref data by scanning the complete stream.

    The previous content of ``self.xref`` is discarded. Afterwards, the
    whole stream is searched for object headers, the objects found are
    checked for being object streams and all trailer dictionaries found
    are merged into ``self.trailer``.

    Args:
        stream: The stream to read from. It is read completely and its
            position is changed.

    """
    self.xref = {}
    stream.seek(0, 0)
    stream_data = stream.read(-1)

    # Register every object header found. For duplicated headers, the one
    # found last (i.e. the one closest to the end of the data) wins.
    for object_number, generation_number, object_start in self._find_pdf_objects(stream_data):
        if generation_number not in self.xref:
            self.xref[generation_number] = {}
        self.xref[generation_number][object_number] = object_start

    logger_warning("parsing for Object Streams", __name__)
    for generation_number in self.xref:
        for object_number in self.xref[generation_number]:
            # get_object in manual
            stream.seek(self.xref[generation_number][object_number], 0)
            try:
                _ = self.read_object_header(stream)
                obj = cast(StreamObject, read_object(stream, self))
                if obj.get("/Type", "") != "/ObjStm":
                    continue
                # The decoded data is read as pairs of integers until
                # something which is not a number is reached.
                object_stream = BytesIO(obj.get_data())
                actual_count = 0
                while True:
                    current = read_until_whitespace(object_stream)
                    if not current.isdigit():
                        break
                    inner_object_number = int(current)
                    skip_over_whitespace(object_stream)
                    # presumably skip_over_whitespace consumes one byte too
                    # much, hence stepping back - TODO confirm
                    object_stream.seek(-1, 1)
                    current = read_until_whitespace(object_stream)
                    if not current.isdigit():  # pragma: no cover
                        break  # pragma: no cover
                    # NOTE(review): in the header of an object stream, the
                    # second number of each pair looks like a byte offset
                    # rather than a generation number - verify how the second
                    # tuple element of xref_objStm is consumed.
                    inner_generation_number = int(current)
                    self.xref_objStm[inner_object_number] = (object_number, inner_generation_number)
                    actual_count += 1
                if actual_count != obj.get("/N"):  # pragma: no cover
                    logger_warning(  # pragma: no cover
                        f"found {actual_count} objects within Object({object_number},{generation_number})"
                        f" whereas {obj.get('/N')} expected",
                        __name__,
                    )
            except Exception:  # could be multiple causes
                # Best effort: objects which cannot be read are ignored.
                pass

    stream.seek(0, 0)
    for position in self._find_pdf_trailers(stream_data):
        stream.seek(position, 0)
        new_trailer = cast(dict[Any, Any], read_object(stream, self))
        # Here, we are parsing the file from start to end, the new data have to erase the existing.
        for key, value in new_trailer.items():
            self.trailer[key] = value

1286 

1287 def _read_xref_subsections( 

1288 self, 

1289 idx_pairs: list[int], 

1290 get_entry: Callable[[int], Union[int, tuple[int, ...]]], 

1291 used_before: Callable[[int, Union[int, tuple[int, ...]]], bool], 

1292 ) -> None: 

1293 """Read and process the subsections of the xref.""" 

1294 for start, size in self._pairs(idx_pairs): 

1295 # The subsections must increase 

1296 for num in range(start, start + size): 

1297 # The first entry is the type 

1298 xref_type = get_entry(0) 

1299 # The rest of the elements depend on the xref_type 

1300 if xref_type == 0: 

1301 # linked list of free objects 

1302 next_free_object = get_entry(1) # noqa: F841 

1303 next_generation = get_entry(2) # noqa: F841 

1304 elif xref_type == 1: 

1305 # objects that are in use but are not compressed 

1306 byte_offset = get_entry(1) 

1307 generation = get_entry(2) 

1308 if generation not in self.xref: 

1309 self.xref[generation] = {} # type: ignore 

1310 if not used_before(num, generation): 

1311 self.xref[generation][num] = byte_offset # type: ignore 

1312 elif xref_type == 2: 

1313 # compressed objects 

1314 objstr_num = get_entry(1) 

1315 obstr_idx = get_entry(2) 

1316 generation = 0 # PDF spec table 18, generation is 0 

1317 if not used_before(num, generation): 

1318 self.xref_objStm[num] = (objstr_num, obstr_idx) 

1319 elif self.strict: 

1320 raise PdfReadError(f"Unknown xref type: {xref_type}") 

1321 

1322 def _pairs(self, array: list[int]) -> Iterable[tuple[int, int]]: 

1323 """Iterate over pairs in the array.""" 

1324 i = 0 

1325 while i + 1 < len(array): 

1326 yield array[i], array[i + 1] 

1327 i += 2 

1328 

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
    """
    Decrypt an encrypted / secured PDF file which uses the PDF Standard
    encryption handler.

    The given password is checked against the user password and the owner
    password of the document. If either of them matches, the resulting
    decryption key is stored.

    It does not matter which password was matched. Both passwords provide
    the correct decryption key that will allow the document to be used with
    this library.

    Args:
        password: The password to match.

    Returns:
        An indicator if the document was decrypted and whether it was the
        owner password or the user password.

    Raises:
        PdfReadError: If the file is not encrypted.

    """
    encryption = self._encryption
    if encryption:
        # TODO: raise Exception for wrong password
        return encryption.verify(password)
    raise PdfReadError("Not encrypted file")

1353 

@property
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only).

    The value only depends on the encryption entry being part of the
    trailer. Thus, it remains true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
    """
    trailer = self.trailer
    return TK.ENCRYPT in trailer

1363 

def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Add a top level form that groups all form fields below it.

    The new field is registered with generation 0 and an object number
    one above the highest generation 0 number of the objects resolved so
    far. It replaces the content of the ``/Fields`` array of the AcroForm
    and becomes the ``/Parent`` of all previous top level fields.

    Args:
        name: text string of the "/T" Attribute of the created object

    Returns:
        The created object. ``None`` means no object was created.

    """
    catalog = self.root_object

    has_form = "/AcroForm" in catalog and isinstance(
        catalog["/AcroForm"], DictionaryObject
    )
    if not has_form:
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        # TODO: No error but this may be extended for XFA Forms
        return None

    # The new field takes over the previous top level fields as its kids.
    interim = DictionaryObject()
    interim[NameObject("/T")] = TextStringObject(name)
    interim[NameObject("/Kids")] = acroform[NameObject("/Fields")]
    highest_idnum = max(
        idnum for (generation, idnum) in self.resolved_objects if generation == 0
    )
    self.cache_indirect_object(0, highest_idnum + 1, interim)

    # From now on, the new field is the only top level field.
    top_level_fields = ArrayObject()
    top_level_fields.append(interim.indirect_reference)
    acroform[NameObject("/Fields")] = top_level_fields

    for kid in cast(ArrayObject, interim["/Kids"]):
        obj = kid.get_object()
        if "/Parent" in obj:
            logger_warning(
                f"Top Level Form Field {obj.indirect_reference} have a non-expected parent",
                __name__,
            )
        obj[NameObject("/Parent")] = interim.indirect_reference
    return interim

1406 

def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Rename the top level form field that groups all form fields below it.

    The first entry of the ``/Fields`` array of the AcroForm is used as
    the field to rename.

    Args:
        name: text string to set as the "/T" attribute of the field

    Returns:
        The modified object. ``None`` means no object was modified, which
        is the case if there is no AcroForm dictionary, no ``/Fields``
        entry or if ``/Fields`` is empty.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not isinstance(
        catalog["/AcroForm"], DictionaryObject
    ):
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        return None

    fields = cast(ArrayObject, acroform[NameObject("/Fields")])
    if not fields:
        # There is no field to rename. Accessing the first entry would
        # raise an IndexError instead of reporting "nothing modified".
        return None

    interim = cast(DictionaryObject, fields[0].get_object())
    interim[NameObject("/T")] = TextStringObject(name)
    return interim

1434 

1435 def _repr_mimebundle_( 

1436 self, 

1437 include: Union[None, Iterable[str]] = None, 

1438 exclude: Union[None, Iterable[str]] = None, 

1439 ) -> dict[str, Any]: 

1440 """ 

1441 Integration into Jupyter Notebooks. 

1442 

1443 This method returns a dictionary that maps a mime-type to its 

1444 representation. 

1445 

1446 .. seealso:: 

1447 

1448 https://ipython.readthedocs.io/en/stable/config/integrating.html 

1449 """ 

1450 self.stream.seek(0) 

1451 pdf_data = self.stream.read() 

1452 data = { 

1453 "application/pdf": pdf_data, 

1454 } 

1455 

1456 if include is not None: 

1457 # Filter representations based on include list 

1458 data = {k: v for k, v in data.items() if k in include} 

1459 

1460 if exclude is not None: 

1461 # Remove representations based on exclude list 

1462 data = {k: v for k, v in data.items() if k not in exclude} 

1463 

1464 return data