Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_reader.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

804 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import os 

31import re 

32import sys 

33from collections.abc import Iterable 

34from io import BytesIO, UnsupportedOperation 

35from pathlib import Path 

36from types import TracebackType 

37from typing import ( 

38 TYPE_CHECKING, 

39 Any, 

40 Callable, 

41 Optional, 

42 Union, 

43 cast, 

44) 

45 

46if sys.version_info >= (3, 11): 

47 from typing import Self 

48else: 

49 from typing_extensions import Self 

50 

51from ._doc_common import PdfDocCommon, convert_to_int 

52from ._encryption import Encryption, PasswordType 

53from ._utils import ( 

54 WHITESPACES_AS_BYTES, 

55 StrByteType, 

56 StreamType, 

57 logger_warning, 

58 read_non_whitespace, 

59 read_previous_line, 

60 read_until_whitespace, 

61 skip_over_comment, 

62 skip_over_whitespace, 

63) 

64from .constants import TrailerKeys as TK 

65from .errors import ( 

66 EmptyFileError, 

67 FileNotDecryptedError, 

68 LimitReachedError, 

69 PdfReadError, 

70 PdfStreamError, 

71 WrongPasswordError, 

72) 

73from .generic import ( 

74 ArrayObject, 

75 ContentStream, 

76 DecodedStreamObject, 

77 Destination, 

78 DictionaryObject, 

79 EncodedStreamObject, 

80 IndirectObject, 

81 NameObject, 

82 NullObject, 

83 NumberObject, 

84 PdfObject, 

85 StreamObject, 

86 TextStringObject, 

87 TreeObject, 

88 is_null_or_none, 

89 read_object, 

90) 

91from .xmp import XmpInformation 

92 

93if TYPE_CHECKING: 

94 from ._page import PageObject 

95 

96 

97class PdfReader(PdfDocCommon): 

98 """ 

99 Initialize a PdfReader object. 

100 

101 This operation can take some time, as the PDF stream's cross-reference 

102 tables are read into memory. 

103 

104 Args: 

105 stream: A File object or an object that supports the standard read 

106 and seek methods similar to a File object. Could also be a 

107 string representing a path to a PDF file. 

108 strict: Determines whether user should be warned of all 

109 problems and also causes some correctable problems to be fatal. 

110 Defaults to ``False``. 

111 password: Decrypt PDF file at initialization. If the 

112 password is None, the file will not be decrypted. 

113 Defaults to ``None``. 

114 root_object_recovery_limit: The maximum number of objects to query 

115 for recovering the Root object in non-strict mode. To disable 

116 this security measure, pass ``None``. 

117 

118 """ 

119 

def __init__(
    self,
    stream: Union[StrByteType, Path],
    strict: bool = False,
    password: Union[None, str, bytes] = None,
    *,
    root_object_recovery_limit: Optional[int] = 10_000,
) -> None:
    """Set up the reader state, parse ``stream`` and handle encryption."""
    self.strict = strict
    self.flattened_pages: Optional[list[PageObject]] = None

    #: Storage of parsed PDF objects.
    self.resolved_objects: dict[tuple[Any, Any], Optional[PdfObject]] = {}

    # Cross-reference state; all of it starts out empty.
    self._startxref: int = 0
    self.xref_index = 0
    self.xref: dict[int, dict[Any, Any]] = {}
    self.xref_free_entry: dict[int, dict[Any, Any]] = {}
    self.xref_objStm: dict[int, tuple[Any, Any]] = {}
    self.trailer = DictionaryObject()

    # Security parameters: any non-int value (e.g. ``None``) removes the limit.
    if isinstance(root_object_recovery_limit, int):
        self._root_object_recovery_limit = root_object_recovery_limit
    else:
        self._root_object_recovery_limit = sys.maxsize

    # Map page indirect_reference number to page number
    self._page_id2num: Optional[dict[Any, Any]] = None

    # Cache used by the ``root_object`` property.
    self._validated_root: Optional[DictionaryObject] = None

    self._initialize_stream(stream)
    # Objects currently being read by ``get_object`` (self-reference guard).
    self._known_objects: set[tuple[int, int]] = set()

    self._override_encryption = False
    self._encryption: Optional[Encryption] = None
    if self.is_encrypted:
        self._handle_encryption(password)
    elif password is not None:
        # A password only makes sense for an encrypted document.
        raise PdfReadError("Not an encrypted file")

    self._named_destinations_cache: Optional[dict[str, Destination]] = None

162 

163 def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None: 

164 if hasattr(stream, "mode") and "b" not in stream.mode: 

165 logger_warning( 

166 "PdfReader stream/file object is not in binary mode. " 

167 "It may not be read correctly.", 

168 source=__name__, 

169 ) 

170 self._stream_opened = False 

171 if isinstance(stream, (str, Path)): 

172 with open(stream, "rb") as fh: 

173 stream = BytesIO(fh.read()) 

174 self._stream_opened = True 

175 self.read(stream) 

176 self.stream = stream 

177 

def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None:
    """
    Build the ``Encryption`` handler from the trailer and verify a password.

    Args:
        password: Password to verify; with ``None`` the empty password is
            tried and a failure is not reported.

    Raises:
        WrongPasswordError: A password was supplied and it was rejected.

    """
    # Bypass decryption while the encryption data itself is being read.
    self._override_encryption = True
    # Some documents may not have a /ID, use two empty
    # byte strings instead. Solves
    # https://github.com/py-pdf/pypdf/issues/608
    trailer_id = self.trailer.get(TK.ID)
    if trailer_id:
        first_id = trailer_id[0].get_object().original_bytes
    else:
        first_id = b""
    encrypt_dict = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
    self._encryption = Encryption.read(encrypt_dict, first_id)

    # try empty password if no password provided
    candidate = b"" if password is None else password
    outcome = self._encryption.verify(candidate, strict=self.strict)
    if outcome == PasswordType.NOT_DECRYPTED and password is not None:
        # raise if password provided
        raise WrongPasswordError("Wrong password")
    self._override_encryption = False

197 

198 def __enter__(self) -> Self: 

199 return self 

200 

201 def __exit__( 

202 self, 

203 exc_type: Optional[type[BaseException]], 

204 exc_val: Optional[BaseException], 

205 exc_tb: Optional[TracebackType], 

206 ) -> None: 

207 self.close() 

208 

def close(self) -> None:
    """Close the stream if opened in __init__ and clear memory."""
    if self._stream_opened:
        self.stream.close()
    # Reset every parsed structure to an empty container.
    self.flattened_pages = []
    self.resolved_objects = {}
    self.trailer = DictionaryObject()
    self.xref, self.xref_free_entry, self.xref_objStm = {}, {}, {}

219 

@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to "/Root". Standardized with PdfWriter.

    The trailer's "/Root" entry is used when its "/Type" is "/Catalog".
    Otherwise objects ``1 .. /Size`` are scanned for a catalog (bounded by the
    root object recovery limit), and as a last resort a trailer root that
    contains "/Pages" is accepted. The result is kept in
    ``self._validated_root``.

    Raises:
        LimitReachedError: The scan hit the recovery limit.
        PdfReadError: No usable root object could be found.

    """
    if self._validated_root:
        return self._validated_root

    root = self.trailer.get(TK.ROOT)

    def trailer_root() -> DictionaryObject:
        # Resolve the trailer entry; only called once ``root`` is known to be set.
        return cast(DictionaryObject, cast(PdfObject, root).get_object())

    if is_null_or_none(root):
        logger_warning('Cannot find "/Root" key in trailer', source=__name__)
    elif trailer_root().get("/Type") == "/Catalog":
        self._validated_root = trailer_root()
    else:
        logger_warning("Invalid Root object in trailer", source=__name__)

    if self._validated_root is not None:
        return self._validated_root

    # Recovery, step 1: look through the numbered objects for a catalog.
    logger_warning('Searching object with "/Catalog" key', source=__name__)
    object_count = cast(int, self.trailer.get("/Size", 0))
    for index in range(object_count):
        if index >= self._root_object_recovery_limit:
            raise LimitReachedError("Maximum Root object recovery limit reached.")
        try:
            candidate = self.get_object(index + 1)
        except Exception:  # to be sure to capture all errors
            candidate = None
        if isinstance(candidate, DictionaryObject) and candidate.get("/Type") == "/Catalog":
            self._validated_root = candidate
            logger_warning(
                "Root found at %(obj_reference)r",
                source=__name__,
                obj_reference=candidate.indirect_reference,
            )
            break

    if self._validated_root is not None:
        return self._validated_root

    # Recovery, step 2: accept a trailer root that at least has "/Pages".
    if is_null_or_none(root) or "/Pages" not in trailer_root():
        raise PdfReadError("Cannot find Root object in pdf")
    logger_warning(
        "Possible root found at %(root_ref)r, but missing /Catalog key",
        source=__name__,
        root_ref=cast(PdfObject, root).indirect_reference,
    )
    self._validated_root = trailer_root()
    return self._validated_root

268 

@property
def _info(self) -> Optional[DictionaryObject]:
    """
    Provide access to "/Info". Standardized with PdfWriter.

    Returns:
        /Info Dictionary; None if the entry does not exist

    Raises:
        PdfReadError: The entry does not resolve to a dictionary.

    """
    entry = self.trailer.get(TK.INFO, None)
    if is_null_or_none(entry):
        return None
    assert entry is not None, "mypy"
    resolved = entry.get_object()
    if isinstance(resolved, DictionaryObject):
        return resolved
    raise PdfReadError(
        "Trailer not found or does not point to a document information dictionary"
    )

288 

@property
def _ID(self) -> Optional[ArrayObject]:
    """
    Provide access to "/ID". Standardized with PdfWriter.

    Returns:
        /ID array; None if the entry does not exist

    """
    id_entry = self.trailer.get(TK.ID, None)
    if is_null_or_none(id_entry):
        return None
    assert id_entry is not None, "mypy"
    resolved = id_entry.get_object()
    return cast(ArrayObject, resolved)

303 

@property
def pdf_header(self) -> str:
    """
    The first 8 bytes of the file.

    This is typically something like ``'%PDF-1.6'`` and can be used to
    detect if the file is actually a PDF file and which version it is.
    The stream position is restored after reading.
    """
    # TODO: Make this return a bytes object for consistency
    #       but that needs a deprecation
    stream = self.stream
    saved_position = stream.tell()
    stream.seek(0, 0)
    # Undecodable bytes are rendered as backslash escapes.
    header = stream.read(8).decode("utf-8", "backslashreplace")
    stream.seek(saved_position, 0)  # return to where it was
    return header

319 

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
    """XMP (Extensible Metadata Platform) data."""
    # Decryption is bypassed while the metadata is fetched and the flag is
    # always reset to False afterwards, even on error.
    self._override_encryption = True
    try:
        metadata = self.root_object.xmp_metadata
        return cast("XmpInformation", metadata)
    finally:
        self._override_encryption = False

328 

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Retrieve the page number from an indirect reference.

    The lookup table from object number to page index is built from
    ``self.pages`` on first use and kept in ``self._page_id2num``.

    Args:
        indirect_reference: The indirect reference (or bare object number)
            to locate.

    Returns:
        Page number or None.

    """
    if self._page_id2num is None:
        lookup: dict[Any, Any] = {}
        for page_index, page in enumerate(self.pages):
            lookup[page.indirect_reference.idnum] = page_index  # type: ignore[union-attr]
        self._page_id2num = lookup

    if is_null_or_none(indirect_reference):
        return None
    assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"
    idnum = (
        indirect_reference
        if isinstance(indirect_reference, int)
        else indirect_reference.idnum
    )
    assert self._page_id2num is not None, "hint for mypy"
    return self._page_id2num.get(idnum, None)

356 

def _get_object_from_stream(
    self, indirect_reference: IndirectObject
) -> Union[int, PdfObject, str]:
    """
    Resolve an object that is stored inside an object stream ("/ObjStm").

    The containing stream is looked up in ``self.xref_objStm`` and decoded,
    then every object listed in its index is parsed. Objects for which this
    stream is the registered source are stored via ``cache_indirect_object``.

    Args:
        indirect_reference: Reference to the wanted object; its ``idnum``
            must be a key of ``self.xref_objStm``.

    Returns:
        The requested object, or a ``NullObject`` when the stream's index
        does not list it (non-strict mode only).

    Raises:
        LimitReachedError: Strict mode, "/N" is larger than the stream data
            could possibly hold.
        PdfReadError: Strict mode, an object cannot be read or the requested
            object is not found in the stream.

    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, _idx = self.xref_objStm[indirect_reference.idnum]
    obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object()  # type: ignore[assignment]
    # This is an xref to a stream, so its type better be a stream
    assert cast(str, obj_stm["/Type"]) == "/ObjStm"
    # Parse ALL objects in this stream in one pass and cache them.
    # This avoids O(N²) behavior when many objects from the same stream
    # are resolved individually (each call would re-parse the header).
    stream_data = BytesIO(obj_stm.get_data())
    n = int(obj_stm["/N"])  # type: ignore[call-overload]
    first_offset = int(obj_stm["/First"])  # type: ignore[call-overload]

    # ObjStm header format: "objnum offset objnum offset ..."
    # smallest possible entry: "0 0" = 3 bytes (1 digit + 1 space + 1 digit)
    # using // 4 would reject a valid 3-byte single entry (3 // 4 = 0)
    # max_n therefore bounds the number of index entries by the data size.
    max_n = stream_data.getbuffer().nbytes // 3
    stream_data.seek(0)
    if n > max_n:
        if self.strict:
            raise LimitReachedError(f"Value /N {n} for object {stmnum} exceeds maximum allowed value {max_n}.")
        logger_warning(
            "Value /N %(n)d for object %(stmnum)d exceeds maximum allowed value %(max_n)d. Limiting to %(max_n)d.",
            source=__name__,
            n=n,
            stmnum=stmnum,
            max_n=max_n,
        )
        n = max_n

    # Phase 1: Read the index (objnum, offset) pairs from the header.
    # Before each number the stream is advanced to the next non-whitespace
    # byte and then stepped back by one so the number parser sees it.
    obj_index: list[tuple[int, int]] = []
    for _i in range(n):
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        objnum = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        offset = NumberObject.read_from_stream(stream_data)
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)
        obj_index.append((int(objnum), int(offset)))

    # Phase 2: Parse each object and cache it.
    # target_obj stays a NullObject if the requested id is never met.
    target_obj: Union[int, PdfObject, str] = NullObject()
    found = False
    for i, (obj_num, obj_offset) in enumerate(obj_index):
        # Skip objects already in the cache.
        cached = self.cache_get_indirect_object(0, obj_num)
        if cached is not None:
            if obj_num == indirect_reference.idnum:
                target_obj = cached
                found = True
            continue

        # Index offsets are relative to the "/First" offset of the stream.
        stream_data.seek(first_offset + obj_offset, 0)

        # To cope with case where the 'pointer' is on a white space
        read_non_whitespace(stream_data)
        stream_data.seek(-1, 1)

        try:
            obj = read_object(stream_data, self)
        except PdfStreamError as exc:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            logger_warning(
                "Invalid stream (index %(index)d) within object %(obj_num)d 0: %(exc)s",
                source=__name__,
                index=i,
                obj_num=obj_num,
                exc=exc,
            )
            if self.strict:  # pragma: no cover
                raise PdfReadError(
                    f"Cannot read object stream: {exc}"
                )  # pragma: no cover
            # Non-strict: the unreadable object is replaced by a null object.
            obj = NullObject()  # pragma: no cover

        # Only cache if this stream is the authoritative source for the object.
        # Incremental updates may override objects originally in the stream;
        # caching those stale versions would shadow the newer xref entry.
        authoritative_stm, _idx = self.xref_objStm.get(obj_num, (None, None))
        if authoritative_stm == stmnum:
            self.cache_indirect_object(0, obj_num, obj)  # type: ignore[arg-type]

        if obj_num == indirect_reference.idnum:
            target_obj = obj
            found = True

    if not found and self.strict:  # pragma: no cover
        raise PdfReadError(
            "This is a fatal error in strict mode."
        )  # pragma: no cover
    return target_obj

455 

def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Resolve an indirect reference to the object it points to.

    Lookup order: the object cache, an object stream registered in
    ``self.xref_objStm``, the offset recorded in ``self.xref`` (repaired by
    searching the file when the header at that offset does not match), and
    finally a search of the whole file for ``"<id> <gen> obj"``.
    Results are cached, except for object-stream objects, whose caching is
    left to ``_get_object_from_stream``.

    Args:
        indirect_reference: The reference to resolve; a plain ``int`` is
            treated as object number with generation 0.

    Returns:
        The resolved object, a ``NullObject`` for an entry marked free, or
        ``None`` when the object cannot be found in non-strict mode.

    Raises:
        PdfReadError: Strict mode, the object header does not match the
            reference or the object cannot be found.
        LimitReachedError: The object is already being read, i.e. it
            (indirectly) refers to itself.
        FileNotDecryptedError: The document is encrypted and has not been
            decrypted yet.

    """
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    retval = self.cache_get_indirect_object(
        indirect_reference.generation, indirect_reference.idnum
    )
    if retval is not None:
        return retval
    if (
        indirect_reference.generation == 0
        and indirect_reference.idnum in self.xref_objStm
    ):
        retval = self._get_object_from_stream(indirect_reference)  # type: ignore
    elif (
        indirect_reference.generation in self.xref
        and indirect_reference.idnum in self.xref[indirect_reference.generation]
    ):
        if self.xref_free_entry.get(indirect_reference.generation, {}).get(
            indirect_reference.idnum, False
        ):
            return NullObject()
        start = self.xref[indirect_reference.generation][indirect_reference.idnum]
        self.stream.seek(start, 0)
        try:
            idnum, generation = self.read_object_header(self.stream)
            if (
                idnum != indirect_reference.idnum
                or generation != indirect_reference.generation
            ):
                raise PdfReadError("Not matching, we parse the file for it")
        except Exception:
            # The xref offset is unusable: search the raw file content for
            # the object header instead.
            if hasattr(self.stream, "getbuffer"):
                buf = bytes(self.stream.getbuffer())
            else:
                p = self.stream.tell()
                self.stream.seek(0, 0)
                buf = self.stream.read(-1)
                self.stream.seek(p, 0)
            m = re.search(
                rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
                buf,
            )
            if m is not None:
                logger_warning(
                    "Object ID %(idnum)d,%(generation)d ref repaired",
                    source=__name__,
                    idnum=indirect_reference.idnum,
                    generation=indirect_reference.generation,
                )
                # +1 skips the whitespace byte matched by the leading ``\s``.
                self.xref[indirect_reference.generation][
                    indirect_reference.idnum
                ] = (m.start(0) + 1)
                self.stream.seek(m.start(0) + 1)
                idnum, generation = self.read_object_header(self.stream)
            else:
                idnum = -1
                generation = -1  # exception will be raised below
        if idnum != indirect_reference.idnum and self.xref_index:
            # xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise PdfReadError(
                    f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                    f"does not match actual ({idnum} {generation}); "
                    "xref table not zero-indexed."
                )
            # xref table is corrected in non-strict mode
        elif idnum != indirect_reference.idnum and self.strict:
            # some other problem
            raise PdfReadError(
                f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                f"does not match actual ({idnum} {generation})."
            )
        if self.strict:
            assert generation == indirect_reference.generation

        current_object = (indirect_reference.idnum, indirect_reference.generation)
        if current_object in self._known_objects:
            raise LimitReachedError(f"Detected loop with self reference for {indirect_reference!r}.")
        self._known_objects.add(current_object)
        try:
            retval = read_object(self.stream, self)  # type: ignore[assignment]
        finally:
            # Always clear the in-progress marker. If it were left behind
            # after a failed read, a later lookup of the same object would
            # be reported as a self-reference loop.
            self._known_objects.discard(current_object)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self._encryption is not None:
            # if we don't have the encryption key:
            if not self._encryption.is_decrypted():
                raise FileNotDecryptedError("File has not been decrypted")
            # otherwise, decrypt here...
            retval = cast(PdfObject, retval)
            retval = self._encryption.decrypt_object(
                retval, indirect_reference.idnum, indirect_reference.generation,
                strict=self.strict,
            )
    else:
        # No xref information at all: search the raw file content.
        if hasattr(self.stream, "getbuffer"):
            buf = bytes(self.stream.getbuffer())
        else:
            p = self.stream.tell()
            self.stream.seek(0, 0)
            buf = self.stream.read(-1)
            self.stream.seek(p, 0)
        m = re.search(
            rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
            buf,
        )
        if m is not None:
            logger_warning(
                "Object %(idnum)d %(generation)d found",
                source=__name__,
                idnum=indirect_reference.idnum,
                generation=indirect_reference.generation,
            )
            if indirect_reference.generation not in self.xref:
                self.xref[indirect_reference.generation] = {}
            self.xref[indirect_reference.generation][indirect_reference.idnum] = (
                m.start(0) + 1
            )
            self.stream.seek(m.end(0) + 1)
            skip_over_whitespace(self.stream)
            self.stream.seek(-1, 1)
            retval = read_object(self.stream, self)  # type: ignore[assignment]

            # override encryption is used for the /Encrypt dictionary
            if not self._override_encryption and self._encryption is not None:
                # if we don't have the encryption key:
                if not self._encryption.is_decrypted():
                    raise FileNotDecryptedError("File has not been decrypted")
                # otherwise, decrypt here...
                retval = cast(PdfObject, retval)
                retval = self._encryption.decrypt_object(
                    retval, indirect_reference.idnum, indirect_reference.generation,
                    strict=self.strict,
                )
        else:
            logger_warning(
                "Object %(idnum)d %(generation)d not defined.",
                source=__name__,
                idnum=indirect_reference.idnum,
                generation=indirect_reference.generation,
            )
            if self.strict:
                raise PdfReadError("Could not find object.")
    # For ObjStm objects, _get_object_from_stream already cached
    # the result during batch parsing; skip the redundant cache write
    # to avoid "Overwriting cache" warnings. For non-ObjStm objects
    # (including encrypted ones that need decrypted values cached),
    # always write.
    if not (
        indirect_reference.generation == 0
        and indirect_reference.idnum in self.xref_objStm
    ):
        self.cache_indirect_object(
            indirect_reference.generation, indirect_reference.idnum, retval
        )
    return retval

613 

def read_object_header(self, stream: StreamType) -> tuple[int, int]:
    """
    Read an object header (``<idnum> <generation> obj``) from ``stream``.

    Args:
        stream: Stream positioned at, or slightly before, the header.

    Returns:
        The object number and the generation number.

    Raises:
        ValueError: Either number cannot be converted with ``int()``.

    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality some files have stupid cross-reference
    # tables that are off by whitespace bytes.
    skip_over_comment(stream)
    extra = skip_over_whitespace(stream)
    stream.seek(-1, 1)

    # Read the two numeric fields; each is followed by whitespace.
    fields = []
    for _ in range(2):
        fields.append(read_until_whitespace(stream))
        extra |= skip_over_whitespace(stream)
        stream.seek(-1, 1)
    idnum, generation = fields

    # although it's not used, it might still be necessary to read
    _obj = stream.read(3)

    read_non_whitespace(stream)
    stream.seek(-1, 1)
    if self.strict and extra:
        logger_warning(
            "Superfluous whitespace found in object header %(idnum)r %(generation)r",
            source=__name__,
            idnum=idnum,
            generation=generation,
        )
    return int(idnum), int(generation)

642 

def cache_get_indirect_object(
    self, generation: int, idnum: int
) -> Optional[PdfObject]:
    """
    Look up an already parsed object in ``self.resolved_objects``.

    Returns:
        The cached object, or ``None`` when nothing is stored for the key.

    Raises:
        PdfReadError: The lookup hit Python's recursion limit.

    """
    key = (generation, idnum)
    try:
        cached = self.resolved_objects.get(key)
    except RecursionError:
        raise PdfReadError("Maximum recursion depth reached.")
    return cached

650 

def cache_indirect_object(
    self, generation: int, idnum: int, obj: Optional[PdfObject]
) -> Optional[PdfObject]:
    """
    Store ``obj`` in ``self.resolved_objects`` under ``(generation, idnum)``.

    An existing entry is overwritten with a warning; in strict mode
    overwriting raises instead. A non-``None`` object gets its
    ``indirect_reference`` set to the matching reference on this reader.

    Returns:
        The object that was passed in.

    Raises:
        PdfReadError: Strict mode and the key is already cached.

    """
    key = (generation, idnum)
    if key in self.resolved_objects:
        msg = "Overwriting cache for %(generation)d %(idnum)d"
        values = {"generation": generation, "idnum": idnum}
        if self.strict:
            raise PdfReadError(msg % values)
        logger_warning(msg, source=__name__, **values)
    self.resolved_objects[key] = obj
    if obj is None:
        return obj
    obj.indirect_reference = IndirectObject(idnum, generation, self)
    return obj

664 

665 def _replace_object(self, indirect_reference: IndirectObject, obj: PdfObject) -> PdfObject: 

666 # function reserved for future development 

667 if indirect_reference.pdf != self: 

668 raise ValueError("Cannot update PdfReader with external object") 

669 if (indirect_reference.generation, indirect_reference.idnum) not in self.resolved_objects: 

670 raise ValueError("Cannot find referenced object") 

671 self.resolved_objects[(indirect_reference.generation, indirect_reference.idnum)] = obj 

672 obj.indirect_reference = indirect_reference 

673 return obj 

674 

def read(self, stream: StreamType) -> None:
    """
    Read and process the PDF stream, extracting necessary data.

    Validates the header, locates the end-of-file marker and the
    ``startxref`` offset, then reads the cross-reference tables and
    trailers. In non-strict mode the resulting ``self.xref`` is afterwards
    checked against the object headers found in the file.

    Args:
        stream: The PDF file stream.

    Raises:
        PdfReadError: Strict mode and the xref table is reported as broken.

    """
    self._basic_validation(stream)
    self._find_eof_marker(stream)
    startxref = self._find_startxref_pos(stream)
    self._startxref = startxref

    # check and eventually correct the startxref only if not strict
    xref_issue_nr = self._get_xref_issues(stream, startxref)
    if xref_issue_nr != 0:
        if self.strict and xref_issue_nr:
            raise PdfReadError("Broken xref table")
        logger_warning(
            "incorrect startxref pointer(%(xref_issue_nr)d)",
            source=__name__,
            xref_issue_nr=xref_issue_nr,
        )

    # read all cross-reference tables and their trailers
    self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr)

    # if not zero-indexed, verify that the table is correct; change it if necessary
    if self.xref_index and not self.strict:
        saved_position = stream.tell()
        for generation, entries in self.xref.items():
            if generation == 65535:
                continue
            # ensure ascending to prevent damage
            for obj_id in sorted(entries):
                stream.seek(entries[obj_id], 0)
                try:
                    header_id, _header_gen = self.read_object_header(stream)
                except ValueError:
                    self._rebuild_xref_table(stream)
                    break
                if header_id == obj_id - self.xref_index:
                    # fixing index item per item is required for revised PDF.
                    self.xref[generation][header_id] = self.xref[generation][obj_id]
                    del self.xref[generation][obj_id]
                # if not, then either it's just plain wrong, or the
                # non-zero-index is actually correct
        stream.seek(saved_position, 0)  # return to where it was

    # remove wrong objects (not pointing to correct structures) - cf #2326
    if not self.strict:
        saved_position = stream.tell()
        for generation, entries in self.xref.items():
            if generation == 65535:
                continue
            # Iterate over a snapshot: entries are deleted while looping.
            for obj_id in list(entries):
                stream.seek(entries[obj_id], 0)
                try:
                    self.read_object_header(stream)
                except ValueError:
                    logger_warning(
                        "Ignoring wrong pointing object %(id)d %(gen)d (offset %(offset)d)",
                        source=__name__,
                        id=obj_id,
                        gen=generation,
                        offset=entries[obj_id],
                    )
                    del entries[obj_id]  # we can delete the id, we are parsing ids
        stream.seek(saved_position, 0)  # return to where it was

747 

748 def _basic_validation(self, stream: StreamType) -> None: 

749 """Ensure the stream is valid and not empty.""" 

750 stream.seek(0, os.SEEK_SET) 

751 try: 

752 header_byte = stream.read(5) 

753 except UnicodeDecodeError: 

754 raise UnsupportedOperation("cannot read header") 

755 if header_byte == b"": 

756 raise EmptyFileError("Cannot read an empty file") 

757 if header_byte != b"%PDF-": 

758 if self.strict: 

759 raise PdfReadError( 

760 f"PDF starts with '{header_byte.decode('utf8')}', " 

761 "but '%PDF-' expected" 

762 ) 

763 logger_warning("invalid pdf header: %(header_byte)r", source=__name__, header_byte=header_byte) 

764 stream.seek(0, os.SEEK_END) 

765 

def _find_eof_marker(self, stream: StreamType) -> None:
    """
    Jump to the %%EOF marker.

    The stream is walked backwards line by line until a line starting with
    ``%%EOF`` is found. According to the specs, the marker should be at the
    very end of the file. If the first non-empty line read looks like a
    cut-off marker, the file is treated as truncated and the search stops.

    Raises:
        PdfReadError: Strict mode and the search reached the file header.

    """
    HEADER_SIZE = 8  # to parse whole file, Header is e.g. '%PDF-1.6'
    truncated_markers = (b"%%EO", b"%%E", b"%%", b"%")
    line = b""
    awaiting_first_line = True
    while True:
        if line.startswith(b"%%EOF"):
            break
        if awaiting_first_line and line != b"":
            if line.strip().endswith(truncated_markers):
                # Consider the file as truncated while
                # having enough confidence to carry on.
                logger_warning("EOF marker seems truncated", source=__name__)
                break
            awaiting_first_line = False
            if b"startxref" in line:
                logger_warning(
                    "CAUTION: startxref found while searching for %%EOF. "
                    "The file might be truncated and some data might not be read.",
                    source=__name__,
                )
        if stream.tell() < HEADER_SIZE:
            if self.strict:
                raise PdfReadError("EOF marker not found")
            logger_warning("EOF marker not found", source=__name__)
        line = read_previous_line(stream)

798 

def _find_startxref_pos(self, stream: StreamType) -> int:
    """
    Find startxref entry - the location of the xref table.

    The offset is expected on the line before the current stream position,
    preceded by a line starting with ``startxref``. Keyword and offset on
    the same line are accepted with a warning.

    Args:
        stream: The PDF file stream, read backwards from its current position.

    Returns:
        The bytes offset

    Raises:
        PdfReadError: No ``startxref`` keyword was found where expected.

    """
    keyword = b"startxref"
    line = read_previous_line(stream)
    try:
        offset = int(line)
    except ValueError:
        # 'startxref' may be on the same line as the location
        if not line.startswith(keyword):
            raise PdfReadError("startxref not found")
        offset = int(line[len(keyword):].strip())
        logger_warning("startxref on same line as offset", source=__name__)
    else:
        # The offset stood alone; the keyword must be on the line before it.
        line = read_previous_line(stream)
        if not line.startswith(keyword):
            raise PdfReadError("startxref not found")
    return offset

824 

def _read_standard_xref_table(self, stream: StreamType) -> None:
    """
    Read a standard (non-stream) cross-reference table.

    The stream has to be positioned directly after the leading ``x`` of the
    ``xref`` keyword. All subsections are read until the ``trailer`` keyword
    has been consumed. In-use entries are stored in ``self.xref`` and the
    free/in-use state is tracked in ``self.xref_free_entry``. Entries which
    are already known are never overwritten.

    Args:
        stream: The PDF stream to read from.

    Raises:
        PdfReadError: If the remaining ``ref`` of the keyword is missing or
            if the table ends before all announced entries have been read.

    """
    # standard cross-reference table
    ref = stream.read(3)
    if ref != b"ref":
        raise PdfReadError("xref table read error")
    read_non_whitespace(stream)
    stream.seek(-1, 1)
    first_time = True  # check if the first time looking at the xref table
    while True:
        # Subsection header: first object number ...
        num = cast(int, read_object(stream, self))
        if first_time and num != 0:
            self.xref_index = num
            if self.strict:
                logger_warning(
                    "Xref table not zero-indexed. ID numbers for objects will be corrected.",
                    source=__name__,
                )
                # if table not zero indexed, could be due to error from when PDF was created
                # which will lead to mismatched indices later on, only warned and corrected if self.strict==True
        first_time = False
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        # ... followed by the number of entries in this subsection.
        size = cast(int, read_object(stream, self))
        if not isinstance(size, int):
            logger_warning(
                "Invalid/Truncated xref table. Rebuilding it.",
                source=__name__,
            )
            self._rebuild_xref_table(stream)
            stream.read()
            return
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        cnt = 0
        while cnt < size:
            line = stream.read(20)
            if not line:
                raise PdfReadError("Unexpected empty line in Xref table.")

            # It's very clear in section 3.4.3 of the PDF spec
            # that all cross-reference table lines are a fixed
            # 20 bytes (as of PDF 1.7). However, some files have
            # 21-byte entries (or more) due to the use of \r\n
            # (CRLF) EOL's. Detect that case, and adjust the line
            # until it does not begin with a \r (CR) or \n (LF).
            while line[0] in b"\x0D\x0A":
                stream.seek(-20 + 1, 1)
                line = stream.read(20)

            # On the other hand, some malformed PDF files
            # use a single character EOL without a preceding
            # space. Detect that case, and seek the stream
            # back one character (0-9 means we've bled into
            # the next xref entry, t means we've bled into the
            # text "trailer"):
            if line[-1] in b"0123456789t":
                stream.seek(-1, 1)

            # Slicing cannot fail, so bind the entry type before the fallible
            # parsing below. This guarantees a value belonging to the current
            # line even if splitting the line raises.
            entry_type_b = line[17:18]
            try:
                offset_b, generation_b = line[:16].split(b" ")

                offset, generation = int(offset_b), int(generation_b)
            except Exception:
                # The entry is unreadable: look for the object in the whole file.
                if hasattr(stream, "getbuffer"):
                    buf = bytes(stream.getbuffer())
                else:
                    p = stream.tell()
                    stream.seek(0, 0)
                    buf = stream.read(-1)
                    stream.seek(p)

                # The negative lookbehind avoids matching the tail of a
                # larger object number, e.g. "12 0 obj" when looking for 2.
                f = re.search(rf"(?<!\d){num}\s+(\d+)\s+obj".encode(), buf)
                if f is None:
                    logger_warning(
                        "entry %(num)d in Xref table invalid; object not found",
                        source=__name__,
                        num=num,
                    )
                    generation = 65535
                    offset = -1
                    entry_type_b = b"f"
                else:
                    logger_warning(
                        "entry %(num)d in Xref table invalid but object found",
                        source=__name__,
                        num=num,
                    )
                    generation = int(f.group(1))
                    offset = f.start()

            if generation not in self.xref:
                self.xref[generation] = {}
                self.xref_free_entry[generation] = {}
            if num in self.xref[generation]:
                # It really seems like we should allow the last
                # xref table in the file to override previous
                # ones. Since we read the file backwards, assume
                # any existing key is already set correctly.
                pass
            else:
                if entry_type_b == b"n":
                    self.xref[generation][num] = offset
                # Best effort bookkeeping: the target dictionaries might not exist.
                try:
                    self.xref_free_entry[generation][num] = entry_type_b == b"f"
                except Exception:
                    pass
                try:
                    self.xref_free_entry[65535][num] = entry_type_b == b"f"
                except Exception:
                    pass
            cnt += 1
            num += 1
        read_non_whitespace(stream)
        stream.seek(-1, 1)
        # Skip any PDF comments between xref entries and the trailer
        # keyword. Some PDF producers (e.g. Vectorizer.AI) insert
        # comments here which are legal per the PDF spec (§7.2.3).
        while stream.read(1) == b"%":
            stream.seek(-1, 1)
            skip_over_comment(stream)
            read_non_whitespace(stream)
            stream.seek(-1, 1)
        # Undo the one-byte look-ahead of the loop condition above.
        stream.seek(-1, 1)
        trailer_tag = stream.read(7)
        if trailer_tag != b"trailer":
            # more xrefs!
            stream.seek(-7, 1)
        else:
            break

955 

def _read_xref_tables_and_trailers(
    self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
    """
    Read the cross-reference tables and trailers in the PDF stream.

    Starting at ``startxref``, every cross-reference section is read and the
    chain of previous sections is followed until there is none left, a
    circular reference is detected or the table had to be rebuilt.

    Args:
        stream: The PDF stream to read from.
        startxref: Offset of the first cross-reference section to read.
        xref_issue_nr: Non-zero if an issue with ``startxref`` has been
            detected. In this case, a section not starting with ``x``
            triggers a rebuild of the table.

    Raises:
        PdfReadError: If a cross-reference stream cannot be read and no
            ``/Root`` has been found so far.

    """
    self.xref = {}
    self.xref_free_entry = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    visited_xref_offsets: set[int] = set()
    while startxref is not None:
        # Detect circular /Prev references in the xref chain
        if startxref in visited_xref_offsets:
            logger_warning(
                "Circular xref chain detected at offset %(startxref)d, stopping",
                source=__name__,
                startxref=startxref,
            )
            break
        visited_xref_offsets.add(startxref)
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x in b"\r\n":
            x = stream.read(1)
        if x == b"x":
            startxref = self._read_xref(stream)
        elif xref_issue_nr:
            try:
                self._rebuild_xref_table(stream)
                break
            except Exception:
                xref_issue_nr = 0
                # The same offset is deliberately examined again without the
                # rebuild shortcut. Forget it, otherwise the retry would be
                # mistaken for a circular xref chain and be aborted.
                visited_xref_offsets.discard(startxref)
        elif x.isdigit():
            try:
                xrefstream = self._read_pdf15_xref_stream(stream)
            except Exception as e:
                if TK.ROOT in self.trailer:
                    # A newer section already provided the catalog: keep going with it.
                    logger_warning(
                        "Previous trailer cannot be read: %(args)s",
                        source=__name__,
                        args=e.args,
                    )
                    break
                raise PdfReadError(f"Trailer cannot be read: {e!s}") from e
            self._process_xref_stream(xrefstream)
            if "/Prev" in xrefstream:
                startxref = cast(int, xrefstream["/Prev"])
            else:
                break
        else:
            startxref = self._read_xref_other_error(stream, startxref)

1007 

def _process_xref_stream(self, xrefstream: DictionaryObject) -> None:
    """
    Merge the trailer information of a cross-reference stream.

    Trailer keys which have not been seen before are copied unresolved into
    ``self.trailer``. If the stream points to an additional cross-reference
    stream through ``/XRefStm``, that one is read as well and the position of
    ``self.stream`` is restored afterwards.

    Args:
        xrefstream: The cross-reference stream dictionary.

    """
    for key in (TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE):
        if key not in xrefstream or key in self.trailer:
            # Missing here or already provided by a section read earlier.
            continue
        self.trailer[NameObject(key)] = xrefstream.raw_get(key)

    if "/XRefStm" not in xrefstream:
        return

    saved_position = self.stream.tell()
    additional_offset = cast(int, xrefstream["/XRefStm"])
    # One byte further, as the reading method steps back one byte first.
    self.stream.seek(additional_offset + 1, 0)
    self._read_pdf15_xref_stream(self.stream)
    self.stream.seek(saved_position, 0)

1019 

def _read_xref(self, stream: StreamType) -> Optional[int]:
    """
    Read a standard cross-reference table together with its trailer.

    New trailer keys are added to ``self.trailer``; existing ones are kept.
    An additional cross-reference stream referenced by ``/XRefStm`` is read
    on a best-effort basis.

    Args:
        stream: The PDF stream, positioned after the ``x`` of ``xref``.

    Returns:
        The value of ``/Prev`` of the trailer. ``None`` if the key is
        missing or if the stream ends right after the table.

    """
    self._read_standard_xref_table(stream)
    if not stream.read(1):
        # End of stream: there is no trailer dictionary to parse.
        return None
    stream.seek(-1, 1)
    read_non_whitespace(stream)
    stream.seek(-1, 1)

    new_trailer = cast(dict[str, Any], read_object(stream, self))
    for trailer_key, trailer_value in new_trailer.items():
        if trailer_key in self.trailer:
            continue
        self.trailer[trailer_key] = trailer_value

    if "/XRefStm" in new_trailer:
        saved_position = stream.tell()
        # One byte further, as the reading method steps back one byte first.
        stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0)
        try:
            self._read_pdf15_xref_stream(stream)
        except Exception:
            logger_warning(
                "XRef object at %(xref_stm)d can not be read, some object may be missing",
                source=__name__,
                xref_stm=int(new_trailer["/XRefStm"]),
            )
        stream.seek(saved_position, 0)

    return cast(int, new_trailer["/Prev"]) if "/Prev" in new_trailer else None

1046 

def _read_xref_other_error(
    self, stream: StreamType, startxref: int
) -> Optional[int]:
    """
    Recover from an offset which points to neither ``xref`` nor a digit.

    Args:
        stream: The PDF stream, positioned after the unexpected byte.
        startxref: The offset which was expected to start a section.

    Returns:
        A corrected offset to retry with, or ``None`` if there is no further
        section to read (``/Prev`` is 0 or the table has been rebuilt).

    Raises:
        PdfReadError: If ``/Prev`` is 0 in strict mode, if no section could
            be located or if rebuilding the table failed.

    """
    # some PDFs have /Prev=0 in the trailer, instead of no /Prev
    if startxref == 0:
        if self.strict:
            raise PdfReadError(
                "/Prev=0 in the trailer (try opening with strict=False)"
            )
        logger_warning(
            "/Prev=0 in the trailer - assuming there is no previous xref table",
            source=__name__,
        )
        return None

    # bad xref character at startxref. Let's see if we can find
    # the xref table nearby, as we've observed this error with an
    # off-by-one before.
    stream.seek(-11, 1)
    neighbourhood = stream.read(20)
    keyword_position = neighbourhood.find(b"xref")
    if keyword_position != -1:
        return startxref - (10 - keyword_position)

    # No explicit xref table, try finding a cross-reference stream.
    stream.seek(startxref, 0)
    for look in range(25):  # value extended to cope with more linearized files
        if stream.read(1).isdigit():
            # This is not a standard PDF, consider adding a warning
            return startxref + look

    # no xref table found at specified location
    if "/Root" not in self.trailer or self.strict:
        raise PdfReadError("Could not find xref table at specified location")

    # if Root has been already found, just raise warning
    logger_warning("Invalid parent xref., rebuild xref", source=__name__)
    try:
        self._rebuild_xref_table(stream)
    except Exception:
        raise PdfReadError("Cannot rebuild xref")
    return None

1087 

def _sanitize_pdf15_xref_stream_index_pairs(
    self, index_pairs: list[int], entry_sizes: list[int], xref_stream: ContentStream
) -> list[int]:
    """
    Limit the entry counts of ``/Index`` to what the stream data can hold.

    Args:
        index_pairs: Flat list of the format ``[start0, count0, start1, count1, ...]``.
        entry_sizes: Byte widths of the fields of one entry. At most the
            first three values are considered.
        xref_stream: The cross-reference stream providing the entry data.

    Returns:
        The index pairs as integers with unchanged start values and possibly
        reduced counts. An empty list if the entries have no width at all.

    Raises:
        PdfStreamError: In strict mode, if the entries have no width at all.
        LimitReachedError: In strict mode, if more entries are announced
            than could fit into the stream data.

    """
    # Smallest plausible size of one entry: the sum of (up to) three field widths.
    bytes_per_entry = sum(int(width) for width in entry_sizes[:3])
    if bytes_per_entry == 0:
        message = "Cross-reference stream encodes no entry data."
        if self.strict:
            raise PdfStreamError(message)
        logger_warning(message, source=__name__)
        return []

    # maximum number of entries that could physically fit
    capacity = len(xref_stream.get_data()) // bytes_per_entry + 1

    sanitized = []
    entries_so_far = 0
    for position, raw_value in enumerate(index_pairs):
        value = int(raw_value)

        if position % 2 == 0:
            # Start values are kept as they are; only counts get limited.
            sanitized.append(value)
            continue

        requested_total = entries_so_far + value
        if requested_total > capacity:
            if self.strict:
                raise LimitReachedError(
                    f"Total XRef entries {requested_total} exceed maximum allowed value {capacity}."
                )
            clamped = max(0, capacity - entries_so_far)
            logger_warning(
                "Clamping XRef count from %(old_count)d to %(new_count)d to fit stream size.",
                source=__name__,
                old_count=value,
                new_count=clamped,
            )
            value = clamped

        entries_so_far += value
        sanitized.append(value)

    return sanitized

1132 

def _read_pdf15_xref_stream(
    self, stream: StreamType
) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]:
    """
    Read the cross-reference stream for PDF 1.5+.

    The stream has to be positioned one byte after the start of the object
    header, as this method steps back one byte before reading. The entries
    are stored through :meth:`_read_xref_subsections`.

    Args:
        stream: The PDF stream to read from.

    Returns:
        The cross-reference stream object.

    Raises:
        PdfReadError: If the object is no ``/XRef`` stream, if ``/Size`` is
            missing, if ``/W`` is missing or has less than three values or,
            in strict mode, if ``/W`` has more than three values.

    """
    stream.seek(-1, 1)
    stream_idnum, stream_generation = self.read_object_header(stream)
    xref_stream = cast(ContentStream, read_object(stream, self))
    if cast(str, xref_stream["/Type"]) != "/XRef":
        raise PdfReadError(f"Unexpected type {xref_stream['/Type']!r}")
    self.cache_indirect_object(stream_generation, stream_idnum, xref_stream)

    # Index pairs specify the subsections in the dictionary.
    # If none, create one subsection that spans everything.
    if "/Size" not in xref_stream:
        # According to table 17 of the PDF 2.0 specification, this key is required.
        raise PdfReadError(f"Size missing from XRef stream {xref_stream!r}!")
    index_pairs = xref_stream.get("/Index", [0, xref_stream["/Size"]])

    entry_sizes = cast(list[int], xref_stream.get("/W"))
    # The widths come from the file, so validate them explicitly: an
    # ``assert`` would vanish when Python runs with optimizations enabled.
    if entry_sizes is None or len(entry_sizes) < 3:
        raise PdfReadError(f"Invalid entry sizes in XRef stream: {entry_sizes!r}")
    if self.strict and len(entry_sizes) > 3:
        raise PdfReadError(f"Too many entry sizes: {entry_sizes}")
    index_pairs = self._sanitize_pdf15_xref_stream_index_pairs(
        index_pairs=index_pairs, entry_sizes=entry_sizes, xref_stream=xref_stream
    )

    stream_data = BytesIO(xref_stream.get_data())

    def get_entry(i: int) -> Union[int, tuple[int, ...]]:
        # Reads the correct number of bytes for each entry. See the
        # discussion of the W parameter in PDF spec table 17.
        if entry_sizes[i] > 0:
            d = stream_data.read(entry_sizes[i])
            return convert_to_int(d, entry_sizes[i])

        # PDF Spec Table 17: A value of zero for an element in the
        # W array indicates...the default value shall be used
        if i == 0:
            return 1  # First value defaults to 1
        return 0

    def used_before(num: int, generation: Union[int, tuple[int, ...]]) -> bool:
        # We move backwards through the xrefs, don't replace any.
        return num in self.xref.get(generation, []) or num in self.xref_objStm  # type: ignore[arg-type]

    # Iterate through each subsection
    self._read_xref_subsections(index_pairs, get_entry, used_before)
    return xref_stream

1181 

@staticmethod
def _get_xref_issues(stream: StreamType, startxref: int) -> int:
    """
    Check whether ``startxref`` points to a cross-reference section.

    Args:
        stream: The PDF stream to inspect. Its position gets changed.
        startxref: The offset to verify.

    Returns:
        0 if no issue has been found. Otherwise:

        * 1: the byte in front of the offset is no whitespace.
        * 2: the stream ended while skipping digits and blanks.
        * 3: the token after the digits and blanks is not ``obj``.
        * 4: the offset is 0.

    """
    if startxref == 0:
        return 4

    stream.seek(startxref - 1, 0)  # -1 to check character before
    preceding = stream.read(1)
    if preceding == b"j":
        preceding = stream.read(1)
    if preceding not in b"\r\n \t":
        return 1

    if stream.read(4) == b"xref":
        return 0

    # not a xref so check if it is an XREF object
    token = b""
    while token in b"0123456789 \t":
        token = stream.read(1)
        if token == b"":
            return 2
    token += stream.read(2)  # 1 char already read, +2 to check "obj"
    return 3 if token.lower() != b"obj" else 0

1216 

@classmethod
def _find_pdf_objects(cls, data: bytes) -> Iterable[tuple[int, int, int]]:
    """
    Locate object headers of the form ``<number> <generation> obj``.

    The data is searched for `` obj``; the two integers in front of each
    occurrence are then read backwards.

    Args:
        data: The raw PDF data.

    Yields:
        Tuples of object number, generation number and the offset at which
        the object number starts.

    """
    marker = b" obj"
    digits = b"0123456789"

    def skip_backwards(position: int, accepted: bytes) -> int:
        # Move left while the byte at ``position`` is one of ``accepted``.
        while position >= 0 and data[position] in accepted:
            position -= 1
        return position

    search_from = 0
    while True:
        hit = data.find(marker, search_from)
        if hit == -1:
            return

        # Whitespace, generation number, whitespace, object number: right to left.
        cursor = skip_backwards(hit - 1, WHITESPACES_AS_BYTES)
        generation_end = cursor + 1
        cursor = skip_backwards(cursor, digits)
        generation_start = cursor + 1

        cursor = skip_backwards(cursor, WHITESPACES_AS_BYTES)
        object_end = cursor + 1
        cursor = skip_backwards(cursor, digits)
        object_start = cursor + 1

        # Both numbers need at least one digit.
        if object_start < object_end and generation_start < generation_end:
            yield (
                int(data[object_start:object_end]),
                int(data[generation_start:generation_end]),
                object_start,
            )

        search_from = hit + len(marker)

1257 

@classmethod
def _find_pdf_trailers(cls, data: bytes) -> Iterable[int]:
    """
    Locate the trailer dictionaries inside the raw PDF data.

    Args:
        data: The raw PDF data.

    Yields:
        The offset of ``<<`` for each ``trailer`` keyword which is followed,
        after optional whitespace, by the start of a dictionary.

    """
    keyword = b"trailer"
    keyword_length = len(keyword)
    data_length = len(data)

    position = data.find(keyword)
    while position != -1:
        candidate = position + keyword_length

        # Skip whitespace
        while candidate < data_length and data[candidate] in WHITESPACES_AS_BYTES:
            candidate += 1

        # Must be dictionary start
        if data.startswith(b"<<", candidate):
            yield candidate  # offset of '<<'

        position = data.find(keyword, position + keyword_length)

1278 

def _rebuild_xref_table(self, stream: StreamType) -> None:
    """
    Rebuild the cross-reference information by scanning the whole file.

    The steps are:

    1. ``self.xref`` is reset and filled with every object header found.
    2. Each of these objects is parsed; for those of type ``/ObjStm``, the
       leading integer pairs of the data are registered in
       ``self.xref_objStm``. Objects which cannot be parsed are skipped.
    3. All trailer dictionaries found are merged into ``self.trailer``,
       later ones replacing earlier ones.

    Args:
        stream: The PDF stream to scan.

    """
    self.xref = {}
    stream.seek(0, 0)
    raw_pdf = stream.read(-1)

    for obj_num, gen_num, start_offset in self._find_pdf_objects(raw_pdf):
        self.xref.setdefault(gen_num, {})[obj_num] = start_offset

    logger_warning("parsing for Object Streams", source=__name__)
    for gen_num in self.xref:
        for obj_num in self.xref[gen_num]:
            # get_object in manual
            stream.seek(self.xref[gen_num][obj_num], 0)
            try:
                self.read_object_header(stream)
                candidate = cast(StreamObject, read_object(stream, self))
                if candidate.get("/Type", "") != "/ObjStm":
                    continue
                header = BytesIO(candidate.get_data())
                found = 0
                while True:
                    token = read_until_whitespace(header)
                    if not token.isdigit():
                        break
                    member_number = int(token)
                    skip_over_whitespace(header)
                    header.seek(-1, 1)
                    token = read_until_whitespace(header)
                    if not token.isdigit():  # pragma: no cover
                        break  # pragma: no cover
                    # Second integer of the pair, stored next to the stream number.
                    self.xref_objStm[member_number] = (obj_num, int(token))
                    found += 1
                declared = cast(int, candidate["/N"])
                if found != declared:  # pragma: no cover
                    logger_warning(  # pragma: no cover
                        (
                            "found %(actual_count)d objects within "
                            "Object(%(object_number)d,%(generation_number)d) "
                            "whereas %(expected)d expected"
                        ),
                        source=__name__,
                        actual_count=found,
                        object_number=obj_num,
                        generation_number=gen_num,
                        expected=declared,
                    )
            except Exception:  # could be multiple causes
                pass

    stream.seek(0, 0)
    for trailer_offset in self._find_pdf_trailers(raw_pdf):
        stream.seek(trailer_offset, 0)
        new_trailer = cast(dict[Any, Any], read_object(stream, self))
        # Here, we are parsing the file from start to end, the new data have to erase the existing.
        for trailer_key, trailer_value in new_trailer.items():
            self.trailer[trailer_key] = trailer_value

1338 

1339 def _read_xref_subsections( 

1340 self, 

1341 idx_pairs: list[int], 

1342 get_entry: Callable[[int], Union[int, tuple[int, ...]]], 

1343 used_before: Callable[[int, Union[int, tuple[int, ...]]], bool], 

1344 ) -> None: 

1345 """Read and process the subsections of the xref.""" 

1346 for start, size in self._pairs(idx_pairs): 

1347 # The subsections must increase 

1348 for num in range(start, start + size): 

1349 # The first entry is the type 

1350 xref_type = get_entry(0) 

1351 # The rest of the elements depend on the xref_type 

1352 if xref_type == 0: 

1353 # linked list of free objects 

1354 next_free_object = get_entry(1) # noqa: F841 

1355 next_generation = get_entry(2) # noqa: F841 

1356 elif xref_type == 1: 

1357 # objects that are in use but are not compressed 

1358 byte_offset = get_entry(1) 

1359 generation = get_entry(2) 

1360 if generation not in self.xref: 

1361 self.xref[generation] = {} # type: ignore[index] 

1362 if not used_before(num, generation): 

1363 self.xref[generation][num] = byte_offset # type: ignore[index] 

1364 elif xref_type == 2: 

1365 # compressed objects 

1366 objstr_num = get_entry(1) 

1367 obstr_idx = get_entry(2) 

1368 generation = 0 # PDF spec table 18, generation is 0 

1369 if not used_before(num, generation): 

1370 self.xref_objStm[num] = (objstr_num, obstr_idx) 

1371 elif self.strict: 

1372 raise PdfReadError(f"Unknown xref type: {xref_type}") 

1373 

1374 def _pairs(self, array: list[int]) -> Iterable[tuple[int, int]]: 

1375 """Iterate over pairs in the array.""" 

1376 i = 0 

1377 while i + 1 < len(array): 

1378 yield array[i], array[i + 1] 

1379 i += 2 

1380 

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
    """
    Decrypt a file which is secured with the PDF Standard encryption handler.

    The given password is checked against both the user password and the
    owner password of the document. If one of them matches, the resulting
    decryption key is stored.

    Which of the two passwords matched is irrelevant for further usage:
    both yield the decryption key needed to work with the document in this
    library.

    Args:
        password: The password to match.

    Returns:
        An indicator if the document was decrypted and whether it was the
        owner password or the user password.

    Raises:
        PdfReadError: If the file is not encrypted.

    """
    if self._encryption:
        # TODO: raise Exception for wrong password
        return self._encryption.verify(password, strict=self.strict)
    raise PdfReadError("Not encrypted file")

1405 

@property
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only).

    The value is derived from the trailer only. Thus, it stays true after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    trailer = self.trailer
    return TK.ENCRYPT in trailer

1415 

def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Insert a new top level form field which groups all existing fields.

    The existing entries of ``/Fields`` become the ``/Kids`` of the new
    field and get it assigned as their ``/Parent``.

    Args:
        name: text string of the "/T" Attribute of the created object

    Returns:
        The created object. ``None`` means no object was created.

    """
    catalog = self.root_object

    has_form = "/AcroForm" in catalog and isinstance(
        catalog["/AcroForm"], DictionaryObject
    )
    if not has_form:
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        # TODO: No error but this may be extended for XFA Forms
        return None

    interim = DictionaryObject()
    interim[NameObject("/T")] = TextStringObject(name)
    interim[NameObject("/Kids")] = acroform[NameObject("/Fields")]
    # Register the new field one number above the highest resolved object
    # of generation 0.
    new_object_number = 1 + max(
        number for (generation, number) in self.resolved_objects if generation == 0
    )
    self.cache_indirect_object(0, new_object_number, interim)

    top_level_fields = ArrayObject()
    top_level_fields.append(interim.indirect_reference)
    acroform[NameObject("/Fields")] = top_level_fields

    for kid_reference in cast(ArrayObject, interim["/Kids"]):
        kid = kid_reference.get_object()
        if "/Parent" in kid:
            logger_warning(
                "Top Level Form Field %(obj_ref)s has a non-expected parent",
                source=__name__,
                obj_ref=kid.indirect_reference,
            )
        kid[NameObject("/Parent")] = interim.indirect_reference
    return interim

1459 

def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
    """
    Rename the top level form field that groups all form fields below it.

    The first entry of ``/Fields`` is considered to be this top level field.

    Args:
        name: text string of the "/T" field of the created object

    Returns:
        The modified object. ``None`` means no object was modified, which
        is the case if there is no form dictionary, no ``/Fields`` entry or
        if ``/Fields`` is empty.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not isinstance(
        catalog["/AcroForm"], DictionaryObject
    ):
        return None
    acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")])
    if "/Fields" not in acroform:
        return None

    fields = cast(ArrayObject, acroform[NameObject("/Fields")])
    if not fields:
        # Without any field, there is nothing to rename.
        return None

    interim = cast(DictionaryObject, fields[0].get_object())
    interim[NameObject("/T")] = TextStringObject(name)
    return interim

1487 

1488 def _repr_mimebundle_( 

1489 self, 

1490 include: Union[None, Iterable[str]] = None, 

1491 exclude: Union[None, Iterable[str]] = None, 

1492 ) -> dict[str, Any]: 

1493 """ 

1494 Integration into Jupyter Notebooks. 

1495 

1496 This method returns a dictionary that maps a mime-type to its 

1497 representation. 

1498 

1499 .. seealso:: 

1500 

1501 https://ipython.readthedocs.io/en/stable/config/integrating.html 

1502 """ 

1503 self.stream.seek(0) 

1504 pdf_data = self.stream.read() 

1505 data = { 

1506 "application/pdf": pdf_data, 

1507 } 

1508 

1509 if include is not None: 

1510 # Filter representations based on include list 

1511 data = {k: v for k, v in data.items() if k in include} 

1512 

1513 if exclude is not None: 

1514 # Remove representations based on exclude list 

1515 data = {k: v for k, v in data.items() if k not in exclude} 

1516 

1517 return data 

1518 

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations, caching the result of the plain call.

    Overrides the implementation of the parent class: if neither ``tree``
    nor ``retval`` is given, the result is computed once and reused
    afterwards. Otherwise, the call is forwarded unchanged.
    """
    if not (tree or retval):
        cached = self._named_destinations_cache
        if cached is None:
            cached = super()._get_named_destinations()
            self._named_destinations_cache = cached
        return cached

    return super()._get_named_destinations(tree, retval)

1532 return self._named_destinations_cache