Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

447 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27import binascii 

28import codecs 

29import hashlib 

30import re 

31import sys 

32from collections.abc import Sequence 

33from math import log10 

34from struct import iter_unpack 

35from typing import Any, Callable, ClassVar, Optional, Union, cast 

36 

37if sys.version_info[:2] >= (3, 10): 

38 from typing import TypeGuard 

39else: 

40 from typing_extensions import TypeGuard # PEP 647 

41 

42if sys.version_info >= (3, 11): 

43 from typing import Self 

44else: 

45 from typing_extensions import Self 

46 

47from .._codecs import _pdfdoc_encoding_rev 

48from .._protocols import PdfObjectProtocol, PdfWriterProtocol 

49from .._utils import ( 

50 StreamType, 

51 classproperty, 

52 deprecation_no_replacement, 

53 deprecation_with_replacement, 

54 logger_warning, 

55 read_non_whitespace, 

56 read_until_regex, 

57) 

58from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError 

59 

# Module-level authorship metadata (name and contact address of the author).
60__author__ = "Mathieu Fenniak" 

61__author_email__ = "biziqe@mathieu.fenniak.net" 

62 

63 

class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # Digest constructor applied by hash_value() to hash_value_data().
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Only declared here (no default value): _reference_clone() treats a
    # missing attribute as "this object was never referenced".
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        The base implementation always raises; the subclasses in this module
        override it.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base class.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the encoded string form of the object, used as digest input."""
        as_text = f"{self}"
        return as_text.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hexdigest>"`` computed with ``hash_func``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The default implementation simply delegates to :meth:`clone`.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base class.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: "PdfObject", pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> "PdfObject":
        """
        Register ``clone`` in ``pdf_dest._objects`` if ``self`` is referenced.

        Nothing is registered when ``self`` has no ``indirect_reference``
        attribute. When ``clone`` already carries a reference into
        ``pdf_dest``, or ``self`` was already translated into ``pdf_dest``,
        the existing object is returned unless ``force_duplicate`` is set.

        Args:
            clone: The freshly created copy of ``self``.
            pdf_dest: Writer receiving the copy.
            force_duplicate: When ``True``, skip both "already present"
                shortcuts and always register ``clone``.

        Returns:
            ``clone``, or the object previously registered for ``self``.

        """
        if not force_duplicate:
            # Best effort: any failure while inspecting the clone's reference
            # is treated as "not yet part of pdf_dest".
            try:
                already_in_dest = bool(
                    clone.indirect_reference is not None
                    and clone.indirect_reference.pdf == pdf_dest
                )
            except Exception:
                already_in_dest = False
            if already_in_dest:
                return clone

        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # In incremental mode an object coming from the writer's own reader
        # keeps its object number; everything else gets the next free one.
        keep_idnum = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        if keep_idnum:
            target_idnum = source_ref.idnum
        else:
            target_idnum = len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # Keep the source alive so that its id() stays unique.
                pdf_dest._id_translated[source_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            translation = pdf_dest._id_translated[source_key]
            if not force_duplicate and source_ref.idnum in translation:
                existing = pdf_dest.get_object(translation[source_ref.idnum])
                assert isinstance(existing, PdfObject), "mypy"
                return existing
            translation[source_ref.idnum] = target_idnum

        try:
            pdf_dest._objects[target_idnum - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            target_idnum = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(target_idnum, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError

208 

209 

class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and require them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are not ``b"null"``.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

256 

257 

class BooleanObject(PdfObject):
    """A PDF boolean (``true`` / ``false``) wrapping a Python value."""

    value: bool

    def __init__(self, value: Any) -> None:
        # The value is stored as given; __repr__ and write_to_stream only
        # look at its truthiness.
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        return cast(
            "BooleanObject",
            self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        """Compare by value with another BooleanObject or a plain ``bool``."""
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        return "True" if self.value else "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on the value."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read the keyword ``true`` or ``false`` from ``stream``.

        Four bytes are read first; for ``fals`` a fifth byte is consumed and
        must be ``e``. Previously any fifth byte (or end of stream) was
        silently accepted as ``false``.

        Raises:
            PdfReadError: If the bytes read spell neither keyword.

        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word == b"fals" and stream.read(1) == b"e":
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")

320 

321 

class IndirectObject(PdfObject):
    """
    Reference to an object, identified by number and generation, inside ``pdf``.

    Item access, iteration, ``in`` tests, numeric/string conversion and
    unknown attributes are forwarded to the object returned by
    ``pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone the referenced object into pdf_dest and return its new reference.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``False`` and this reference already points
                into ``pdf_dest``, ``self`` is returned unchanged.
            ignore_fields: Passed through to the referenced object's ``clone``.

        Returns:
            A reference to the object inside ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Keep the source alive so that its id() stays unique.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The target was cloned before: reuse the registered object.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that remembers where it came from
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert isinstance(dup, PdfObject), "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve this reference through ``self.pdf.get_object``."""
        obj: Optional[PdfObject] = self.pdf.get_object(self)
        return obj

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # ``pdf`` is shared with the copy rather than deep-copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a target that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        """
        Look up attributes missing on the reference in the pointed object.

        Raises:
            AttributeError: If neither the reference nor the pointed object
                provides ``name``.

        """
        # Resolving the target needs ``self.pdf``. If that attribute itself is
        # missing (instance created without __init__, e.g. while being copied
        # or unpickled), delegating would re-enter this method forever.
        if name == "pdf":
            raise AttributeError(name)
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        """Equal when number, generation and the *identical* ``pdf`` match."""
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the reference as ``<idnum> <generation> R``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Source positioned at the first byte of the object number.
            pdf: Owner stored on the returned reference.

        Raises:
            PdfStreamError: If the stream ends inside either number.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                # Extra whitespace before the generation number is skipped.
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

484 

485 

# Digit budget used by FloatObject.myrepr(): the number of decimals written is
# this value minus the decimal exponent of the number (at least one decimal).
# Original note: "shall be min 5 digits max, allow user adj".
FLOAT_WRITE_PRECISION = 8


class FloatObject(float, PdfObject):
    """A PDF real number; invalid input falls back to ``0.0`` with a warning."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Create the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Unused by this method.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would not
        # reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """Return fixed-point text with trailing zeros and a bare dot removed."""
        if self == 0:  # type: ignore[comparison-overlap]
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the text produced by :meth:`myrepr` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

546 

547 

class NumberObject(int, PdfObject):
    """A PDF integer; invalid input falls back to ``0`` with a warning."""

    # Matches the first byte that cannot belong to a number token.
    # NOTE: inside the character class ``+-.`` is a *range* (0x2B-0x2E), so a
    # comma is also treated as part of the token. Left unchanged on purpose.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """Create the object from anything ``int()`` accepts."""
        try:
            return int.__new__(cls, int(value))
        except (ValueError, OverflowError):
            # OverflowError covers infinite floats, which int() cannot convert.
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; no detour through the decimal text form.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token; a token containing ``.`` yields a FloatObject.

        Returns:
            FloatObject or NumberObject built from the bytes read.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)

598 

599 

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        raw = bytes(self)
        return hash((self.__class__, raw))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string enclosed in ``<`` and ``>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for piece in (b"<", binascii.hexlify(self), b">"):
            stream.write(piece)

    def __str__(self) -> str:
        """
        Decode with ``utf-16`` first, then each of ``NameObject.CHARSETS``.

        Raises:
            PdfReadError: If no candidate encoding can decode the bytes.

        """
        for candidate in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(candidate)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

657 

658 

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with encode_pdfdocencoding().
    autodetect_pdfdocencoding: bool
    # True when the text is to be handled as UTF-16.
    autodetect_utf16: bool
    # BOM to emit in front of UTF-16 output (b"" when none applies).
    utf16_bom: bytes
    # Bytes the object was built from, when it was built from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Build the text string from ``str`` or ``bytes``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16 (on a decode
        error a warning is logged and only the part before the error is
        kept); other bytes are decoded with ``charmap``.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # A fresh instance replaced the one initialised above, so every
            # flag has to be set again; leaving autodetect_pdfdocencoding
            # unset made clone() fail with AttributeError.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding flags."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the autodetect flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to serialize.

        The stored original bytes win; otherwise PDFDocEncoding is tried
        (unless the string is flagged as UTF-16) before falling back to
        UTF-16 with the recorded BOM.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        if not self.autodetect_utf16:
            try:
                return encode_pdfdocencoding(self)
            except UnicodeEncodeError:
                pass
        if self.utf16_bom == codecs.BOM_UTF16_LE:
            return codecs.BOM_UTF16_LE + self.encode("utf-16le")
        if self.utf16_bom == codecs.BOM_UTF16_BE:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        return self.encode("utf-16be")

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a literal enclosed in parentheses.

        ASCII alphanumerics and spaces are written verbatim; every other
        byte is written as a three-digit octal escape.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for code in bytearr:
            char = bytes((code,))
            if char.isalnum() or char == b" ":
                stream.write(char)
            else:
                stream.write(b"\\%03o" % code)
        stream.write(b")")

801 

802 

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (``/Name``) held as a Python string including the slash."""

    # A name token ends at whitespace or at one of these delimiter bytes.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that renumber() writes as ``#XX``: the delimiters, ``#``
    # itself and every code point from 0 to 32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped form produced by :meth:`renumber`."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Encode the name for output, escaping special characters as ``#XX``.

        The first character is emitted as is (a deprecation is reported when
        it is not ``/``). Characters above ``~`` are escaped byte by byte
        from their UTF-8 encoding; characters listed in ``renumber_table``
        use the table entry.
        """
        # Slicing instead of indexing: an empty name takes the "incorrect
        # first char" path instead of raising IndexError.
        out = self[:1].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        parts = [out]
        for c in self[1:]:
            if c > "~":
                parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
                continue
            escaped = self.renumber_table.get(c)
            parts.append(c.encode("utf-8") if escaped is None else escaped)
        return b"".join(parts)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated spelling of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#`` followed by exactly two hex digits with that byte.

        Any other ``#`` is copied unchanged. The digits are checked
        explicitly because ``int(..., 16)`` would also accept a single digit
        or a signed digit (``#A``, ``#+1``).
        """
        hex_digits = b"0123456789abcdefABCDEF"
        result = bytearray()
        i = 0
        while i < len(sin):
            pair = sin[i + 1 : i + 3]
            if (
                sin[i:i + 1] == b"#"
                and len(pair) == 2
                and pair[0] in hex_digits
                and pair[1] in hex_digits
            ):
                result.append(int(pair, 16))
                i += 3
            else:
                # Not a well-formed escape: keep the byte and carry on.
                result.append(sin[i])
                i += 1
        return bytes(result)

    # Encodings tried, in order, when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Args:
            stream: Source positioned at the leading ``/``.
            pdf: Owner; its ``strict`` flag decides whether an undecodable
                name raises or is decoded with ``charmap`` after a warning.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name
                cannot be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

929 

930 

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode text through the ``_pdfdoc_encoding_rev`` lookup table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    try:
        codes = [_pdfdoc_encoding_rev[char] for char in unicode_string]
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)

942 

943 

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` is, or resolves to, nothing.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference each call is a lookup.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

953 )