Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

439 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27import binascii 

28import codecs 

29import hashlib 

30import re 

31import sys 

32from binascii import unhexlify 

33from collections.abc import Sequence 

34from math import log10 

35from struct import iter_unpack 

36from typing import Any, Callable, ClassVar, Optional, Union, cast 

37 

38if sys.version_info[:2] >= (3, 10): 

39 from typing import TypeGuard 

40else: 

41 from typing_extensions import TypeGuard # PEP 647 

42 

43from .._codecs import _pdfdoc_encoding_rev 

44from .._protocols import PdfObjectProtocol, PdfWriterProtocol 

45from .._utils import ( 

46 StreamType, 

47 classproperty, 

48 deprecation_no_replacement, 

49 deprecation_with_replacement, 

50 logger_warning, 

51 read_non_whitespace, 

52 read_until_regex, 

53) 

54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError 

55 

# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

58 

59 

class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # Digest constructor used by ``hash_value``.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Subclasses are expected to override this; the base implementation
        always raises.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base class.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes that ``hash_value`` feeds into ``hash_func``."""
        rendered = f"{self}"
        return rendered.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).
        The base implementation always raises; subclasses provide the behaviour.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base class.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` when this object has an
        ``indirect_reference``.

        Nothing is registered when ``clone`` already points into ``pdf_dest``
        (unless ``force_duplicate``) or when this object has no
        ``indirect_reference`` attribute at all. When the source reference was
        already translated for ``pdf_dest`` and ``force_duplicate`` is false,
        the previously registered object is returned instead of ``clone``.

        Args:
            clone: The freshly created copy of this object.
            pdf_dest: Writer receiving the copy.
            force_duplicate: When true, skip the "already referenced" shortcuts.

        Returns:
            The clone, or the object already registered for the same source
            reference.

        """
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            # Best effort: the clone may have no usable indirect_reference yet.
            pass

        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        keep_source_id = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        if keep_source_id:
            target_id = source_ref.idnum
        else:
            target_id = len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # Hold a reference to the source so its id() stays unique.
                pdf_dest._id_translated[source_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            id_map = pdf_dest._id_translated[source_key]
            if not force_duplicate and source_ref.idnum in id_map:
                known = pdf_dest.get_object(id_map[source_ref.idnum])
                assert known is not None
                return known
            id_map[source_ref.idnum] = target_id

        try:
            pdf_dest._objects[target_id - 1] = clone
        except IndexError:
            # Slot does not exist yet: append and use the resulting position.
            pdf_dest._objects.append(clone)
            target_id = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(target_id, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError

200 

201 

class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = NullObject()
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """Consume four bytes from ``stream`` and require them to be ``null``."""
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # Every NullObject compares equal to every other NullObject.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

248 

249 

class BooleanObject(PdfObject):
    """A PDF boolean wrapping an arbitrary truthy/falsy ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            o = o.value
        elif not isinstance(o, bool):
            return False
        return self.value == o

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` depending on the truthiness of ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``true`` nor
                ``fals``.

        """
        token = stream.read(4)
        if token == b"true":
            return BooleanObject(True)
        if token != b"fals":
            raise PdfReadError("Could not read Boolean object")
        # Consume the fifth byte of "false"; its value is not checked.
        stream.read(1)
        return BooleanObject(False)

310 

311 

class IndirectObject(PdfObject):
    """
    Reference to another object, identified by object number, generation
    number and the owning ``pdf`` (reader or writer).

    Attribute, item, containment, iteration and numeric/string conversions
    are forwarded to the object returned by ``pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers but owned by ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Returns ``self`` when it already belongs to ``pdf_dest`` and no
        duplicate is forced. Otherwise the pointed object is cloned into
        ``pdf_dest`` (or looked up if its number was already translated) and
        the reference to that copy is returned.
        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Hold a reference to the source so its id() stays unique.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # Case observed: a pointed object can not be found; substitute a
            # NullObject that carries this reference.
            if obj is None:
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own indirect reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve this reference through the owning ``pdf``."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning pdf is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself an
        ``IndirectObject``.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Same numbers AND the very same owning pdf object (identity).
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the reference as ``<idnum> <generation> R``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Raises:
            PdfStreamError: If the stream ends while reading either number.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                # Skip any extra whitespace before the generation number.
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

473 

474 

# Precision budget consumed by ``FloatObject.myrepr`` when formatting floats;
# kept at module level so users can adjust it (original note: "shall be min 5
# digits max").
FLOAT_WRITE_PRECISION = 8

476 

477 

class FloatObject(float, PdfObject):
    """A PDF real number, stored as a Python ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Values that cannot be converted are logged and replaced by ``0.0``.
        ``context`` is accepted but not used.
        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would make the
        # result depend on object identity instead of the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Format the value in fixed-point notation.

        The number of decimals is ``FLOAT_WRITE_PRECISION`` minus the
        truncated base-10 exponent of the value (at least 1); trailing zeros
        and a trailing dot are stripped. Zero is rendered as ``"0.0"``.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

535 

536 

class NumberObject(int, PdfObject):
    """A PDF integer, stored as a Python ``int``."""

    # Matches the first byte that ends a numeric token.
    # NOTE(review): inside the character class ``+-.`` is a range, so ``,``
    # is also treated as part of a number - confirm this is intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """Build from anything ``int()`` accepts; a ``ValueError`` yields 0."""
        try:
            parsed = int(value)
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)
        return int.__new__(cls, parsed)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = NumberObject(self)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NumberObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.as_numeric())
        return hash(key)

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (round-tripped through its repr)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        digits = repr(self).encode("utf8")
        stream.write(digits)

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """Read a numeric token; one containing ``.`` becomes a ``FloatObject``."""
        token = read_until_regex(stream, NumberObject.NumberPattern)
        number_type = FloatObject if b"." in token else NumberObject
        return number_type(token)

587 

588 

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string enclosed in ``<`` and ``>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for part in (b"<", binascii.hexlify(self), b">"):
            stream.write(part)

    def __str__(self) -> str:
        # Try UTF-16 first, then every encoding listed in NameObject.CHARSETS.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

646 

647 

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # Class-level defaults guarantee the flags exist on every instance, even
    # one rebuilt inside ``__new__`` after BOM detection.
    autodetect_pdfdocencoding: bool = False
    autodetect_utf16: bool = False
    utf16_bom: bytes = b""
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the text string from ``bytes`` or ``str``.

        ``bytes`` starting with a UTF-16 BOM are decoded as UTF-16 (on a
        decoding error only the valid prefix is kept and a warning is
        logged); other ``bytes`` are decoded byte-for-byte. The instance
        records which encoding it should be written back with.
        """
        org = None
        if isinstance(value, bytes):
            org = value
            value = value.decode("charmap")
        o = str.__new__(cls, value)
        o._original_bytes = org
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        if o.startswith(("\xfe\xff", "\xff\xfe")):
            if org is None:
                # ``str`` input that merely starts with the characters
                # U+00FE/U+00FF: there are no raw bytes to re-decode. Mark it
                # for UTF-16BE output, because its PDFDocEncoding bytes would
                # themselves begin with a UTF-16 BOM.
                o.autodetect_utf16 = True
                o.utf16_bom = codecs.BOM_UTF16_BE
                return o
            try:
                o = str.__new__(cls, org.decode("utf-16"))
            except UnicodeDecodeError as exc:
                logger_warning(
                    f"{exc!s}\ninitial string:{exc.object!r}",
                    __name__,
                )
                o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
            # ``o`` is a new instance here: (re)set every flag explicitly.
            o._original_bytes = org
            o.autodetect_utf16 = True
            o.autodetect_pdfdocencoding = False
            o.utf16_bom = org[:2]
        else:
            try:
                encode_pdfdocencoding(o)
                o.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                o.autodetect_utf16 = True
                o.utf16_bom = codecs.BOM_UTF16_BE
        return o

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding metadata."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the recorded autodetection flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write: the original bytes when known, otherwise
        PDFDocEncoding when possible, otherwise UTF-16.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a literal string ``(...)``; every byte that is
        neither alphanumeric nor a space is written as a three-digit octal
        escape.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

787 

788 

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name such as ``/Type``, stored as a ``str`` including the ``/``."""

    # Bytes that terminate a name token when reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written as ``#XX`` escapes: the listed punctuation plus the
    # code points 0-32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with ``#XX`` escapes applied.

        The first character is emitted as is (a deprecation is signalled when
        it is not ``/``). Characters above ``~`` are written as the ``#XX``
        escapes of their UTF-8 bytes, characters in ``renumber_table`` as
        their table entry, everything else unchanged.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated misspelled alias of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape (two hexadecimal digits) in ``sin`` by
        the byte it denotes.

        A ``#`` that is not followed by valid hexadecimal digits is left
        untouched and scanning resumes at the next ``#``.
        """
        i = sin.find(b"#", 0)
        while i >= 0:
            try:
                sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
            except ValueError:
                # The characters after this '#' are not hexadecimal: change
                # nothing. The search below then moves on to the next '#'
                # instead of trying to decode arbitrary positions.
                pass
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried in order when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Raises:
            PdfReadError: If the token does not start with ``/``, or if it
                cannot be decoded with any of ``CHARSETS`` and ``pdf.strict``
                is set.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    # Best effort: fall through to the next candidate encoding.
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

910 

911 

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character is missing from the table.

    """
    try:
        codes = [_pdfdoc_encoding_rev[char] for char in unicode_string]
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)

923 

924 

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or a PdfObject whose ``get_object()`` yields
        None or a NullObject (which covers a NullObject itself and an
        indirect reference resolving to one); False otherwise.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for indirect references get_object() performs a lookup.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

934 )