Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

438 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27import binascii 

28import codecs 

29import hashlib 

30import re 

31import sys 

32from binascii import unhexlify 

33from collections.abc import Sequence 

34from math import log10 

35from struct import iter_unpack 

36from typing import Any, Callable, ClassVar, Optional, Union, cast 

37 

38if sys.version_info[:2] >= (3, 10): 

39 from typing import TypeGuard 

40else: 

41 from typing_extensions import TypeGuard # PEP 647 

42 

43from .._codecs import _pdfdoc_encoding_rev 

44from .._protocols import PdfObjectProtocol, PdfWriterProtocol 

45from .._utils import ( 

46 StreamType, 

47 classproperty, 

48 deprecation_no_replacement, 

49 deprecation_with_replacement, 

50 logger_warning, 

51 read_non_whitespace, 

52 read_until_regex, 

53) 

54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError 

55 

# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

58 

59 

class PdfObject(PdfObjectProtocol):
    """
    Base class of the PDF object types defined in this module.

    It provides the hashing helpers (``hash_value_data`` / ``hash_value``)
    and the ``_reference_clone`` bookkeeping that the subclasses' ``clone()``
    implementations call. ``hash_bin``, ``clone`` and ``write_to_stream``
    raise ``NotImplementedError`` here.
    """

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Assigned by _reference_clone() when the object is stored in a writer.
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the encoded ``str()`` form of the object (input of ``hash_value``)."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """
        Return ``b"<ClassName>:<hexdigest>"``.

        The digest is ``hash_func`` applied to ``hash_value_data()``.
        """
        return (
            f"{self.__class__.__name__}:"
            f"{self.hash_func(self.hash_value_data()).hexdigest()}"
        ).encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
          pdf_dest: Target to clone to.

        Returns:
          The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).


        Args:
          pdf_dest: Target to clone to.
          force_duplicate: By default, if the object has already been cloned and referenced,
            the copy will be returned; when ``True``, a new copy will be created.
            (Default value = ``False``)
          ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
            during cloning (applies to children duplication as well). If fields are to be
            considered for a limited number of levels, you have to add it as integer, for
            example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
            level only but ``"/TOTO"`` on all levels.

        Returns:
          The cloned PdfObject

        Raises:
          NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
          clone: The freshly created copy of ``self`` to register.
          pdf_dest: Writer whose ``_objects`` list and ``_id_translated``
            table are updated.
          force_duplicate: When ``True``, an existing translation for the
            source reference is not reused.

        Returns:
          The clone, or the object already registered in ``pdf_dest`` for
          the same source reference.

        """
        # The clone already belongs to pdf_dest and no new copy is requested.
        # Any failure of this probe (e.g. the clone has no usable
        # indirect_reference) is deliberately ignored.
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass
        # Without an indirect_reference attribute on the source object there
        # is nothing to register: hand the clone back untouched.
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        # Choose the object number: in incremental mode an object coming from
        # the writer's own reader keeps its number when it fits in _objects;
        # otherwise the next free number is used.
        if (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        ):
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1
        if ind is not None:
            # One translation table per source document, keyed by id().
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                # NOTE(review): judging by the key name, this reference keeps
                # the source document alive so its id() stays unique - confirm.
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore
            # Source object already copied: return the registered copy.
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        # Store the clone in slot i (1-based); if that slot does not exist,
        # append instead and use the resulting position as object number.
        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references (this base implementation returns ``self``)."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the object to ``stream``; not implemented in this base class."""
        raise NotImplementedError

200 

201 

class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = NullObject()
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value (a null object has no value,
            so only the class takes part).

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the ``null`` keyword to ``stream``."""
        uses_deprecated_key = encryption_key is not None
        if uses_deprecated_key:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Read the four bytes ``null`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are anything else.

        """
        keyword = stream.read(4)
        if keyword == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All null objects compare equal to each other and to nothing else.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

248 

249 

class BooleanObject(PdfObject):
    """The PDF ``true`` / ``false`` object; the payload is kept in ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        keyword = b"true" if self.value else b"false"
        stream.write(keyword)

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Four bytes are read first; for ``fals`` one more byte is consumed
        (its value is not checked).

        Raises:
            PdfReadError: If the first four bytes are neither ``true`` nor
                ``fals``.

        """
        head = stream.read(4)
        if head == b"true":
            return BooleanObject(True)
        if head != b"fals":
            raise PdfReadError("Could not read Boolean object")
        stream.read(1)  # consume the byte following b"fals"
        return BooleanObject(False)

310 

311 

class IndirectObject(PdfObject):
    """
    Reference to an object stored in a PDF (``<idnum> <generation> R``).

    Attribute access, item access, iteration and numeric/string conversion
    are forwarded to the object returned by ``pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        # Consistent with __eq__: same numbers and the very same pdf object.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        The referenced object is cloned into ``pdf_dest`` (unless it has been
        translated already) and the reference to that copy is returned.
        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The target was copied before: look up the existing copy.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Substitute a null object that remembers this reference.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf``."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The pdf is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a target that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        """
        Look up ``name`` on the pointed object.

        Python only calls this when normal attribute lookup failed.

        Raises:
            AttributeError: If neither this object nor its target has ``name``.

        """
        if name in ("idnum", "generation", "pdf"):
            # The instance's own fields are missing (e.g. an instance created
            # without __init__ by copy/pickle). Resolving the target needs
            # these fields, so delegating would recurse without end.
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # isinstance() already rejects None.
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Byte stream positioned at the object number.
            pdf: Object the new reference is bound to.

        Raises:
            PdfStreamError: If the stream ends inside one of the numbers.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        # Object number: everything up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: leading whitespace is skipped, then everything
        # up to the next whitespace byte.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

472 

473 

# Digit budget used by FloatObject.myrepr() when formatting floats: the number
# of decimals written is this value minus the integer part of log10(|x|), with
# a minimum of one. Module-level so that users can adjust it.
# NOTE(review): the original note read "shall be min 5 digits max" - confirm
# the intended lower bound.
FLOAT_WRITE_PRECISION = 8

475 

476 

class FloatObject(float, PdfObject):
    """A PDF real number; behaves like a ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Unused.

        Returns:
            The new object; ``0.0`` (with a logged warning) when ``value``
            cannot be converted.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method itself would
        # tie the result to this particular instance instead of its value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF streams.

        The number of decimals is ``FLOAT_WRITE_PRECISION`` minus the integer
        part of ``log10(abs(self))`` (at least one); trailing zeros and a
        trailing dot are stripped. Zero is written as ``"0.0"``.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

534 

535 

class NumberObject(int, PdfObject):
    """A PDF integer; behaves like an ``int``."""

    # First byte that cannot be part of a number token.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """
        Build the object from anything ``int()`` accepts.

        Returns:
            The new object; ``0`` (with a logged warning) when ``int(value)``
            raises ``ValueError``.

        """
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion: no text round trip, so very large values are not
        # subject to the interpreter's int <-> str digit limit.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains a dot, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)

586 

587 

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string (``<...>``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for part in (b"<", binascii.hexlify(self), b">"):
            stream.write(part)

    def __str__(self) -> str:
        # Try UTF-16 first, then every charset configured on NameObject;
        # the first successful decoding wins.
        for candidate in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(candidate)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

645 

646 

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text has to be written as UTF-16.
    autodetect_utf16: bool
    # BOM to emit when writing UTF-16 (b"" when none was detected).
    utf16_bom: bytes
    # Bytes the object was created from, if it was created from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the object from ``bytes`` or ``str``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16 (keeping only
        the decodable prefix, with a logged warning, if decoding fails);
        other bytes are decoded one byte per character. The encoding flags
        are then derived from the resulting text.
        """
        org = None
        if isinstance(value, bytes):
            org = value
            value = value.decode("charmap")
        o = str.__new__(cls, value)
        o._original_bytes = org
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        looks_like_bom = o.startswith(("\xfe\xff", "\xff\xfe"))
        if looks_like_bom and org is not None:
            try:
                o = str.__new__(cls, org.decode("utf-16"))
            except UnicodeDecodeError as exc:
                logger_warning(
                    f"{exc!s}\ninitial string:{exc.object!r}",
                    __name__,
                )
                o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
            o._original_bytes = org
            o.autodetect_utf16 = True
            o.utf16_bom = org[:2]
        elif looks_like_bom:
            # A str (no original bytes) that merely begins with the two
            # characters of a BOM. There is nothing to re-decode; force
            # UTF-16 output so that the encoded form starts with a real BOM
            # and these characters are not mistaken for one when read back.
            o.autodetect_utf16 = True
            o.utf16_bom = codecs.BOM_UTF16_BE
        else:
            try:
                encode_pdfdocencoding(o)
                o.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                o.autodetect_utf16 = True
                o.utf16_bom = codecs.BOM_UTF16_BE
        return o

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry over the encoding state instead of re-detecting it.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetection flags.

        Raises:
            Exception: If neither autodetection flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        The original bytes are returned when available; otherwise
        PDFDocEncoding is used, falling back to UTF-16 when the text cannot
        be represented or UTF-16 was detected.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a literal string (``(...)``) to ``stream``.

        Every byte that is neither alphanumeric nor a space is written as a
        three-digit octal escape.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

786 

787 

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (``/Name``); behaves like a ``str`` including the leading slash."""

    # Whitespace or one of the characters that terminate a name token.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that must be written as ``#XX``: '#', the delimiter
    # characters, and every code point from 0 to 32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name encoded for writing.

        The first character is kept as is (a deprecation is reported when it
        is not ``/``). In the remainder, characters above ``~`` are written as
        ``#XX`` for each of their UTF-8 bytes, characters listed in
        ``renumber_table`` are replaced by their ``#XX`` form, and everything
        else is written unchanged.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self)[1:]  # Remove leading forward slash
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated alias of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Decode the ``#XX`` escapes of a raw name.

        Only a ``#`` followed by exactly two hexadecimal digits is replaced by
        the byte it encodes. Any other ``#`` (followed by non-hex characters
        or by fewer than two characters) is left untouched together with the
        bytes after it.

        Args:
            sin: Raw name bytes as read from the file.

        Returns:
            The name with its valid escapes decoded.

        """
        i = sin.find(b"#")
        while i >= 0:
            hex_pair = sin[i + 1 : i + 3]
            # unhexlify() accepts an empty string and raises on odd lengths,
            # so the length is checked explicitly.
            if len(hex_pair) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_pair) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Continue after position i in both cases; this also skips a
            # '#' that was just produced by decoding "#23".
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried, in order, when decoding a name read from a file.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Byte stream positioned at the leading ``/``.
            pdf: Object whose ``strict`` flag decides how undecodable names
                are handled.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name
                cannot be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

909 

910 

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` with the reverse PDFDocEncoding table.

    Raises:
        UnicodeEncodeError: If a character is missing from the table.

    """
    try:
        codes = [_pdfdoc_encoding_rev[char] for char in unicode_string]
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)

922 

923 

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or if x is a PdfObject that resolves (through
        ``get_object()``) to None or to a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference each get_object() call is a
    # lookup in the owning document.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)