Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

441 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27import binascii 

28import codecs 

29import hashlib 

30import re 

31import sys 

32from binascii import unhexlify 

33from collections.abc import Sequence 

34from math import log10 

35from struct import iter_unpack 

36from typing import Any, Callable, ClassVar, Optional, Union, cast 

37 

38if sys.version_info[:2] >= (3, 10): 

39 from typing import TypeGuard 

40else: 

41 from typing_extensions import TypeGuard # PEP 647 

42 

43if sys.version_info >= (3, 11): 

44 from typing import Self 

45else: 

46 from typing_extensions import Self 

47 

48from .._codecs import _pdfdoc_encoding_rev 

49from .._protocols import PdfObjectProtocol, PdfWriterProtocol 

50from .._utils import ( 

51 StreamType, 

52 classproperty, 

53 deprecation_no_replacement, 

54 deprecation_with_replacement, 

55 logger_warning, 

56 read_non_whitespace, 

57 read_until_regex, 

58) 

59from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError 

60 

61__author__ = "Mathieu Fenniak" 

62__author_email__ = "biziqe@mathieu.fenniak.net" 

63 

64 

class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the formatted object, encoded."""
        formatted = f"{self}"
        return formatted.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to :meth:`clone`.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).


        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly created copy of ``self`` to register.
            pdf_dest: The writer whose ``_objects`` and ``_id_translated``
                tables are updated.
            force_duplicate: When ``True``, never reuse an already
                registered copy.

        Returns:
            The clone, or the previously registered copy when one exists
            and ``force_duplicate`` is ``False``.

        """
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            # Best effort: any failure while inspecting the clone's
            # reference just means "not registered in pdf_dest yet".
            pass
        # if hasattr(clone, "indirect_reference"):
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # In incremental mode an object coming from the writer's own reader
        # keeps its object number; otherwise a new slot is appended.
        keep_number = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        if keep_number:
            target_idnum = source_ref.idnum
        else:
            target_idnum = len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # Store the source so its id() cannot be reused while the
                # table is alive.
                pdf_dest._id_translated[source_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            translation = pdf_dest._id_translated[source_key]
            if not force_duplicate and source_ref.idnum in translation:
                existing = pdf_dest.get_object(translation[source_ref.idnum])
                assert existing is not None
                return existing
            translation[source_ref.idnum] = target_idnum

        try:
            pdf_dest._objects[target_idnum - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            target_idnum = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(target_idnum, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError

205 

206 

class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Read four bytes from ``stream`` and require them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are not ``b"null"``.

        """
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

247 

248 def __eq__(self, other: object) -> bool: 

249 return isinstance(other, NullObject) 

250 

251 def __hash__(self) -> int: 

252 return self.hash_bin() 

253 

254 

class BooleanObject(PdfObject):
    """A PDF boolean; wraps an arbitrary ``value`` interpreted by truthiness."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            o = o.value
        elif not isinstance(o, bool):
            return False
        return self.value == o

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Raises:
            PdfReadError: If the first four bytes are neither ``true`` nor
                ``fals``.

        """
        token = stream.read(4)
        if token == b"fals":
            # Consume the fifth byte (expected "e"); it is not verified.
            stream.read(1)
            return BooleanObject(False)
        if token != b"true":
            raise PdfReadError("Could not read Boolean object")
        return BooleanObject(True)

315 

316 

class IndirectObject(PdfObject):
    """
    Reference to an object stored in ``pdf``, identified by ``idnum`` and
    ``generation``.

    The referenced object is resolved through ``pdf.get_object(self)``.
    Attribute, item, containment and iteration access not found on the
    reference itself is forwarded to the resolved object.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """Clone object into pdf_dest."""
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Store the source so its id() cannot be reused while the table
            # is alive.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that still points back to us
                obj = NullObject()
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve this reference through the owning ``pdf``."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # ``pdf`` is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Same numbers AND the very same owning pdf (identity, matching
        # the id(self.pdf) used by __hash__).
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``"<idnum> <generation> R"`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Source positioned at the first byte of the object number.
            pdf: Owner stored on the returned reference.

        Returns:
            The parsed reference.

        Raises:
            PdfStreamError: If the stream ends inside either number.
            PdfReadError: If the token after the two numbers is not ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    # skip extra whitespace before the generation number
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

478 

479 

# Drives the number of decimals written by FloatObject.myrepr():
# decimals = FLOAT_WRITE_PRECISION - int(log10(abs(value))), with a minimum
# of 1. Module-level so that users can adjust it.
FLOAT_WRITE_PRECISION = 8


class FloatObject(float, PdfObject):
    """A PDF real number; invalid input falls back to ``0.0`` with a warning."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Build the float from ``value``.

        Args:
            value: Anything accepted by ``float()``.
            context: Unused by this implementation.

        Returns:
            The new instance, or one holding ``0.0`` if conversion failed.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would not
        # depend on the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """Return the textual form written to PDF streams."""
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        # Fixed-point notation, then drop trailing zeros and a dangling dot.
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

540 

541 

class NumberObject(int, PdfObject):
    """A PDF integer; invalid input falls back to ``0`` with a warning."""

    # Matches the first byte that cannot belong to a number token.
    # NOTE(review): inside the class "+-." is a *range* (0x2B-0x2E), so ","
    # is also treated as part of a number -- confirm whether that is intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        try:
            as_int = int(value)
            return int.__new__(cls, as_int)
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NumberObject(self), pdf_dest, force_duplicate)
        return cast("NumberObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (round-trips through ``repr``)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A FloatObject if the token contains a ``.``, else a NumberObject.

        """
        token = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." not in token:
            return NumberObject(token)
        return FloatObject(token)

592 

593 

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        raw = bytes(self)
        return hash((self.__class__, raw))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string ``<...>`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then every encoding listed in NameObject.CHARSETS.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

651 

652 

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Create the string and record how it would be encoded back to bytes.

        Args:
            value: ``str`` or ``bytes``. Bytes starting with a UTF-16 BOM are
                decoded as UTF-16; other bytes are decoded with ``charmap``.

        Returns:
            The new instance with ``_original_bytes``, ``autodetect_utf16``,
            ``autodetect_pdfdocencoding`` and ``utf16_bom`` set.

        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                # Keep only the part that decoded cleanly.
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # The instance was rebuilt above, so every attribute has to be
            # assigned again; there is no class-level default for
            # autodetect_pdfdocencoding, and clone() reads it.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry over the encoding metadata instead of re-detecting it.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the recorded autodetection flags.

        Raises:
            Exception: If neither autodetection flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """Return the bytes to write: original bytes, PDFDocEncoding or UTF-16."""
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the string as a literal ``( ... )`` with octal escapes."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            # Everything except ASCII alphanumerics and space is escaped.
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

795 

796 

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name such as ``/Type``, held as a ``str`` with the leading ``/``."""

    # Matches the first delimiter or whitespace run that ends a name token.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that renumber() writes as "#XX": the listed delimiter
    # characters and code points 0 through 32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Encode the name for output, replacing special characters by ``#XX``.

        Returns:
            The first character as-is, followed by the rest of the name
            where characters above ``~`` are written as ``#XX`` per UTF-8
            byte and characters in ``renumber_table`` as their ``#XX`` form.

        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Decode ``#XX`` hexadecimal escapes in a raw name.

        Args:
            sin: Raw name bytes, possibly containing ``#XX`` escapes.

        Returns:
            The bytes with every valid escape replaced by the byte it
            encodes. A ``#`` that is not followed by exactly two hexadecimal
            digits is left untouched, together with the bytes after it.

        """
        i = sin.find(b"#", 0)
        while i >= 0:
            hex_digits = sin[i + 1 : i + 3]
            # A truncated escape at the end of the name is not an escape;
            # unhexlify(b"") would otherwise succeed and drop the "#".
            if len(hex_digits) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_digits) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Always continue from the next real "#": after a successful
            # replacement this skips the decoded byte (which may itself be
            # "#"), after a failure it skips the invalid escape marker.
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried in order when decoding names (also used by
    # ByteStringObject.__str__).
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Args:
            stream: Source positioned at the leading ``/``.
            pdf: Object whose ``strict`` attribute selects between a warning
                with charmap fallback and an error when decoding fails.

        Returns:
            The decoded name.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if no charset
                decodes the name and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

918 

919 

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` using the ``_pdfdoc_encoding_rev`` table.

    Args:
        unicode_string: Text to encode.

    Returns:
        One byte per character, looked up in ``_pdfdoc_encoding_rev``.

    Raises:
        UnicodeEncodeError: If a character is missing from the table.

    """
    codes = []
    try:
        for char in unicode_string:
            codes.append(_pdfdoc_encoding_rev[char])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)

931 

932 

933def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]: 

934 """ 

935 Returns: 

936 True if x is None or NullObject. 

937 

938 """ 

939 return x is None or ( 

940 isinstance(x, PdfObject) 

941 and (x.get_object() is None or isinstance(x.get_object(), NullObject)) 

942 )