Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

447 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27import binascii 

28import codecs 

29import hashlib 

30import re 

31import sys 

32from collections.abc import Sequence 

33from math import log10 

34from struct import iter_unpack 

35from typing import Any, Callable, ClassVar, Optional, Union, cast 

36 

37if sys.version_info[:2] >= (3, 10): 

38 from typing import TypeGuard 

39else: 

40 from typing_extensions import TypeGuard # PEP 647 

41 

42if sys.version_info >= (3, 11): 

43 from typing import Self 

44else: 

45 from typing_extensions import Self 

46 

47from .._codecs import _pdfdoc_encoding_rev 

48from .._protocols import PdfObjectProtocol, PdfWriterProtocol 

49from .._utils import ( 

50 StreamType, 

51 classproperty, 

52 deprecation_no_replacement, 

53 deprecation_with_replacement, 

54 logger_warning, 

55 read_non_whitespace, 

56 read_until_regex, 

57) 

58from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError 

59 

# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

62 

63 

class PdfObject(PdfObjectProtocol):
    """
    Common base of the PDF object types defined in this module.

    ``hash_bin``, ``clone`` and ``write_to_stream`` raise
    ``NotImplementedError`` here and are meant to be overridden by the
    concrete object classes.
    """

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Only annotated here: instances that never had it assigned raise
    # AttributeError on access (``_reference_clone`` relies on that).
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the formatted object, encoded."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
          pdf_dest: Target to clone to.

        Returns:
          The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).


        Args:
          pdf_dest: Target to clone to.
          force_duplicate: By default, if the object has already been cloned and referenced,
            the copy will be returned; when ``True``, a new copy will be created.
            (Default value = ``False``)
          ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
            during cloning (applies to children duplication as well). If fields are to be
            considered for a limited number of levels, you have to add it as integer, for
            example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
            level only but ``"/TOTO"`` on all levels.

        Returns:
          The cloned PdfObject

        Raises:
          NotImplementedError: Always, in this base class.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: "PdfObject", pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> "PdfObject":
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
          clone: The freshly created copy of ``self``.
          pdf_dest: Writer whose ``_objects`` / ``_id_translated`` are updated.
          force_duplicate: When ``True``, skip both "already registered"
            shortcuts and always register ``clone`` under a slot.

        Returns:
          The clone, or the object previously registered in ``pdf_dest``
          for the same source reference.

        """
        # Shortcut 1: the clone already belongs to pdf_dest. Any failure
        # while probing (e.g. missing attribute) just means "not registered".
        try:
            if (
                not force_duplicate
                and clone.indirect_reference is not None
                and clone.indirect_reference.pdf == pdf_dest
            ):
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        try:
            ind = self.indirect_reference
        except AttributeError:
            # The source was never referenced indirectly: nothing to register.
            return clone

        # Pick the 1-based object number the clone will live under.
        reuse_source_number = (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        )
        if reuse_source_number:
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1

        if ind is not None:
            src_key = id(ind.pdf)
            if src_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[src_key] = {}
                # Keep the source alive so that its id() stays unique.
                pdf_dest._id_translated[src_key]["PreventGC"] = ind.pdf  # type: ignore[index]
            # Shortcut 2: this source object number was already translated.
            if not force_duplicate and ind.idnum in pdf_dest._id_translated[src_key]:
                obj = pdf_dest.get_object(pdf_dest._id_translated[src_key][ind.idnum])
                assert isinstance(obj, PdfObject), "mypy"
                return obj
            pdf_dest._id_translated[src_key][ind.idnum] = i

        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            # Slot does not exist yet: append and use the resulting position.
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError

208 

209 

class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # There is no value to mix in: the class alone identifies a null.
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and require them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are anything else.

        """
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

256 

257 

class BooleanObject(PdfObject):
    """A PDF boolean wrapping a Python value in ``self.value``."""

    value: bool

    def __init__(self, value: Any) -> None:
        # Stored as given; no coercion to bool happens here.
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Four bytes are read first; for ``fals`` one more byte is consumed
        (without being checked) to finish the keyword.

        Raises:
            PdfReadError: If the four bytes are neither ``true`` nor ``fals``.

        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word == b"fals":
            stream.read(1)
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")

320 

321 

class IndirectObject(PdfObject):
    """
    Reference ``idnum generation R`` to an object owned by ``pdf``.

    Item access, iteration, containment, attribute lookup and numeric/string
    conversion are forwarded to the object the reference points to.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        # Uses id(pdf) to stay consistent with __eq__, which compares the
        # owning document by identity.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers but owned by ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Returns ``self`` when the reference already belongs to ``pdf_dest``
        (unless ``force_duplicate``). Otherwise the pointed object is cloned
        into ``pdf_dest`` (or looked up if its number was already
        translated) and a reference to that copy is returned.
        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Keep the source alive so that its id() stays unique.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Substitute a null object so that cloning can proceed.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
            assert isinstance(dup, PdfObject), "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own indirect reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf.get_object``."""
        obj: Optional[PdfObject] = self.pdf.get_object(self)
        return obj

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning document is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            # Chain explicitly so the original lookup failure stays visible.
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore[index]

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore[operator]

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore[union-attr]

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore[union-attr, no-any-return]

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore[union-attr, no-any-return]

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Byte stream positioned at the first digit of the id.
            pdf: Owner stored on the returned reference.

        Raises:
            PdfStreamError: If the stream ends inside either number.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        # Object number: everything up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation: skip whitespace before it, stop at whitespace after it.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

484 

485 

# Digit budget used by FloatObject.myrepr() when formatting a float for
# output. Module-level so that users can adjust it.
# NOTE(review): the original remark read "shall be min 5 digits max" --
# presumably a lower bound of 5 is intended; confirm.
FLOAT_WRITE_PRECISION = 8

487 

488 

class FloatObject(float, PdfObject):
    """A PDF real number, stored as a Python ``float`` subclass."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Build the float from ``value``; fall back to ``0.0`` when invalid.

        Args:
            value: Anything accepted by ``float()``.
            context: Unused by this implementation.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                "%(error)s : FloatObject (%(value)s) invalid; use 0.0 instead",
                source=__name__,
                error=e,
                value=value,
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would not
        # hash the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the textual form written to the PDF.

        Zero is rendered as ``0.0``. Other values use fixed-point notation
        with ``FLOAT_WRITE_PRECISION - int(log10(abs(self)))`` decimals
        (at least one), then trailing zeros and a trailing dot are removed.
        """
        if self == 0:  # type: ignore[comparison-overlap]
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` encoded as UTF-8; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

549 

550 

class NumberObject(int, PdfObject):
    """A PDF integer, stored as a Python ``int`` subclass."""

    # Matches the first byte that cannot belong to a number token.
    # NOTE(review): inside the class, ``+-.`` is a range from "+" to ".",
    # so "," is accepted as part of a number as well -- confirm intent.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """Build the integer from ``value``; a ``ValueError`` yields ``0`` plus a warning."""
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning("NumberObject(%(value)s) invalid; use 0 instead", source=__name__, value=value)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NumberObject(self), pdf_dest, force_duplicate)
        return cast("NumberObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (round-tripped through its repr)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains a ``.``, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)

601 

602 

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            ByteStringObject(bytes(self)), pdf_dest, force_duplicate
        )
        return cast("ByteStringObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, bytes(self)))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hex string ``<...>``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then each NameObject.CHARSETS entry in order;
        # the first encoding that decodes without error wins.
        for enc in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(enc)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

660 

661 

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text has to be written as UTF-16.
    autodetect_utf16: bool
    # BOM to prepend when encoding as UTF-16 (b"" when none applies).
    utf16_bom: bytes
    # Raw bytes the object was built from, if it was built from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Create the string from ``value`` and record how it should be encoded.

        ``bytes`` input is remembered in ``_original_bytes``. When it starts
        with a UTF-16 BOM it is decoded as UTF-16 (truncated at the first
        undecodable position, with a warning); otherwise it is decoded with
        ``charmap``. For all other input the encoding flags are derived from
        whether the text fits PDFDocEncoding.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    "%(exception)s; initial string: %(initial_string)r",
                    source=__name__,
                    exception=exception,
                    initial_string=exception.object,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # A new instance was created above, so every attribute has to be
            # assigned again -- including autodetect_pdfdocencoding, which
            # has no class-level default and is read by clone().
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding metadata."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetection flags.

        Raises:
            Exception: If neither autodetection flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        Preference order: the stored original bytes, PDFDocEncoding (unless
        UTF-16 was detected), then UTF-16 with the recorded BOM.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 fallback below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a PDF literal string ``( ... )``.

        Every byte that is neither alphanumeric nor a space is written as a
        three-digit octal escape. ``encryption_key`` is deprecated.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

806 

807 

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (``/Something``), stored as a Python ``str`` subclass."""

    # Bytes that terminate a name token while reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that must be written as ``#XX``: the listed delimiters
    # and every code point below 33.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``); ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name encoded for output.

        The first character is emitted as is (a deprecation is reported when
        it is not ``/``). Of the remaining characters, those above ``~`` are
        written as ``#XX`` per UTF-8 byte, those in ``renumber_table`` use
        their table entry, and all others are written unchanged.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        parts = [out]
        for c in self[1:]:
            if c > "~":
                parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
            else:
                try:
                    parts.append(self.renumber_table[c])
                except KeyError:
                    parts.append(c.encode("utf-8"))
        return b"".join(parts)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated alias; reports the deprecation and returns ``b"/"``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Decode ``#XX`` escapes in a raw name.

        A ``#`` is replaced together with the two bytes after it only when
        those are exactly two hexadecimal digits; any other ``#`` is copied
        through unchanged.

        Args:
            sin: Raw name bytes as read from the file.

        Returns:
            The name with every valid escape replaced by the byte it encodes.

        """
        hex_digits = b"0123456789abcdefABCDEF"
        result = bytearray()
        i = 0
        while i < len(sin):
            if sin[i : i + 1] == b"#":
                code = sin[i + 1 : i + 3]
                # int(code, 16) alone is too lenient: it also accepts a
                # single digit at the end of the name and signed forms
                # such as b"+4". Require two real hex digits.
                if len(code) == 2 and code[0] in hex_digits and code[1] in hex_digits:
                    result.append(int(code, 16))
                    i += 3
                    continue
                # if the 2 characters after # can not be converted to hex
                # we change nothing and carry on
            result.append(sin[i])
            i += 1
        return bytes(result)

    # Encodings tried, in order, when decoding a name read from a file.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Byte stream positioned at the leading ``/``.
            pdf: Reader whose ``strict`` flag selects between warning and
                raising when the name cannot be decoded.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if decoding
                fails and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    "Illegal character in NameObject (%(name)r), you may need to adjust NameObject.CHARSETS",
                    source=__name__,
                    name=name,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

934 

935 

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    try:
        codes = [_pdfdoc_encoding_rev[char] for char in unicode_string]
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)

947 

948 

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject (this covers indirect references
        pointing to nothing or to null). False otherwise.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference get_object() is a lookup in
    # the owning document, so avoid doing it twice.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

958 )