Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/

4# Redistribution and use in source and binary forms, with or without

5# modification, are permitted provided that the following conditions are

6# met:

8# * Redistributions of source code must retain the above copyright notice,

9# this list of conditions and the following disclaimer.

10# * Redistributions in binary form must reproduce the above copyright notice,

11# this list of conditions and the following disclaimer in the documentation

12# and/or other materials provided with the distribution.

13# * The name of the author may not be used to endorse or promote products

14# derived from this software without specific prior written permission.

15#

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

26# POSSIBILITY OF SUCH DAMAGE.

27import binascii

28import codecs

29import hashlib

30import re

31import sys

32from collections.abc import Sequence

33from math import log10

34from struct import iter_unpack

35from typing import Any, Callable, ClassVar, Optional, Union, cast

37if sys.version_info[:2] >= (3, 10):

38 from typing import TypeGuard

39else:

40 from typing_extensions import TypeGuard # PEP 647

42if sys.version_info >= (3, 11):

43 from typing import Self

44else:

45 from typing_extensions import Self

47from .._codecs import _pdfdoc_encoding_rev

48from .._protocols import PdfObjectProtocol, PdfWriterProtocol

49from .._utils import (

50 StreamType,

51 classproperty,

52 deprecation_no_replacement,

53 deprecation_with_replacement,

54 logger_warning,

55 read_non_whitespace,

56 read_until_regex,

57)

58from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError

# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

class PdfObject(PdfObjectProtocol):
    """
    Base class shared by the PDF object types of this module.

    ``hash_bin``, ``clone`` and ``write_to_stream`` raise
    ``NotImplementedError`` here and are meant to be overridden.
    """

    # Hash constructor applied by ``hash_value`` to ``hash_value_data()``.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Declared only (no default value); ``_reference_clone`` assigns it on clones.
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the encoded ``str()`` of the object."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to ``clone``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: "PdfObject", pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> "PdfObject":
        """
        Register ``clone`` in ``pdf_dest._objects`` and give it an indirect reference.

        Nothing is registered when ``clone`` already points into ``pdf_dest``
        (unless ``force_duplicate``) or when ``self`` has no
        ``indirect_reference`` attribute at all. When the source id has already
        been translated for ``pdf_dest`` (and ``force_duplicate`` is false) the
        previously registered object is returned instead of ``clone``.

        Args:
            clone: The freshly created copy of ``self``.
            pdf_dest: Target the copy is registered in.
            force_duplicate: Register a new copy even if one is already known.

        Returns:
            The clone, or the object already registered for the same source id.

        """
        # A clone that already belongs to pdf_dest needs no registration.
        # Any error while inspecting it is deliberately ignored (best effort).
        try:
            if not force_duplicate:
                existing_ref = clone.indirect_reference
                if existing_ref is not None and existing_ref.pdf == pdf_dest:
                    return clone
        except Exception:
            pass

        # if hasattr(clone, "indirect_reference"):
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # Keep the source object number only for an incremental writer whose
        # reader owns the source and whose object table already has that slot.
        keep_source_id = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        slot = source_ref.idnum if keep_source_id else len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # The table is keyed by id(); holding the source keeps that id valid.
                pdf_dest._id_translated[source_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            known_ids = pdf_dest._id_translated[source_key]
            if not force_duplicate and source_ref.idnum in known_ids:
                registered = pdf_dest.get_object(known_ids[source_ref.idnum])
                assert isinstance(registered, PdfObject), "mypy"
                return registered
            known_ids[source_ref.idnum] = slot

        try:
            pdf_dest._objects[slot - 1] = clone
        except IndexError:
            # Slot lies past the end of the table: append and use the real position.
            pdf_dest._objects.append(clone)
            slot = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(slot, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented in this base class."""
        raise NotImplementedError

208

209

class NullObject(PdfObject):
    """Object written to a stream as the keyword ``null``."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # There is no value to include: only the class takes part in the hash.
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``null`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and return a NullObject.

        Raises:
            PdfReadError: If the four bytes are not ``null``.

        """
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All NullObject instances compare equal to each other.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

256

257

class BooleanObject(PdfObject):
    """Object written to a stream as ``true`` or ``false``."""

    value: bool

    def __init__(self, value: Any) -> None:
        # Stored as given; no conversion to bool is performed here.
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if not isinstance(o, bool):
            return False
        return self.value == o

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``true`` nor ``fals``.

        """
        keyword = stream.read(4)
        if keyword == b"true":
            return BooleanObject(True)
        if keyword != b"fals":
            raise PdfReadError("Could not read Boolean object")
        # Consume the fifth byte of "false" (its value is not checked).
        stream.read(1)
        return BooleanObject(False)

320

321

class IndirectObject(PdfObject):
    """
    Reference ``<idnum> <generation> R`` to an object held by ``pdf``.

    Item access, iteration, containment tests and unknown attributes are
    forwarded to the object the reference points to.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        # The owning document takes part by identity, matching ``__eq__``.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not reuse ``self`` even if it
                already belongs to ``pdf_dest``.
            ignore_fields: Passed through to the ``clone`` of the pointed object.

        Returns:
            A reference valid in ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # The table is keyed by id(); holding the source keeps that id valid.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The pointed object was already copied: reuse that copy.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Dangling reference: clone a NullObject that carries this
                # reference so the id translation is still recorded.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert isinstance(dup, PdfObject), "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """A reference is its own indirect reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Return whatever ``self.pdf.get_object(self)`` yields for this reference."""
        obj: Optional[PdfObject] = self.pdf.get_object(self)
        return obj

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning document is shared with the copy, not deep-copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and refuse a result that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal lookup failed.  Resolving the target reads
        # ``self.pdf``; if that attribute itself is missing (e.g. on an
        # instance created without ``__init__``), delegating would re-enter
        # this method without end, so fail right away instead.
        if name == "pdf":
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore[index]

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore[operator]

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore[union-attr]

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore[union-attr, no-any-return]

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore[union-attr, no-any-return]

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to references with the same numbers in the very same document.
        if not isinstance(other, IndirectObject):
            return False
        return (
            self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Document the new reference is bound to.

        Raises:
            PdfStreamError: If the stream ends inside either number.
            PdfReadError: If the numbers are not followed by ``R``.

        """
        # Object number: every byte up to the first whitespace.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: leading whitespace is skipped, then every byte
        # up to the next whitespace.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

484

485

# Module-level so it can be adjusted by users. FloatObject.myrepr writes
# max(1, FLOAT_WRITE_PRECISION - int(log10(abs(value)))) decimals.
# Original note: shall be min 5 digits max.
FLOAT_WRITE_PRECISION = 8

487

488

class FloatObject(float, PdfObject):
    """A ``float`` that is also a PdfObject."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert; on conversion failure a warning is logged
                and ``0.0`` is used.
            context: Not used by this implementation.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                "%(error)s : FloatObject (%(value)s) invalid; use 0.0 instead",
                source=__name__,
                error=e,
                value=value,
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # ``as_numeric`` must be called: hashing the bound method itself would
        # not reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """Return the textual form written to streams, without trailing zeros."""
        if self == 0:  # type: ignore[comparison-overlap]
            return "0.0"
        # Fewer decimals for larger magnitudes, but always at least one.
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

549

550

class NumberObject(int, PdfObject):
    """An ``int`` that is also a PdfObject."""

    # Matches the first byte that ends a number token.
    # NOTE(review): "+-." inside the class is a range from "+" to ".", so it
    # also admits "," -- confirm whether that is intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """Build from anything ``int()`` accepts; a ``ValueError`` is logged and yields 0."""
        try:
            number = int(value)
            return int.__new__(cls, number)
        except ValueError:
            logger_warning("NumberObject(%(value)s) invalid; use 0 instead", source=__name__, value=value)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NumberObject(self), pdf_dest, force_duplicate)
        return cast("NumberObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.as_numeric())
        return hash(key)

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (round trip through its ``repr``)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal ``repr`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """Read a number token; one containing ``.`` becomes a FloatObject."""
        token = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." not in token:
            return NumberObject(token)
        return FloatObject(token)

601

602

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the content as a hexadecimal string ``<...>``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        # First encoding that decodes without error wins: UTF-16, then each
        # entry of NameObject.CHARSETS in order.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

660

661

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # Set on every instance by ``__new__``; they record which encoding the
    # text is written back with.
    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Build the object from ``str`` or ``bytes``.

        ``bytes`` are remembered as ``_original_bytes``. When they start with
        a UTF-16 BOM they are decoded as UTF-16 (on a decode error a warning
        is logged and only the part before the error is kept); otherwise they
        are decoded with ``charmap``.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    "%(exception)s; initial string: %(initial_string)r",
                    source=__name__,
                    exception=exception,
                    initial_string=exception.object,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # The instance was rebuilt above, so every flag has to be set
            # again; leaving ``autodetect_pdfdocencoding`` unset would make
            # ``clone`` fail with AttributeError.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry the encoding bookkeeping over to the copy unchanged.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the autodetect flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """Return the bytes ``write_to_stream`` emits, before escaping."""
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Raised on purpose to reach the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the string as ``(...)``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # Everything but ASCII alphanumerics and space is written as
                # a three-digit octal escape.
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

806

807

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A ``str`` holding a PDF name, including its leading ``/``."""

    # A name token ends at whitespace or at one of these delimiter bytes.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written as "#XX": the delimiters, "#" itself and codes 0-32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        chr(code): f"#{code:02X}".encode()
        for code in (*b"#()<>[]{}/%", *range(33))
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NameObject(self), pdf_dest, force_duplicate)
        return cast("NameObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """Return the name as bytes with special characters written as ``#XX``."""
        first = self[0].encode("utf-8")
        if first != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        pieces = [first]
        for char in self[1:]:
            if char > "~":
                # Beyond "~": escape every UTF-8 byte of the character.
                pieces.extend(f"#{x:02X}".encode() for x in char.encode("utf-8"))
                continue
            escaped = self.renumber_table.get(char)
            pieces.append(char.encode("utf-8") if escaped is None else escaped)
        return b"".join(pieces)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        bare = str(self).removeprefix("/")
        without_spaces = re.sub(r"\ ", "_", bare)
        cleaned = re.sub(r"[^a-zA-Z0-9_-]", "_", without_spaces)
        return NameObject("/" + cleaned)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated spelling; use ``prefix`` instead."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """Replace each ``#XX`` escape in ``sin`` by the byte it denotes."""
        decoded = bytearray()
        pos = 0
        end = len(sin)
        while pos < end:
            if sin[pos:pos + 1] == b"#":
                try:
                    decoded.append(int(sin[pos + 1 : pos + 3], 16))
                except (ValueError, IndexError):
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
                else:
                    pos += 3
                    continue
            decoded.append(sin[pos])
            pos += 1
        return bytes(decoded)

    # Encodings tried in order when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token starting with ``/`` from ``stream``.

        Args:
            stream: Stream positioned on the leading ``/``.
            pdf: Object whose ``strict`` flag decides between raising and
                falling back to ``charmap`` decoding.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if decoding fails
                while ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for encoding in NameObject.CHARSETS:
                try:
                    return NameObject(name.decode(encoding))
                except Exception:
                    continue
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if pdf.strict:
                raise PdfReadError(
                    f"Illegal character in NameObject ({name!r}). "
                    "You may need to adjust NameObject.CHARSETS.",
                ) from e
            logger_warning(
                "Illegal character in NameObject (%(name)r), you may need to adjust NameObject.CHARSETS",
                source=__name__,
                name=name,
            )
            return NameObject(name.decode("charmap"))

934

935

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    try:
        encoded = bytes(_pdfdoc_encoding_rev[char] for char in unicode_string)
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return encoded

947

948

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject. Anything else yields False.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve only once: for an IndirectObject each call goes through the
    # owning document.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

447 statements