Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/

4# Redistribution and use in source and binary forms, with or without

5# modification, are permitted provided that the following conditions are

6# met:

8# * Redistributions of source code must retain the above copyright notice,

9# this list of conditions and the following disclaimer.

10# * Redistributions in binary form must reproduce the above copyright notice,

11# this list of conditions and the following disclaimer in the documentation

12# and/or other materials provided with the distribution.

13# * The name of the author may not be used to endorse or promote products

14# derived from this software without specific prior written permission.

15#

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

26# POSSIBILITY OF SUCH DAMAGE.

27import binascii

28import codecs

29import hashlib

30import re

31import sys

32from collections.abc import Sequence

33from math import log10

34from struct import iter_unpack

35from typing import Any, Callable, ClassVar, Optional, Union, cast

37if sys.version_info[:2] >= (3, 10):

38 from typing import TypeGuard

39else:

40 from typing_extensions import TypeGuard # PEP 647

42if sys.version_info >= (3, 11):

43 from typing import Self

44else:

45 from typing_extensions import Self

47from .._codecs import _pdfdoc_encoding_rev

48from .._protocols import PdfObjectProtocol, PdfWriterProtocol

49from .._utils import (

50 StreamType,

51 classproperty,

52 deprecation_no_replacement,

53 deprecation_with_replacement,

54 logger_warning,

55 read_non_whitespace,

56 read_until_regex,

57)

58from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError

60__author__ = "Mathieu Fenniak"

61__author_email__ = "biziqe@mathieu.fenniak.net"

class PdfObject(PdfObjectProtocol):
    """
    Common base class for the PDF object types of this module.

    ``hash_bin``, ``clone`` and ``write_to_stream`` raise
    ``NotImplementedError`` here and are meant to be provided by subclasses.
    """

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Reference under which the object is registered in a document, if any.
    # Only annotated here (no class-level value), so the attribute can be
    # missing on an instance until something assigns it.
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """
        Return the data that ``hash_value`` feeds into ``hash_func``.

        Returns:
            The formatted string of the object (``f"{self}"``), encoded with
            the default ``str.encode()`` encoding.

        """
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """
        Return a textual digest of the object.

        Returns:
            ``b"<ClassName>:<hexdigest>"`` where the digest is computed by
            ``hash_func`` over ``hash_value_data()``.

        """
        return (
            f"{self.__class__.__name__}:"
            f"{self.hash_func(self.hash_value_data()).hexdigest()}"
        ).encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        The subclasses in this module implement this by building a copy and
        passing it to ``_reference_clone``; this base implementation only
        raises.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: "PdfObject", pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> "PdfObject":
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly built copy of ``self``.
            pdf_dest: Writer in which the copy is to be registered.
            force_duplicate: When ``True``, skip both "already registered"
                shortcuts and always register ``clone`` under a new entry.

        Returns:
            ``clone`` itself, or the object previously registered in
            ``pdf_dest`` for the same source reference.

        """
        # Shortcut 1: the copy is already registered in the destination.
        # Any failure while checking is ignored on purpose and the normal
        # path below is taken instead.
        try:
            if (
                not force_duplicate
                and clone.indirect_reference is not None
                and clone.indirect_reference.pdf == pdf_dest
            ):
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        # A source object without the attribute was never registered anywhere:
        # the copy is returned without being registered either.
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        # Pick the object number: in incremental mode an object coming from the
        # writer's own reader keeps its number (when that slot exists),
        # otherwise the next free number is used.
        if (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        ):
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1
        if ind is not None:
            # The translation table is keyed by id() of the source document.
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                # NOTE(review): presumably keeps the source document alive so
                # that its id() cannot be reused while the table exists.
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore[index]
            # Shortcut 2: this source object was already translated.
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert isinstance(obj, PdfObject), "mypy"
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        # Store the copy in slot i (1-based); if that slot does not exist yet,
        # append instead and use the resulting position as object number.
        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        # A direct object resolves to itself.
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Serialize the object to ``stream``.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError

208

209

class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        fresh = NullObject()
        registered = self._reference_clone(fresh, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash of the class alone, as a null object carries no value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the ``null`` keyword to ``stream``."""
        deprecated_argument_used = encryption_key is not None
        if deprecated_argument_used:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and expect them to spell ``null``.

        Raises:
            PdfReadError: If the four bytes read are anything else.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # Equality is purely type based.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        # Defined together with __eq__ so instances remain hashable.
        return self.hash_bin()

256

257

class BooleanObject(PdfObject):
    """The PDF ``true`` / ``false`` object, wrapping a Python value."""

    value: bool

    def __init__(self, value: Any) -> None:
        # The value is stored as given; no conversion to bool takes place.
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on the value."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        keyword = b"true" if self.value else b"false"
        stream.write(keyword)

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Four bytes are read first; for ``fals`` one more byte is consumed
        (its content is not inspected).

        Raises:
            PdfReadError: If the first four bytes are neither ``true`` nor
                ``fals``.

        """
        head = stream.read(4)
        if head == b"true":
            return BooleanObject(True)
        if head != b"fals":
            raise PdfReadError("Could not read Boolean object")
        stream.read(1)
        return BooleanObject(False)

320

321

class IndirectObject(PdfObject):
    """
    Reference (``<idnum> <generation> R``) to an object held by a document.

    Attribute, item, containment and iteration access that the reference
    itself cannot serve is forwarded to the referenced object.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        # Document the reference belongs to; used to resolve it in get_object().
        self.pdf = pdf

    def __hash__(self) -> int:
        # The document takes part through its identity (id()), not its content.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Return a reference with the same numbers that points into ``pdf_dest``.

        Args:
            pdf_dest: Document the new reference belongs to.

        Returns:
            A new IndirectObject; the referenced object is not copied here.

        """
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not return ``self`` even if it
                already belongs to ``pdf_dest``.
            ignore_fields: Passed through to the ``clone`` of the referenced
                object.

        Returns:
            A reference valid in ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        # Make sure a translation table exists for the source document.
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The referenced object was already copied: reuse that copy.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # A NullObject stands in for the missing target and is tagged
                # with this reference before being cloned.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert isinstance(dup, PdfObject), "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        # A reference is its own indirect reference.
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference by asking the owning document for the object."""
        obj: Optional[PdfObject] = self.pdf.get_object(self)
        return obj

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # Only the reference is copied; the document itself is shared.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and refuse a target that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        # Membership is tested on the pointed object.
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        # Iteration is delegated to the pointed object.
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to another reference with the same numbers that belongs
        # to the very same document object (identity, not equality).
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the reference as ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Read a reference of the form ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Document the resulting reference is bound to.

        Returns:
            The parsed reference.

        Raises:
            PdfStreamError: If the stream ends while reading either number.
            PdfReadError: If the token following the numbers is not ``R``.

        """
        # Object number: every byte up to the first whitespace.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: whitespace in front of it is skipped, the first
        # whitespace after it ends the token.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

484

485

# Precision budget used by FloatObject.myrepr(): the number of decimals written
# is FLOAT_WRITE_PRECISION minus the integer part of log10(abs(value)), with a
# minimum of one decimal. Original author's note: "shall be min 5 digits max,
# allow user adj" -- presumably the value should not go below 5 and may be
# adjusted by users of the module (TODO confirm).
FLOAT_WRITE_PRECISION = 8

487

488

class FloatObject(float, PdfObject):
    """Real-number PDF object; a ``float`` with PDF serialization."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Not used by this implementation.

        Returns:
            The new instance; ``0.0`` (after logging a warning) when ``value``
            cannot be converted.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method object would
        # not reflect the numeric value at all.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the textual form written to PDF files.

        Zero is rendered as ``"0.0"``. Any other value is formatted in
        fixed-point notation with ``FLOAT_WRITE_PRECISION`` minus the integer
        part of ``log10(abs(self))`` decimals (at least one), after which
        trailing zeros and a trailing dot are stripped.
        """
        if self == 0:  # type: ignore[comparison-overlap]
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``, encoded as UTF-8."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

546

547

class NumberObject(int, PdfObject):
    """Integer PDF object; also provides the reader for numeric tokens."""

    # Matches the first byte that cannot belong to a numeric token.
    # NOTE(review): ``+-.`` inside the class is a range (0x2B-0x2E), so ``,``
    # is accepted as part of a token as well -- confirm whether intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """
        Build the object from anything ``int()`` accepts.

        A ``ValueError`` raised by the conversion is turned into a logged
        warning and the value ``0``.
        """
        try:
            parsed = int(value)
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            parsed = 0
        return int.__new__(cls, parsed)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = NumberObject(self)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NumberObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.as_numeric())
        return hash(key)

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (via its textual form)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``, encoded as UTF-8."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        encoded = repr(self).encode("utf8")
        stream.write(encoded)

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A FloatObject when the token contains a ``.``, a NumberObject
            otherwise.

        """
        token = read_until_regex(stream, NumberObject.NumberPattern)
        target = FloatObject if b"." in token else NumberObject
        return target(token)

598

599

class ByteStringObject(bytes, PdfObject):
    """
    A string object whose text encoding could not be determined.

    The PDF format has no separate representation for binary strings, so
    values that are clearly not text (the encryption entry /O, for example)
    are still stored as "String" objects and end up here.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes to ``stream`` as a hexadecimal string (``<...>``)."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for piece in (b"<", binascii.hexlify(self), b">"):
            stream.write(piece)

    def __str__(self) -> str:
        # UTF-16 is tried first, then every charset listed in
        # NameObject.CHARSETS; the first successful decoding wins.
        for codec_name in ("utf-16", *NameObject.CHARSETS):
            try:
                decoded = self.decode(codec_name)
            except UnicodeDecodeError:
                continue
            return decoded
        raise PdfReadError("Cannot decode ByteStringObject.")

657

658

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text is to be treated as UTF-16.
    autodetect_utf16: bool
    # BOM associated with the text (b"" when none applies).
    utf16_bom: bytes
    # Raw bytes the object was created from, when it was created from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Build the object from ``str`` or ``bytes``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16; if that fails
        part-way, a warning is logged and only the part before the failure is
        kept. Other bytes are decoded with ``charmap``. The ``autodetect_*``
        and ``utf16_bom`` attributes record which encoding applies.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # A brand-new instance was created above, so *every* attribute has
            # to be set again. Leaving out autodetect_pdfdocencoding made
            # clone() fail with AttributeError for such objects.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # The copy is built from the text only, so the encoding bookkeeping is
        # carried over explicitly.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetection flags.

        Returns:
            UTF-16 bytes (prefixed with the recorded BOM, if any) when
            ``autodetect_utf16`` is set, PDFDocEncoding bytes when
            ``autodetect_pdfdocencoding`` is set.

        Raises:
            Exception: If neither flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to be written for this string.

        The stored original bytes win when present; otherwise PDFDocEncoding
        is used when possible and UTF-16 (with the recorded BOM, if any) as
        the fallback.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string to ``stream`` as a literal string (``(...)``).

        Every byte that is neither alphanumeric nor a space is written as a
        three-digit octal escape.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

801

802

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (``/Something``), stored as text including the leading ``/``."""

    # Bytes that end a name token while reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written in ``#XX`` form: the delimiter characters, ``#``
    # itself and every code point below 33.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }
    # Byte values accepted as hexadecimal digits after ``#`` by unnumber().
    _HEX_DIGITS: ClassVar[bytes] = b"0123456789abcdefABCDEF"

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped form of the name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with special characters in ``#XX`` form.

        The first character is emitted as is (a deprecation is reported when
        it is not ``/``). Of the remaining characters, those above ``~`` are
        written as the ``#XX`` escapes of their UTF-8 bytes, those found in
        ``renumber_table`` as their table entry, and all others unchanged.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        parts = [out]
        for c in self[1:]:
            if c > "~":
                parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
            else:
                try:
                    parts.append(self.renumber_table[c])
                except KeyError:
                    parts.append(c.encode("utf-8"))
        return b"".join(parts)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        # Deprecated misspelled alias of ``prefix``.
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Decode the ``#XX`` escapes of a raw name.

        A ``#`` is treated as an escape only when it is followed by exactly
        two hexadecimal digits; in every other case it is copied unchanged.
        (``int(x, 16)`` alone is too lenient: it also accepts one-digit,
        signed and whitespace-padded input.)

        Args:
            sin: Raw bytes of the name.

        Returns:
            The bytes with every valid escape replaced by the byte it denotes.

        """
        hex_digits = NameObject._HEX_DIGITS
        result = bytearray()
        i = 0
        while i < len(sin):
            if sin[i:i + 1] == b"#":
                code = sin[i + 1 : i + 3]
                if (
                    len(code) == 2
                    and code[0] in hex_digits
                    and code[1] in hex_digits
                ):
                    result.append(int(code, 16))
                    i += 3
                    continue
                # if the 2 characters after # can not be converted to hex
                # we change nothing and carry on
            result.append(sin[i])
            i += 1
        return bytes(result)

    # Encodings tried, in order, when decoding a name read from a file.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Stream positioned at the leading ``/``.
            pdf: Owning reader; its ``strict`` attribute decides whether an
                undecodable name is an error or only a warning.

        Returns:
            The decoded name.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name cannot
                be decoded with any of ``CHARSETS`` and ``pdf.strict`` is set.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

929

930

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode text through the ``_pdfdoc_encoding_rev`` translation table.

    Args:
        unicode_string: Text to encode.

    Returns:
        One byte per character, looked up in the table.

    Raises:
        UnicodeEncodeError: If a character is missing from the table.

    """
    lookup = _pdfdoc_encoding_rev.__getitem__
    try:
        return bytes(map(lookup, unicode_string))
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )

942

943

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Args:
        x: Any value; PdfObjects are resolved through ``get_object()`` first.

    Returns:
        True if x is None, or if x is a PdfObject that resolves to None or to
        a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference every get_object() call is a
    # lookup in the owning document.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

447 statements