Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/

4# Redistribution and use in source and binary forms, with or without

5# modification, are permitted provided that the following conditions are

6# met:

8# * Redistributions of source code must retain the above copyright notice,

9# this list of conditions and the following disclaimer.

10# * Redistributions in binary form must reproduce the above copyright notice,

11# this list of conditions and the following disclaimer in the documentation

12# and/or other materials provided with the distribution.

13# * The name of the author may not be used to endorse or promote products

14# derived from this software without specific prior written permission.

15#

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

26# POSSIBILITY OF SUCH DAMAGE.

27import binascii

28import codecs

29import hashlib

30import re

31import sys

32from binascii import unhexlify

33from collections.abc import Sequence

34from math import log10

35from struct import iter_unpack

36from typing import Any, Callable, ClassVar, Optional, Union, cast

38if sys.version_info[:2] >= (3, 10):

39 from typing import TypeGuard

40else:

41 from typing_extensions import TypeGuard # PEP 647

43if sys.version_info >= (3, 11):

44 from typing import Self

45else:

46 from typing_extensions import Self

48from .._codecs import _pdfdoc_encoding_rev

49from .._protocols import PdfObjectProtocol, PdfWriterProtocol

50from .._utils import (

51 StreamType,

52 classproperty,

53 deprecation_no_replacement,

54 deprecation_with_replacement,

55 logger_warning,

56 read_non_whitespace,

57 read_until_regex,

58)

59from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError

61__author__ = "Mathieu Fenniak"

62__author_email__ = "biziqe@mathieu.fenniak.net"

class PdfObject(PdfObjectProtocol):
    """
    Base class of the PDF object types defined in this module.

    ``hash_bin``, ``clone`` and ``write_to_stream`` raise
    ``NotImplementedError`` here and are meant to be overridden.
    """

    # Digest constructor used by ``hash_value``.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the formatted object, encoded."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` on behalf of this object.

        Nothing is registered when ``self`` has no ``indirect_reference``
        attribute, or when ``clone`` already belongs to ``pdf_dest`` and no
        duplicate is forced.

        Args:
            clone: Freshly built copy of ``self``.
            pdf_dest: Writer receiving the copy.
            force_duplicate: When ``True``, always register a new object, even
                if a translation for the source id is already recorded.

        Returns:
            The clone, or the object previously registered for the same
            source reference.

        """
        # Best effort: the clone may lack an indirect reference altogether.
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass

        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        keeps_source_id = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        if keeps_source_id:
            # Incremental update of the writer's own reader: reuse the id.
            target_id = source_ref.idnum
        else:
            target_id = len(pdf_dest._objects) + 1

        if source_ref is not None:
            pdf_key = id(source_ref.pdf)
            if pdf_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[pdf_key] = {}
                # Hold a reference to the source so that its id() stays unique.
                pdf_dest._id_translated[pdf_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            if (
                not force_duplicate
                and source_ref.idnum in pdf_dest._id_translated[pdf_key]
            ):
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[pdf_key][source_ref.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[pdf_key][source_ref.idnum] = target_id

        try:
            pdf_dest._objects[target_id - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            target_id = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(target_id, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented on the base class."""
        raise NotImplementedError

205

206

class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and expect them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are not ``b"null"``.

        """
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

253

254

class BooleanObject(PdfObject):
    """PDF boolean; wraps an arbitrary value that is interpreted by truthiness."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on the value."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Parse ``true`` or ``false`` from ``stream``.

        Four bytes are read first; when they are ``b"fals"`` one more byte is
        consumed (its value is not checked).

        Raises:
            PdfReadError: If the first four bytes match neither keyword.

        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word != b"fals":
            raise PdfReadError("Could not read Boolean object")
        stream.read(1)  # skip the trailing byte of "false"
        return BooleanObject(False)

315

316

class IndirectObject(PdfObject):
    """
    Reference to an object stored in ``pdf``, written as ``<idnum> <generation> R``.

    Item access, iteration, attribute lookup and numeric/string conversion
    are forwarded to the object obtained through ``pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same id/generation bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """Clone object into pdf_dest."""
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Hold a reference to the source so that its id() stays unique.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that still remembers this reference
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Return whatever ``self.pdf.get_object`` yields for this reference."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning pdf is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a target that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # ``__getattr__`` only runs when normal lookup failed. If one of the
        # reference's own fields is missing (e.g. on an instance created
        # without ``__init__``), resolving the target would read ``self.pdf``
        # and re-enter this method without bound; fail fast instead.
        if name in ("idnum", "generation", "pdf"):
            raise AttributeError(name)
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Same id and generation, and the very same owning pdf instance.
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Raises:
            PdfStreamError: If the stream ends before both numbers are read.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    # tolerate extra whitespace before the generation number
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

478

479

# Digit budget used by FloatObject.myrepr(); module-level so users can adjust it.
FLOAT_WRITE_PRECISION = 8  # shall be min 5 digits max, allow user adj


class FloatObject(float, PdfObject):
    """PDF real number; invalid input falls back to ``0.0`` with a warning."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # ``as_numeric`` must be *called*: hashing the bound method itself
        # would not reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF files.

        The number of decimals is ``FLOAT_WRITE_PRECISION`` minus the integer
        part of ``log10(abs(self))`` (at least 1); trailing zeros and a
        trailing dot are stripped. Zero is rendered as ``"0.0"``.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

540

541

class NumberObject(int, PdfObject):
    """PDF integer; a value that ``int()`` rejects falls back to ``0`` with a warning."""

    # First byte that cannot belong to a number token.
    # NOTE(review): inside the class ``+-.`` is a *range* (0x2B-0x2E), so ","
    # is accepted as well; left unchanged to preserve current parsing.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion: no detour through repr()/bytes, which would also
        # hit the interpreter's int<->str digit limit for very large values.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation of the integer to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains a ``.``, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)

592

593

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, bytes(self)))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string (``<...>``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then every charset configured on NameObject;
        # the first successful decode wins.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

651

652

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        original_bytes = value if isinstance(value, bytes) else None

        if original_bytes is not None and original_bytes[:2] in {
            codecs.BOM_UTF16_LE,
            codecs.BOM_UTF16_BE,
        }:
            # Bytes input announcing UTF-16 through its BOM: decode it as
            # such, keeping whatever prefix is decodable if the data is
            # broken.
            try:
                decoded = original_bytes.decode("utf-16")
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                decoded = exception.object[: exception.start].decode("utf-16")
            text_string_object = str.__new__(cls, decoded)
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            # Every instance must carry this flag: clone() reads it
            # unconditionally.
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
            return text_string_object

        if original_bytes is not None:
            # Other bytes input is mapped byte-for-byte onto code points.
            value = original_bytes.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        try:
            encode_pdfdocencoding(text_string_object)
            text_string_object.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            # Not representable in PDFDocEncoding: UTF-16BE will be used.
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry over the encoding bookkeeping of the source string.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def _encode_utf16(self) -> bytes:
        """Encode as UTF-16 following ``utf16_bom``; no BOM is emitted if none is recorded."""
        if self.utf16_bom == codecs.BOM_UTF16_LE:
            return codecs.BOM_UTF16_LE + self.encode("utf-16le")
        if self.utf16_bom == codecs.BOM_UTF16_BE:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        return self.encode("utf-16be")

    def get_original_bytes(self) -> bytes:
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            return self._encode_utf16()
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        The original bytes are preferred when known; otherwise
        PDFDocEncoding is tried (unless UTF-16 was detected) and UTF-16 is
        the fallback.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        if not self.autodetect_utf16:
            # Try to write the string out as a PDFDocEncoding encoded string.
            # It's nicer to look at in the PDF file.
            try:
                return encode_pdfdocencoding(self)
            except UnicodeEncodeError:
                pass
        return self._encode_utf16()

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the string as a literal string ``( ... )`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for code in bytearr:
            c = bytes((code,))
            if not c.isalnum() and c != b" ":
                # Everything but ASCII alphanumerics and spaces is written as
                # a three-digit octal escape.
                stream.write(b"\\%03o" % code)
            else:
                stream.write(c)
        stream.write(b")")

795

796

class NameObject(str, PdfObject):  # noqa: SLOT000
    """PDF name object (for example ``/Type``), stored including its leading ``/``."""

    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that are written as ``#XX`` escapes: ``#``, the delimiters
    # listed here, and every code point from 0 to 32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name encoded for writing.

        The first character is emitted as is (a deprecation is reported when
        it is not ``/``). After it, characters above ``~`` are written as one
        ``#XX`` escape per UTF-8 byte, characters listed in
        ``renumber_table`` use their escape, and the rest are UTF-8 encoded.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        # A single pass suffices: spaces are outside the allowed set too.
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape (two hexadecimal digits) by its byte.

        A ``#`` that is not followed by two hexadecimal digits is kept
        unchanged together with the bytes after it. A ``#`` produced by
        decoding ``#23`` is not interpreted again.
        """
        i = sin.find(b"#")
        while i >= 0:
            hex_digits = sin[i + 1 : i + 3]
            if len(hex_digits) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_digits) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Always continue from the next real ``#``. Advancing by a single
            # byte would treat an ordinary byte as an escape marker.
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried, in order, when decoding a name read from a file.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name object from ``stream``.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name
                cannot be decoded with any of ``CHARSETS`` and ``pdf.strict``
                is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

918

919

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    encoded = []
    try:
        for character in unicode_string:
            encoded.append(_pdfdoc_encoding_rev[character])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(encoded)

931

932

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve only once: for indirect references this is a lookup in the
    # owning pdf.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

441 statements