Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/

4# Redistribution and use in source and binary forms, with or without

5# modification, are permitted provided that the following conditions are

6# met:

8# * Redistributions of source code must retain the above copyright notice,

9# this list of conditions and the following disclaimer.

10# * Redistributions in binary form must reproduce the above copyright notice,

11# this list of conditions and the following disclaimer in the documentation

12# and/or other materials provided with the distribution.

13# * The name of the author may not be used to endorse or promote products

14# derived from this software without specific prior written permission.

15#

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

26# POSSIBILITY OF SUCH DAMAGE.

27import binascii

28import codecs

29import hashlib

30import re

31import sys

32from binascii import unhexlify

33from collections.abc import Sequence

34from math import log10

35from struct import iter_unpack

36from typing import Any, Callable, ClassVar, Optional, Union, cast

38if sys.version_info[:2] >= (3, 10):

39 from typing import TypeGuard

40else:

41 from typing_extensions import TypeGuard # PEP 647

43from .._codecs import _pdfdoc_encoding_rev

44from .._protocols import PdfObjectProtocol, PdfWriterProtocol

45from .._utils import (

46 StreamType,

47 classproperty,

48 deprecation_no_replacement,

49 deprecation_with_replacement,

50 logger_warning,

51 read_non_whitespace,

52 read_until_regex,

53)

54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError

# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # Digest constructor used by ``hash_value``.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Subclasses are expected to override this; the base implementation
        always raises.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the formatted text of the object, encoded to bytes."""
        text = f"{self}"
        return text.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).
        The base implementation always raises; subclasses provide the clone.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly created copy of ``self``.
            pdf_dest: Writer receiving the copy.
            force_duplicate: When ``True``, skip the "already cloned" shortcuts
                and always register ``clone`` as a new entry.

        Returns:
            The clone, or the previously registered copy when one is found in
            ``pdf_dest._id_translated`` and ``force_duplicate`` is false.

        """
        if not force_duplicate:
            # Best effort: the clone may have no indirect_reference at all
            # (or it may be None); any failure just means "not yet in pdf_dest".
            try:
                if clone.indirect_reference.pdf == pdf_dest:
                    return clone
            except Exception:
                pass

        # if hasattr(clone, "indirect_reference"):
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            # The source was never an indirect object: nothing to register.
            return clone

        # Keep the original object number only in incremental mode, when the
        # source belongs to the writer's own reader and the number fits.
        keep_number = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        number = source_ref.idnum if keep_number else len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            translations = pdf_dest._id_translated
            if source_key not in translations:
                translations[source_key] = {}
                # Hold a reference to the source so its id() stays valid.
                translations[source_key]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            known = translations[source_key]
            if not force_duplicate and source_ref.idnum in known:
                existing = pdf_dest.get_object(known[source_ref.idnum])
                assert existing is not None
                return existing
            known[source_ref.idnum] = number

        try:
            pdf_dest._objects[number - 1] = clone
        except IndexError:
            # Slot does not exist yet: append and use the resulting number.
            pdf_dest._objects.append(clone)
            number = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(number, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError

200

201

class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        fresh = NullObject()
        registered = self._reference_clone(fresh, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # There is no value to hash besides the class itself.
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Read four bytes from ``stream`` and expect them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are not ``b"null"``.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All NullObject instances compare equal to each other.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()

248

249

class BooleanObject(PdfObject):
    """A PDF boolean (``true`` / ``false``) wrapping ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObject instances and with plain bools;
        # anything else is never equal.
        if isinstance(o, BooleanObject):
            o = o.value
        elif not isinstance(o, bool):
            return False
        return self.value == o

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Raises:
            PdfReadError: If the first four bytes are neither ``b"true"``
                nor ``b"fals"``.

        """
        head = stream.read(4)
        if head == b"true":
            return BooleanObject(True)
        if head != b"fals":
            raise PdfReadError("Could not read Boolean object")
        # Consume the fifth byte of "false"; its value is not checked.
        stream.read(1)
        return BooleanObject(False)

310

311

class IndirectObject(PdfObject):
    """
    Reference to an object of a PDF, identified by object number and generation.

    Item access, membership tests, iteration and unknown attributes are
    forwarded to the object returned by ``self.pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not short-circuit on references
                that already belong to ``pdf_dest``.
            ignore_fields: Forwarded to the ``clone`` of the pointed object.

        Returns:
            A reference valid within ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Hold a reference to the source so its id() stays valid.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The pointed object was already copied: reuse that copy.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Substitute a null object that remembers this reference.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf``."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The ``pdf`` attribute is shared with the copy, not deep-copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        if name in ("idnum", "generation", "pdf"):
            # These are this object's own attributes. If they are missing
            # (e.g. on an instance created without __init__), resolving the
            # pointed object would read ``self.pdf`` and re-enter this method
            # without bound; fail with a regular AttributeError instead.
            raise AttributeError(name)
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to references with the same numbers on the very same pdf.
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``"<idnum> <generation> R"`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Read ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Object stored as the ``pdf`` attribute of the result.

        Raises:
            PdfStreamError: If the stream ends before both numbers are read.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        # Object number: everything up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: skip leading whitespace, stop at the next one.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)

473

474

# Precision budget used by FloatObject.myrepr(): the number of decimals written
# is max(1, FLOAT_WRITE_PRECISION - int(log10(abs(value)))).
# Original note says it should be at least 5 digits (presumably a PDF
# requirement — TODO confirm); users may adjust it.
FLOAT_WRITE_PRECISION = 8

476

477

class FloatObject(float, PdfObject):
    """A PDF real number, stored as a Python ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Unused by this implementation.

        Returns:
            The new object; ``0.0`` (with a warning) when conversion fails.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method itself would
        # not reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF streams.

        The number of decimals shrinks as the magnitude grows (see
        ``FLOAT_WRITE_PRECISION``); trailing zeros and a trailing dot are
        stripped.
        """
        if self == 0:
            return "0.0"
        # NOTE(review): inf/nan make log10()/int() raise here — confirm
        # whether non-finite values can reach this point.
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``, UTF-8 encoded."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))

535

536

class NumberObject(int, PdfObject):
    """A PDF integer, stored as a Python ``int``."""

    # Matches the first byte that cannot belong to a number.
    # NOTE(review): inside the class, ``+-.`` is a range ('+' through '.'),
    # so ',' is also accepted as a number byte — confirm this is intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """Build from anything ``int()`` accepts; fall back to 0 on ValueError."""
        try:
            parsed = int(value)
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            parsed = 0
        return int.__new__(cls, parsed)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = NumberObject(self)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NumberObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.as_numeric())
        return hash(key)

    def as_numeric(self) -> int:
        """Return the value as a plain ``int`` (via its decimal text)."""
        digits = repr(self).encode("utf8")
        return int(digits)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        text = repr(self)
        stream.write(text.encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A FloatObject when the token contains a ``.``, otherwise a
            NumberObject.

        """
        token = read_until_regex(stream, NumberObject.NumberPattern)
        return FloatObject(token) if b"." in token else NumberObject(token)

587

588

class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        payload = bytes(self)
        return hash((self.__class__, payload))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes to ``stream`` as a hexadecimal string ``<...>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        hex_digits = binascii.hexlify(self)
        stream.write(hex_digits)
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then every encoding of NameObject.CHARSETS,
        # and return the first successful decoding.
        for enc in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(enc)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")

646

647

class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text is to be written as UTF-16.
    autodetect_utf16: bool
    # BOM to prepend when writing UTF-16 (b"" when none is recorded).
    utf16_bom: bytes
    # Bytes the object was created from, when it was created from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the object from ``str`` or ``bytes``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16 (on a decoding
        error, only the part before the error is kept and a warning is
        logged); other bytes are decoded with "charmap". All encoding flags
        are initialised on every path.
        """
        original_bytes = value if isinstance(value, bytes) else None
        # The value of `original_bytes` is only set for inputs being `bytes`.
        # If this is UTF-16 data according to the BOM (first two characters),
        # perform special handling. All other cases should not need any special conversion
        # due to already being a string.
        is_utf16 = original_bytes is not None and original_bytes[:2] in {
            codecs.BOM_UTF16_LE,
            codecs.BOM_UTF16_BE,
        }
        if is_utf16:
            assert original_bytes is not None, "mypy"
            try:
                value = original_bytes.decode("utf-16")
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                value = exception.object[: exception.start].decode("utf-16")
        elif original_bytes is not None:
            value = original_bytes.decode("charmap")

        # Create the string once and always set every flag, so that later
        # reads of any of them (e.g. in clone()) cannot hit a missing attribute.
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if is_utf16:
            assert original_bytes is not None, "mypy"
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                # Not representable in PDFDocEncoding: write as UTF-16BE.
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry over the encoding information of the source.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetection flags.

        Raises:
            Exception: If neither autodetection flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        The original bytes are returned when known; otherwise PDFDocEncoding
        is tried (unless UTF-16 was detected) with UTF-16 as the fallback.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 fallback below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string to ``stream`` as a literal string ``(...)``.

        Every byte that is neither alphanumeric nor a space is written as a
        three-digit octal escape.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")

790

791

class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (for example ``/Type``), stored as a Python ``str``."""

    # Matches the first byte sequence that ends a name token.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that renumber() writes as "#XX": the listed delimiters and
    # every character with a code point below 33.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with special characters written as ``#XX``.

        The first character is copied as is (a deprecation is reported when
        it is not ``/``). After it, characters above ``~`` are escaped byte by
        byte from their UTF-8 encoding, characters of ``renumber_table`` use
        their table entry and all others are copied unchanged.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        # A space is not in the allowed set, so one substitution covers it too.
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated misspelt alias of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape (two hexadecimal digits) by its byte.

        A ``#`` whose following characters cannot be converted from hex is
        left untouched, together with the characters after it.

        Args:
            sin: Raw name bytes.

        Returns:
            The bytes with valid escapes decoded.

        """
        i = sin.find(b"#", 0)
        while i >= 0:
            try:
                sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
            except ValueError:
                # if the 2 characters after # can not be converted to hex
                # we change nothing and carry on
                pass
            # Always continue from the next real "#". Stepping one byte
            # forward instead would treat an arbitrary position as an escape
            # and corrupt the data that follows an invalid one.
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried, in order, when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Stream positioned at the leading ``/``.
            pdf: Object whose ``strict`` attribute selects between a warning
                and an error when the name cannot be decoded.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if decoding fails
                and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    # Try the next encoding.
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e

913

914

def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Args:
        unicode_string: Text to encode.

    Returns:
        One byte per character, looked up in the table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    try:
        encoded = bytes(_pdfdoc_encoding_rev[char] for char in unicode_string)
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return encoded

926

927

def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` represents "no value".

    Args:
        x: Any value.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject. Values that are not PdfObject
        instances (other than None) give False.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve only once: for an indirect reference each get_object() call
    # goes back to the owning pdf.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%

438 statements