Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from binascii import unhexlify
33from collections.abc import Sequence
34from math import log10
35from struct import iter_unpack
36from typing import Any, Callable, ClassVar, Optional, Union, cast
38if sys.version_info[:2] >= (3, 10):
39 from typing import TypeGuard
40else:
41 from typing_extensions import TypeGuard # PEP 647
43from .._codecs import _pdfdoc_encoding_rev
44from .._protocols import PdfObjectProtocol, PdfWriterProtocol
45from .._utils import (
46 StreamType,
47 classproperty,
48 deprecation_no_replacement,
49 deprecation_with_replacement,
50 logger_warning,
51 read_non_whitespace,
52 read_until_regex,
53)
54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # Hash constructor applied by hash_value() to the bytes of hash_value_data().
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        The base implementation always raises ``NotImplementedError``.

        Returns:
            Hash considering type and value.

        """
        message = f"{self.__class__.__name__} does not implement .hash_bin() so far"
        raise NotImplementedError(message)

    def hash_value_data(self) -> bytes:
        """Return the formatted text of this object, encoded to bytes."""
        return format(self).encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest>"`` of :meth:`hash_value_data`."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to :meth:`clone`.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        duplicate = self.clone(pdf_dest)
        return duplicate

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).
        The base implementation always raises ``NotImplementedError``.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        """
        message = f"{self.__class__.__name__} does not implement .clone so far"
        raise NotImplementedError(message)

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` when this object carries an
        ``indirect_reference`` attribute; a clone that already belongs to
        ``pdf_dest`` is returned untouched unless ``force_duplicate`` is set.

        Args:
            clone: The freshly created copy of this object.
            pdf_dest: Target the clone is registered in.
            force_duplicate: When ``True``, skip the "already cloned" shortcuts.

        Returns:
            The clone, or the object previously registered in ``pdf_dest``
            for the same source object.

        """
        if not force_duplicate:
            # Best effort: the clone may have no usable indirect_reference.
            try:
                already_in_dest = bool(clone.indirect_reference.pdf == pdf_dest)
            except Exception:
                already_in_dest = False
            if already_in_dest:
                return clone

        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        keep_idnum = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        # Either reuse the source object number or take the next free one.
        slot = source_ref.idnum if keep_idnum else len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                # The "PreventGC" entry stores the source document itself.
                pdf_dest._id_translated[source_key] = {"PreventGC": source_ref.pdf}  # type: ignore[dict-item]
            translations = pdf_dest._id_translated[source_key]
            if not force_duplicate and source_ref.idnum in translations:
                known = pdf_dest.get_object(translations[source_ref.idnum])
                assert known is not None
                return known
            translations[source_ref.idnum] = slot

        try:
            pdf_dest._objects[slot - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            slot = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(slot, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented on the base class."""
        raise NotImplementedError
class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        key_was_passed = encryption_key is not None
        if key_was_passed:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """Consume four bytes from ``stream`` and require them to be ``null``."""
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
class BooleanObject(PdfObject):
    """PDF boolean object wrapping ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        return cast(
            "BooleanObject",
            self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        """Compare against another ``BooleanObject`` or a plain ``bool``."""
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        return "True" if self.value else "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the keyword.

        Returns:
            The parsed BooleanObject.

        Raises:
            PdfReadError: If the bytes read are neither ``true`` nor ``false``.

        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        # "false" is one byte longer than the 4-byte probe. The fifth byte is
        # consumed and verified, so a token such as "falsx" is rejected
        # instead of being silently accepted as False.
        if word == b"fals" and stream.read(1) == b"e":
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")
class IndirectObject(PdfObject):
    """
    Reference to an object of ``pdf`` identified by ``idnum`` and ``generation``.

    Item access, iteration, attribute lookup and numeric/string conversion
    are forwarded to the object returned by ``pdf.get_object(self)``.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers that points into ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not return ``self`` even if it
                already belongs to ``pdf_dest``.
            ignore_fields: Passed through to the ``clone`` of the pointed object.

        Returns:
            A reference valid in ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        source_key = id(self.pdf)
        if source_key not in pdf_dest._id_translated:
            pdf_dest._id_translated[source_key] = {}
            # The "PreventGC" entry stores the source document itself.
            pdf_dest._id_translated[source_key]["PreventGC"] = self.pdf  # type: ignore[index]
        translated = pdf_dest._id_translated[source_key]

        if self.idnum in translated:
            dup = pdf_dest.get_object(translated[self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that remembers this reference
                obj = NullObject()
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Return whatever ``self.pdf.get_object(self)`` resolves this reference to."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning document is shared with the copy, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and refuse a result that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object.
        # If one of the attributes assigned by __init__ is what is missing,
        # the instance was created without __init__ (for example by
        # copy.copy); resolving the target would come back here through
        # ``self.pdf`` until RecursionError, so fail with AttributeError.
        if name in ("idnum", "generation", "pdf"):
            raise AttributeError(name)
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        """Equal when both are references with the same numbers into the same ``pdf`` object."""
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Object stored as the ``pdf`` of the returned reference.

        Returns:
            The parsed reference.

        Raises:
            PdfStreamError: If the stream ends before both numbers are read.
            PdfReadError: If the numbers are not followed by ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                # skip any further whitespace before the generation number
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
# Precision budget used by FloatObject.myrepr() to choose how many decimals
# are written; read at call time, so it can be adjusted by the user.
FLOAT_WRITE_PRECISION = 8


class FloatObject(float, PdfObject):
    """PDF real number, stored as a ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Accepted but not used by this implementation.

        Returns:
            The new object; ``0.0`` (with a logged warning) when the
            conversion fails.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would depend
        # on the identity of this instance rather than on its value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF streams.

        The number of decimals is ``FLOAT_WRITE_PRECISION`` minus the
        truncated base-10 logarithm of the magnitude, but at least one;
        trailing zeros and a trailing dot are then stripped.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write :meth:`myrepr` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
class NumberObject(int, PdfObject):
    """PDF integer, stored as an ``int``."""

    # Matches the first byte that cannot belong to a number token. Inside the
    # character class ``+-.`` is a range (0x2B-0x2E), so ``,`` is accepted as
    # part of the token in addition to ``+``, ``-`` and ``.``.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """
        Build the object from anything ``int()`` accepts.

        Args:
            value: Value to convert.

        Returns:
            The new object; ``0`` (with a logged warning) when the value
            cannot be converted.

        """
        try:
            return int.__new__(cls, int(value))
        except (ValueError, OverflowError):
            # ValueError: unparsable text or NaN; OverflowError: infinity.
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; avoids a round trip through the decimal text.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains ``.``, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes to ``stream`` as a hexadecimal string ``<...>``."""
        key_was_passed = encryption_key is not None
        if key_was_passed:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        hex_digits = binascii.hexlify(self)
        stream.write(hex_digits)
        stream.write(b">")

    def __str__(self) -> str:
        """Decode with ``utf-16`` first, then each codec of ``NameObject.CHARSETS``."""
        for codec_name in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(codec_name)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the string and record how it should be encoded again.

        Args:
            value: ``bytes`` (kept as ``_original_bytes``; decoded as UTF-16
                when they start with a UTF-16 byte order mark, otherwise
                through ``charmap``) or any value accepted by ``str``.

        Returns:
            The new object with ``autodetect_pdfdocencoding``,
            ``autodetect_utf16`` and ``utf16_bom`` set.

        """
        org = None
        if isinstance(value, bytes):
            org = value
            value = value.decode("charmap")

        # Only raw bytes can carry a byte order mark. A ``str`` that merely
        # starts with the same two characters has nothing to re-decode and is
        # handled like any other text below.
        if org is not None and value.startswith(("\xfe\xff", "\xff\xfe")):
            try:
                text = org.decode("utf-16")
            except UnicodeDecodeError as exc:
                logger_warning(
                    f"{exc!s}\ninitial string:{exc.object!r}",
                    __name__,
                )
                # keep the part that could be decoded
                text = exc.object[: exc.start].decode("utf-16")
            o = str.__new__(cls, text)
            o._original_bytes = org
            o.autodetect_utf16 = True
            # Set explicitly so the attribute always exists on the instance.
            o.autodetect_pdfdocencoding = False
            o.utf16_bom = org[:2]
            return o

        o = str.__new__(cls, value)
        o._original_bytes = org
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        try:
            encode_pdfdocencoding(o)
            o.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            o.autodetect_utf16 = True
            o.utf16_bom = codecs.BOM_UTF16_BE
        return o

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # carry over the encoding information of the source string
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the recorded autodetect flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes written for this string.

        The stored original bytes are preferred; otherwise PDFDocEncoding is
        tried (unless UTF-16 was detected), and UTF-16 is the fallback.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        if not self.autodetect_utf16:
            try:
                return encode_pdfdocencoding(self)
            except UnicodeEncodeError:
                pass
        if self.utf16_bom == codecs.BOM_UTF16_LE:
            return codecs.BOM_UTF16_LE + self.encode("utf-16le")
        if self.utf16_bom == codecs.BOM_UTF16_BE:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        return self.encode("utf-16be")

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the string as a literal ``(...)`` with octal escapes."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
class NameObject(str, PdfObject):  # noqa: SLOT000
    """PDF name object; the text is expected to start with ``/``."""

    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written as ``#XX`` escapes by renumber().
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }
    # Codecs tried in order when decoding a name (and by ByteStringObject.__str__).
    CHARSETS = ("utf-8", "gbk", "latin1")

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see :meth:`renumber`) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with ``#XX`` escapes applied.

        The first character is copied as is. After it, characters above
        ``~`` are written as one escape per UTF-8 byte, characters listed in
        ``renumber_table`` use their table entry, and everything else is
        UTF-8 encoded unchanged.
        """
        first = self[0].encode("utf-8")
        if first != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        parts = [first]
        for c in self[1:]:
            if c > "~":
                parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
            else:
                escaped = self.renumber_table.get(c)
                parts.append(c.encode("utf-8") if escaped is None else escaped)
        return b"".join(parts)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        # a space is outside the allowed set, so one substitution covers it
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated misspelled alias of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape in ``sin`` by the byte it denotes.

        A ``#`` that is not followed by two hexadecimal digits is left
        untouched, together with the bytes after it.

        Args:
            sin: Raw name bytes.

        Returns:
            The bytes with all valid escapes decoded.

        """
        i = sin.find(b"#")
        while i >= 0:
            hex_pair = sin[i + 1 : i + 3]
            if len(hex_pair) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_pair) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Always continue from the next real "#": the byte now at position
            # i is either a decoded value or a "#" that must stay as it is.
            i = sin.find(b"#", i + 1)
        return sin

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Stream positioned at the leading ``/``.
            pdf: Object whose ``strict`` attribute decides whether an
                undecodable name raises or is decoded through ``charmap``.

        Returns:
            The parsed name.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name
                cannot be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode text through the ``_pdfdoc_encoding_rev`` table.

    Args:
        unicode_string: Text to encode.

    Returns:
        One byte per character of the input.

    Raises:
        UnicodeEncodeError: If a character is missing from the table.

    """
    table = _pdfdoc_encoding_rev
    try:
        codes = [table[char] for char in unicode_string]
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(codes)
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Args:
        x: Any value.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once; get_object() is a lookup in the owning document for
    # indirect references.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)