Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from collections.abc import Sequence
33from math import log10
34from struct import iter_unpack
35from typing import Any, Callable, ClassVar, Optional, Union, cast
37if sys.version_info[:2] >= (3, 10):
38 from typing import TypeGuard
39else:
40 from typing_extensions import TypeGuard # PEP 647
42if sys.version_info >= (3, 11):
43 from typing import Self
44else:
45 from typing_extensions import Self
47from .._codecs import _pdfdoc_encoding_rev
48from .._protocols import PdfObjectProtocol, PdfWriterProtocol
49from .._utils import (
50 StreamType,
51 classproperty,
52 deprecation_no_replacement,
53 deprecation_with_replacement,
54 logger_warning,
55 read_non_whitespace,
56 read_until_regex,
57)
58from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
60__author__ = "Mathieu Fenniak"
61__author_email__ = "biziqe@mathieu.fenniak.net"
class PdfObject(PdfObjectProtocol):
    """Common base class of the PDF object types defined in this module."""

    # Hash constructor used by hash_value(); defaults to SHA-1.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the string form of the object, encoded to bytes."""
        as_text = f"{self}"
        return as_text.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hexdigest>"`` computed over hash_value_data()."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: "PdfObject", pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> "PdfObject":
        """
        Register ``clone`` in ``pdf_dest._objects`` when this object carries an
        ``indirect_reference`` attribute; if the source object was already
        translated into ``pdf_dest`` (and no duplicate is forced), the
        previously registered object is returned instead.

        Args:
            clone: The freshly created copy of this object.
            pdf_dest: Target writer the clone is registered in.
            force_duplicate: When ``True``, skip the "already cloned" shortcuts.

        Returns:
            The clone, or the object already registered for the same source.

        """
        if not force_duplicate:
            try:
                existing_ref = clone.indirect_reference
                if existing_ref is not None and existing_ref.pdf == pdf_dest:
                    # The clone already belongs to the destination.
                    return clone
            except Exception:
                # Any failure while inspecting the clone falls through to the
                # registration logic below.
                pass

        try:
            ind = self.indirect_reference
        except AttributeError:
            # No indirect reference on the source: nothing to register.
            return clone

        keep_idnum = (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        )
        # Either reuse the source object number or take the next free slot.
        i = ind.idnum if keep_idnum else len(pdf_dest._objects) + 1

        if ind is not None:
            source_key = id(ind.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # Keep a reference to the source so that its id() stays valid.
                pdf_dest._id_translated[source_key]["PreventGC"] = ind.pdf  # type: ignore[index]
            translation = pdf_dest._id_translated[source_key]
            if not force_duplicate and ind.idnum in translation:
                obj = pdf_dest.get_object(translation[ind.idnum])
                assert isinstance(obj, PdfObject), "mypy"
                return obj
            translation[ind.idnum] = i

        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            # Slot does not exist yet: append and use the resulting number.
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references; a direct object resolves to itself."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        fresh = NullObject()
        registered = self._reference_clone(fresh, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value (only the class, as there is no value).

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``null`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Read the four bytes ``null`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are not ``b"null"``.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All NullObject instances compare equal to each other.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
class BooleanObject(PdfObject):
    """A PDF boolean, wrapping the Python value stored in ``value``."""

    value: bool

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObject instances and with plain bools.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``b"true"`` nor ``b"fals"``.

        """
        token = stream.read(4)
        if token == b"true":
            return BooleanObject(True)
        if token == b"fals":
            # Consume the fifth byte of "false" (its value is not checked).
            stream.read(1)
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")
class IndirectObject(PdfObject):
    """
    Reference to an object identified by ``idnum`` and ``generation`` inside ``pdf``.

    The pointed object is resolved through ``self.pdf.get_object(self)``; several
    dunder methods and unknown attributes are forwarded to that pointed object.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, pointing into ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not return ``self`` even if it
                already belongs to ``pdf_dest``.
            ignore_fields: Passed through to the pointed object's ``clone``.

        Returns:
            An IndirectObject referring to the object inside ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Keep a reference to the source so that its id() stays valid.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The pointed object was already translated into pdf_dest.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a NullObject carrying this reference
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert isinstance(dup, PdfObject), "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf.get_object``."""
        obj: Optional[PdfObject] = self.pdf.get_object(self)
        return obj

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The new reference shares the same ``pdf`` object (it is not copied).
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself an IndirectObject.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object.
        # The instance's own attributes are never forwarded: if one of them is
        # missing (instance created without __init__, e.g. while being copied
        # or unpickled), resolving the pointed object would need ``self.pdf``
        # and re-enter __getattr__ without end.
        if name in ("idnum", "generation", "pdf"):
            raise AttributeError(name)
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to another reference with the same numbers and the very
        # same ``pdf`` object (identity, not equality).
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Read ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Object stored as ``pdf`` on the returned reference.

        Raises:
            PdfStreamError: If the stream ends while reading either number.
            PdfReadError: If the numbers are not followed by ``R``.

        """
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                # Skip whitespace preceding the generation number.
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
# Precision used by FloatObject.myrepr() when choosing how many decimal places
# to write; kept at module level so that it can be adjusted by the user.
# (Original note: "shall be min 5 digits max".)
FLOAT_WRITE_PRECISION = 8


class FloatObject(float, PdfObject):
    """A PDF real number, stored as a Python ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Create the object from anything ``float()`` accepts.

        Args:
            value: Value to convert; on conversion failure a warning is logged
                and ``0.0`` is used instead.
            context: Not used by this implementation.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method itself would
        # not reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the textual form written to PDF streams.

        Zero is rendered as ``"0.0"``. Otherwise the value is formatted in
        fixed-point notation with ``FLOAT_WRITE_PRECISION - int(log10(abs(self)))``
        decimal places (at least one), and trailing zeros and a trailing dot
        are stripped.
        """
        if self == 0:  # type: ignore[comparison-overlap]
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
class NumberObject(int, PdfObject):
    """A PDF integer, stored as a Python ``int``."""

    # Matches the first byte that cannot be part of a number token.
    # NOTE(review): inside the character class, ``+-.`` is a *range* from "+"
    # to ".", so "," and "-" are accepted as number bytes too; left unchanged
    # here because altering it would change what read_from_stream consumes.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """Create the object from ``int(value)``; on ``ValueError`` log a warning and use 0."""
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion: no detour through repr()/bytes, which is subject
        # to the interpreter's limit on int <-> str conversion length.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        Returns:
            A FloatObject if the token contains ``b"."``, otherwise a NumberObject.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string ``<...>``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        hex_digits = binascii.hexlify(self)
        stream.write(b"<")
        stream.write(hex_digits)
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then every charset listed in NameObject.CHARSETS;
        # the first encoding that decodes without error wins.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Create the string and record how it would be encoded back to bytes.

        Args:
            value: Text, or ``bytes``. Bytes starting with a UTF-16 BOM are
                decoded as UTF-16; other bytes are decoded with ``charmap``.

        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                # Keep whatever could be decoded before the failing position.
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # A new instance was created above, so every flag has to be set
            # again -- including autodetect_pdfdocencoding, which clone() reads.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding bookkeeping."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the recorded autodetection flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """Return the bytes to write: original bytes, PDFDocEncoding, or UTF-16."""
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 handling below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a literal ``(...)`` to ``stream``.

        Every byte that is neither alphanumeric nor a space is written as a
        three-digit octal escape. ``encryption_key`` is deprecated.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name such as ``/Type``, stored as a Python ``str``."""

    # Whitespace or a delimiter character ends a name token.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that renumber() writes as "#XX": the delimiters/"#" and the
    # code points 0..32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        chr(code): f"#{code:02X}".encode()
        for code in (*b"#()<>[]{}/%", *range(33))
    }
    # Encodings tried, in order, when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        duplicate = NameObject(self)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NameObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see renumber) to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with special characters written as ``#XX``.

        The first character is emitted unchanged (a deprecation notice is
        issued if it is not ``/``). Of the remaining characters, those above
        ``~`` are written as the ``#XX`` escapes of their UTF-8 bytes, those
        listed in ``renumber_table`` as their table entry, and all others as
        their UTF-8 encoding.
        """
        first = self[0].encode("utf-8")
        if first != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        pieces = [first]
        for char in self[1:]:
            if char > "~":
                pieces.extend(f"#{x:02X}".encode() for x in char.encode("utf-8"))
                continue
            escaped = self.renumber_table.get(char)
            if escaped is None:
                escaped = char.encode("utf-8")
            pieces.append(escaped)
        return b"".join(pieces)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        bare_name = str(self).removeprefix("/")
        bare_name = re.sub(r"\ ", "_", bare_name)
        bare_name = re.sub(r"[^a-zA-Z0-9_-]", "_", bare_name)
        return NameObject("/" + bare_name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        # Deprecated misspelled alias; callers are pointed at ``prefix``.
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace ``#XX`` hexadecimal escapes in ``sin`` by the byte they denote.

        A ``#`` whose following characters cannot be converted is copied
        through unchanged.
        """
        decoded = bytearray()
        pos = 0
        while pos < len(sin):
            if sin[pos:pos + 1] == b"#":
                try:
                    decoded.append(int(sin[pos + 1 : pos + 3], 16))
                except (ValueError, IndexError):
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
                else:
                    pos += 3
                    continue
            decoded.append(sin[pos])
            pos += 1
        return bytes(decoded)

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Args:
            stream: Stream positioned at the leading ``/``.
            pdf: Object whose ``strict`` attribute decides whether an
                undecodable name raises or is decoded with ``charmap``.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name cannot
                be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for charset in NameObject.CHARSETS:
                try:
                    decoded_name = name.decode(charset)
                    return NameObject(decoded_name)
                except Exception:
                    # Try the next charset.
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` by looking up each character in ``_pdfdoc_encoding_rev``.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    encoded = bytearray()
    try:
        for char in unicode_string:
            encoded.append(_pdfdoc_encoding_rev[char])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(encoded)
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` is, or resolves to, nothing.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve only once: for an indirect reference each get_object() call is
    # a lookup in the owning document.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)