Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from binascii import unhexlify
33from collections.abc import Sequence
34from math import log10
35from struct import iter_unpack
36from typing import Any, Callable, ClassVar, Optional, Union, cast
38if sys.version_info[:2] >= (3, 10):
39 from typing import TypeGuard
40else:
41 from typing_extensions import TypeGuard # PEP 647
43from .._codecs import _pdfdoc_encoding_rev
44from .._protocols import PdfObjectProtocol, PdfWriterProtocol
45from .._utils import (
46 StreamType,
47 classproperty,
48 deprecation_no_replacement,
49 deprecation_with_replacement,
50 logger_warning,
51 read_non_whitespace,
52 read_until_regex,
53)
54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
# Module-level authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
class PdfObject(PdfObjectProtocol):
    """Base class of the PDF object types defined in this module."""

    # Digest constructor used by hash_value().
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Subclasses are expected to override this; the base implementation
        always raises.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        message = f"{self.__class__.__name__} does not implement .hash_bin() so far"
        raise NotImplementedError(message)

    def hash_value_data(self) -> bytes:
        """Return the bytes that ``hash_value`` feeds to ``hash_func``."""
        rendered = f"{self}"
        return rendered.encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        class_name = self.__class__.__name__
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{class_name}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        message = f"{self.__class__.__name__} does not implement .clone so far"
        raise NotImplementedError(message)

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly created copy of ``self``.
            pdf_dest: Writer that shall own the copy.
            force_duplicate: When ``True``, never reuse an earlier copy.

        Returns:
            The clone, or the object previously registered in ``pdf_dest``
            for the same source reference.

        """
        # The clone may already belong to pdf_dest. Any failure while probing
        # (missing attribute, ``None`` reference, ...) just means "not yet".
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass

        # Without an indirect_reference attribute on the source there is
        # nothing to register.
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # Keep the original object number only for an incremental writer whose
        # reader owns the source and whose table already contains that number.
        keep_number = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        if keep_number:
            number = source_ref.idnum
        else:
            number = len(pdf_dest._objects) + 1

        if source_ref is not None:
            origin = id(source_ref.pdf)
            if origin not in pdf_dest._id_translated:
                pdf_dest._id_translated[origin] = {}
                # Hold a reference to the source document under this key.
                pdf_dest._id_translated[origin]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            already_translated = (
                not force_duplicate
                and source_ref.idnum in pdf_dest._id_translated[origin]
            )
            if already_translated:
                existing = pdf_dest.get_object(
                    pdf_dest._id_translated[origin][source_ref.idnum]
                )
                assert existing is not None
                return existing
            pdf_dest._id_translated[origin][source_ref.idnum] = number

        # Store into the chosen slot, or append when that slot does not exist.
        try:
            pdf_dest._objects[number - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            number = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(number, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Read four bytes from ``stream`` and return a ``NullObject``.

        Raises:
            PdfReadError: If the four bytes are not ``null``.

        """
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All null objects compare equal to each other.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
class BooleanObject(PdfObject):
    """A PDF boolean (``true`` / ``false``) wrapping an arbitrary ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` to ``stream`` depending on ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        keyword = b"true" if self.value else b"false"
        stream.write(keyword)

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``true`` nor ``fals``.

        """
        token = stream.read(4)
        if token == b"true":
            parsed = True
        elif token == b"fals":
            # Consume the fifth byte of "false"; its value is not checked.
            stream.read(1)
            parsed = False
        else:
            raise PdfReadError("Could not read Boolean object")
        return BooleanObject(parsed)
class IndirectObject(PdfObject):
    """
    Reference (``<idnum> <generation> R``) to an object owned by ``pdf``.

    Item access, iteration, attribute access and numeric/string conversion
    are forwarded to the object that ``pdf.get_object(self)`` returns.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers that points into ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not short-circuit on a
                reference that already belongs to ``pdf_dest``.
            ignore_fields: Passed through to the pointed object's ``clone``.

        Returns:
            A reference valid in ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Hold a reference to the source document under this key.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The pointed object was already copied: reuse its translation.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Substitute a null object so that cloning can proceed.
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Return whatever ``self.pdf.get_object(self)`` resolves to."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning document is shared with the copy, not deep-copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference, rejecting a target that is itself a reference.

        Raises:
            PdfStreamError: If the resolved object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        if name in ("idnum", "generation", "pdf"):
            # These live in the instance dict, so reaching this point means the
            # instance was created without __init__ (as the copy and pickle
            # machinery may do). Resolving the pointed object needs self.pdf,
            # so delegating here would recurse without bound.
            raise AttributeError(name)
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Same numbers AND the very same owning document (identity, not equality).
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Stream positioned at the first byte of the object number.
            pdf: Document the new reference will point into.

        Returns:
            The parsed reference.

        Raises:
            PdfStreamError: If the stream ends before both numbers are read.
            PdfReadError: If the token after the two numbers is not ``R``.

        """
        # Object number: every byte up to the first whitespace.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: skip leading whitespace, then read up to the
        # next whitespace.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
# Digit budget used by FloatObject.myrepr(): the number of decimals written is
# this value minus the integer part of log10(abs(value)), and never less than 1.
# NOTE(review): the inline remark presumably means the value is intended to be
# adjustable by users — confirm.
FLOAT_WRITE_PRECISION = 8  # shall be min 5 digits max, allow user adj
class FloatObject(float, PdfObject):
    """A PDF real number, stored as a Python ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Source value; anything that ``float()`` rejects yields
                ``0.0`` after a warning is logged.
            context: Not used by this constructor.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method itself would
        # not reflect the numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF streams.

        Zero is rendered as ``0.0``. Otherwise the number of decimals is
        ``FLOAT_WRITE_PRECISION - int(log10(abs(self)))`` (at least 1) and
        trailing zeros, then a trailing dot, are stripped.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
class NumberObject(int, PdfObject):
    """A PDF integer, stored as a Python ``int``."""

    # Matches the first byte that cannot belong to a number.
    # NOTE(review): inside the class, ``+-.`` is a *range* (0x2B-0x2E), so
    # ``,`` is accepted as a number byte as well — confirm whether intended.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """Build from anything ``int()`` accepts; a ``ValueError`` yields 0 with a warning."""
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; no need to go through the decimal text form.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains a ``.``, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            ByteStringObject(bytes(self)), pdf_dest, force_duplicate
        )
        return cast("ByteStringObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes to ``stream`` as a hexadecimal string ``<...>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        hex_digits = binascii.hexlify(self)
        stream.write(b"<")
        stream.write(hex_digits)
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then every encoding listed in NameObject.CHARSETS;
        # the first one that decodes without error wins.
        candidates = ("utf-16", *NameObject.CHARSETS)
        for encoding in candidates:
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text is (to be) encoded as UTF-16.
    autodetect_utf16: bool
    # BOM to emit with UTF-16 output; b"" when none was detected.
    utf16_bom: bytes
    # Bytes the object was built from, when it was built from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the object from ``str`` or ``bytes``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16 (truncated at
        the first undecodable position, with a warning). Other bytes are
        decoded with ``charmap``. For non-BOM input the encoding flags are set
        by probing whether the text fits PDFDocEncoding.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # The object was re-created above, so *every* instance attribute
            # has to be assigned again; a missing autodetect_pdfdocencoding
            # would make clone() fail with AttributeError.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # Carry over the detection state instead of re-detecting it.
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """Re-encode the text according to the autodetection flags."""
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        The stored original bytes are preferred; otherwise PDFDocEncoding is
        tried, and UTF-16 (with the recorded BOM, if any) is the fallback.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 fallback below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the string to ``stream`` as a literal string ``(...)``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            # Everything except ASCII alphanumerics and the space is written
            # as a three-digit octal escape.
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name (``/Something``), stored as a Python ``str``."""

    # Bytes that terminate a name while reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters that renumber() writes as ``#XX``: the delimiter characters,
    # ``#`` itself, and code points 0-32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with ``#XX`` escapes applied.

        The first character is copied verbatim. Of the remaining characters,
        those above ``~`` are escaped byte by byte from their UTF-8 encoding,
        those listed in ``renumber_table`` use the table entry, and all others
        are copied.
        """
        # Slice instead of index so that an empty name does not raise IndexError.
        out = self[:1].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape (two hex digits) by the byte it encodes.

        A ``#`` that is not followed by two hexadecimal digits is left
        untouched, together with the bytes after it.

        Args:
            sin: Raw name bytes.

        Returns:
            The name with valid escapes decoded.

        """
        i = sin.find(b"#")
        while i >= 0:
            hex_pair = sin[i + 1 : i + 3]
            # Fewer than two bytes left: not an escape. (unhexlify(b"") would
            # "succeed" and swallow the '#'.)
            if len(hex_pair) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_pair) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Resume at the next real '#'. Starting one past i also skips a
            # '#' that was just produced by decoding "#23".
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried, in order, when decoding a name read from a stream.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Stream positioned on the leading ``/``.
            pdf: Reader; only its ``strict`` flag is consulted.

        Returns:
            The decoded name.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if the name
                cannot be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    encoded = bytearray()
    try:
        for character in unicode_string:
            encoded.append(_pdfdoc_encoding_rev[character])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(encoded)
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Args:
        x: Any value.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        is None or a NullObject (this covers a NullObject itself and
        references resolving to nothing).

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once and test the result, rather than resolving twice.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)