Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_base.py: 53%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from binascii import unhexlify
33from collections.abc import Sequence
34from math import log10
35from struct import iter_unpack
36from typing import Any, Callable, ClassVar, Optional, Union, cast
38if sys.version_info[:2] >= (3, 10):
39 from typing import TypeGuard
40else:
41 from typing_extensions import TypeGuard # PEP 647
43if sys.version_info >= (3, 11):
44 from typing import Self
45else:
46 from typing_extensions import Self
48from .._codecs import _pdfdoc_encoding_rev
49from .._protocols import PdfObjectProtocol, PdfWriterProtocol
50from .._utils import (
51 StreamType,
52 classproperty,
53 deprecation_no_replacement,
54 deprecation_with_replacement,
55 logger_warning,
56 read_non_whitespace,
57 read_until_regex,
58)
59from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
61__author__ = "Mathieu Fenniak"
62__author_email__ = "biziqe@mathieu.fenniak.net"
class PdfObject(PdfObjectProtocol):
    """Common base class of the PDF object types defined in this module."""

    # Hash constructor used by ``hash_value`` to digest ``hash_value_data``.
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed into ``hash_func``: the formatted object, encoded."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to ``clone``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` when this object carries an
        ``indirect_reference`` attribute; if it was already registered, return
        the registered object instead.

        Args:
            clone: The freshly created copy of this object.
            pdf_dest: Target whose ``_objects`` / ``_id_translated`` are updated.
            force_duplicate: When ``True``, skip both "already cloned" shortcuts.

        Returns:
            The clone, or the previously registered object for the same source id.

        """
        # Shortcut: the clone already belongs to the destination. Any failure
        # while probing (e.g. no indirect_reference on the clone) means "no".
        try:
            already_in_dest = bool(
                not force_duplicate and clone.indirect_reference.pdf == pdf_dest
            )
        except Exception:
            already_in_dest = False
        if already_in_dest:
            return clone

        # Objects without the attribute are returned unregistered.
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # Incremental writers keep the source object number when it fits.
        keep_number = (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        )
        slot = source_ref.idnum if keep_number else len(pdf_dest._objects) + 1

        if source_ref is not None:
            origin = id(source_ref.pdf)
            if origin not in pdf_dest._id_translated:
                pdf_dest._id_translated[origin] = {}
                # Keeps a reference to the source so that its id() stays valid.
                pdf_dest._id_translated[origin]["PreventGC"] = source_ref.pdf  # type: ignore[index]
            translation = pdf_dest._id_translated[origin]
            if not force_duplicate and source_ref.idnum in translation:
                known = pdf_dest.get_object(translation[source_ref.idnum])
                assert known is not None
                return known
            translation[source_ref.idnum] = slot

        try:
            pdf_dest._objects[slot - 1] = clone
        except IndexError:
            # Slot lies past the end of the list: append and use the new position.
            pdf_dest._objects.append(clone)
            slot = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(slot, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialise the object to ``stream``; not implemented on the base class."""
        raise NotImplementedError
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # A null object has no value, so only the class takes part in the hash.
        type_only = (self.__class__,)
        return hash(type_only)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the literal ``null`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and expect them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are not ``b"null"``.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # All null objects compare equal to each other.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
class BooleanObject(PdfObject):
    """A PDF boolean (``true`` / ``false``) wrapping an arbitrary ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects and with plain bools only.
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` depending on the truthiness of ``value``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read a boolean keyword from ``stream``.

        Four bytes are read first; for ``fals`` one more byte is consumed
        (it is not inspected).

        Raises:
            PdfReadError: If the four bytes are neither ``true`` nor ``fals``.

        """
        keyword = stream.read(4)
        if keyword == b"fals":
            stream.read(1)
            return BooleanObject(False)
        if keyword == b"true":
            return BooleanObject(True)
        raise PdfReadError("Could not read Boolean object")
class IndirectObject(PdfObject):
    """
    Reference (``<idnum> <generation> R``) to an object held by ``pdf``.

    Unknown attributes, item access, iteration and numeric/string conversion
    are forwarded to the object the reference points to.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers that points into ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``True``, do not return ``self`` even if it
                already points into ``pdf_dest``.
            ignore_fields: Passed through to the pointed object's ``clone``.

        Returns:
            A reference valid inside ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            # Keeps a reference to the source so that its id() stays valid.
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The pointed object was cloned before: reuse its destination entry.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that remembers this reference
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve the reference through ``self.pdf.get_object``."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # ``pdf`` is shared, not copied.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a result that is itself a reference.

        Raises:
            PdfStreamError: If the pointed object is an ``IndirectObject``.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # ``pdf`` is needed to resolve the pointed object. When it is missing
        # (bare instance created by copy/pickle before its state is restored),
        # delegating would call get_object() -> self.pdf -> __getattr__ again
        # and end in RecursionError instead of the AttributeError that
        # hasattr()/getattr() expect.
        if name == "pdf":
            raise AttributeError(name)
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to references with the same numbers into the very same pdf.
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Source positioned at the first character of the object number.
            pdf: Owner stored on the returned reference.

        Raises:
            PdfStreamError: If the stream ends while reading either number.
            PdfReadError: If the token after the numbers is not ``R``.

        """
        # Object number: everything up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation: leading whitespace is skipped, then read up to whitespace.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
# Controls how many decimal places ``FloatObject.myrepr`` emits:
# FLOAT_WRITE_PRECISION minus the integer part of log10(|value|), at least 1.
# Module-level so that users can adjust it.
FLOAT_WRITE_PRECISION = 8


class FloatObject(float, PdfObject):
    """A PDF real number."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> Self:
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert.
            context: Unused; kept for interface compatibility.

        Returns:
            The new object; ``0.0`` (with a logged warning) if conversion fails.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method would not
        # reflect the numeric value (NumberObject.hash_bin calls it as well).
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """Return the decimal text written to the PDF, without trailing zeros."""
        if self == 0:
            return "0.0"
        # Fewer decimals for large magnitudes, more for small ones.
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
class NumberObject(int, PdfObject):
    """A PDF integer number."""

    # Matches the first byte that cannot belong to a number token.
    # NOTE(review): inside the character class ``+-.`` is a *range*
    # (0x2B-0x2E), so ``,`` is accepted as part of a number too - confirm
    # whether that is intended before changing the pattern.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> Self:
        """
        Build the object from anything ``int()`` accepts.

        Returns:
            The new object; ``0`` (with a logged warning) if ``int(value)``
            raises ``ValueError``.

        """
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; no need to format the number as text and parse
        # it back.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains ``.``, otherwise a
            ``NumberObject``.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        raw = bytes(self)
        return hash((self.__class__, raw))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the bytes as a hexadecimal string: ``<`` hex digits ``>``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        hex_digits = binascii.hexlify(self)
        stream.write(b"<")
        stream.write(hex_digits)
        stream.write(b">")

    def __str__(self) -> str:
        """
        Decode with UTF-16 first, then with each of ``NameObject.CHARSETS``.

        Raises:
            PdfReadError: If none of the encodings can decode the bytes.

        """
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text is to be encoded as UTF-16.
    autodetect_utf16: bool
    # BOM to emit in front of UTF-16 output (``b""`` when none applies).
    utf16_bom: bytes
    # Bytes the object was created from, if it was created from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> Self:
        """
        Build the text string from ``str`` or ``bytes``.

        Bytes starting with a UTF-16 BOM are decoded as UTF-16 (on a decode
        error the valid prefix is used and a warning is logged); any other
        bytes are decoded one byte per character with ``charmap``.
        """
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            # A new instance was created above, so every attribute has to be
            # set again - including autodetect_pdfdocencoding, which clone()
            # and get_original_bytes() read unconditionally.
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding attributes."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetect flags.

        Raises:
            Exception: If neither autodetect flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write: the original bytes when known, otherwise
        PDFDocEncoding when possible, otherwise UTF-16.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        if not self.autodetect_utf16:
            try:
                return encode_pdfdocencoding(self)
            except UnicodeEncodeError:
                pass  # fall through to UTF-16
        if self.utf16_bom == codecs.BOM_UTF16_LE:
            return codecs.BOM_UTF16_LE + self.encode("utf-16le")
        if self.utf16_bom == codecs.BOM_UTF16_BE:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        return self.encode("utf-16be")

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write a literal string ``( ... )``, octal-escaping every byte that
        is neither alphanumeric nor a space."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name object (``/Name``)."""

    # Bytes that end a name token when reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written as ``#xx``: the listed delimiters, ``#`` itself and
    # code points 0-32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the ``#xx``-escaped name (see ``renumber``) to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with irregular characters written as ``#xx``.

        The first character is emitted unchanged (a deprecation is reported
        when it is not ``/``). Characters above ``~`` are escaped byte by byte
        from their UTF-8 encoding; the others are looked up in
        ``renumber_table``.
        """
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated alias; use ``prefix`` instead."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#xx`` escape (two hexadecimal digits) by its byte.

        A ``#`` that is not followed by two hexadecimal digits is left
        untouched, together with the bytes following it.

        Args:
            sin: Raw name bytes.

        Returns:
            The bytes with all valid escapes decoded.

        """
        i = sin.find(b"#", 0)
        while i >= 0:
            hex_digits = sin[i + 1 : i + 3]
            # unhexlify(b"") succeeds, so incomplete escapes at the end of the
            # name have to be rejected explicitly.
            if len(hex_digits) == 2:
                try:
                    sin = sin[:i] + unhexlify(hex_digits) + sin[i + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Always continue at the next "#"; a decoded "#" (from "#23") sits
            # at position i and is therefore not decoded a second time.
            i = sin.find(b"#", i + 1)
        return sin

    # Encodings tried, in order, when decoding name bytes.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Args:
            stream: Source positioned at the leading ``/``.
            pdf: Reader; only its ``strict`` flag is consulted.

        Raises:
            PdfReadError: If the token does not start with ``/``, or if it
                cannot be decoded and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the ``_pdfdoc_encoding_rev`` table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    codes = []
    try:
        for char in unicode_string:
            codes.append(_pdfdoc_encoding_rev[char])
        return bytes(codes)
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Returns:
        True if x is None, or a PdfObject whose ``get_object()`` yields None
        or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference every get_object() call goes
    # back to the owning pdf.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)