1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from binascii import unhexlify
33from math import log10
34from struct import iter_unpack
35from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast
36
37if sys.version_info[:2] >= (3, 10):
38 from typing import TypeGuard
39else:
40 from typing_extensions import TypeGuard # PEP 647
41
42from .._codecs import _pdfdoc_encoding_rev
43from .._protocols import PdfObjectProtocol, PdfWriterProtocol
44from .._utils import (
45 StreamType,
46 classproperty,
47 deprecate_no_replacement,
48 deprecate_with_replacement,
49 logger_warning,
50 read_non_whitespace,
51 read_until_regex,
52)
53from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
54
# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
57
58
class PdfObject(PdfObjectProtocol):
    """
    Common base class of the PDF object types defined in this module.

    ``hash_bin``, ``clone`` and ``write_to_stream`` raise
    ``NotImplementedError`` here and are meant to be overridden by the
    concrete object types.
    """

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # reference assigned by ``_reference_clone`` when the object is stored
    # in a writer's object list; may be absent or None otherwise
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the encoded ``str(self)``."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """
        Return ``b"<ClassName>:<hexdigest>"``, where the digest is
        ``hash_func`` applied to ``hash_value_data()``.
        """
        return (
            f"{self.__class__.__name__}:"
            f"{self.hash_func(self.hash_value_data()).hexdigest()}"
        ).encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        This base implementation simply delegates to ``clone(pdf_dest)``.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).


        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly created copy of ``self``.
            pdf_dest: Writer whose ``_objects`` list and ``_id_translated``
                table are updated.
            force_duplicate: When True, existing registrations are ignored
                and ``clone`` is always registered.

        Returns:
            The clone, or the object previously registered in ``pdf_dest``
            for the same source reference.

        """
        # Nothing to do when the clone already belongs to pdf_dest. Any error
        # while probing (e.g. no usable indirect_reference on the clone) is
        # deliberately ignored and the normal path is taken.
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        # An original without the indirect_reference attribute is returned
        # without being registered.
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        # Incremental writer and reference coming from the writer's own
        # reader: keep the original object number. Otherwise use the next
        # free number.
        if (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        ):
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1
        if ind is not None:
            # Translation table: id(source document) -> {source idnum: new idnum}
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                # Keeps a reference to the source document in the table;
                # presumably so that its id() is not reused — TODO confirm.
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                # Already translated: hand back the registered object.
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        # Store at slot i (1-based); when that slot does not exist, append
        # and use the resulting position as the object number.
        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in the base class."""
        raise NotImplementedError
199
200
class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        return cast("NullObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash of the class only, as a null object carries no value.

        """
        key = (self.__class__,)
        return hash(key)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the ``null`` keyword; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """Consume four bytes from ``stream`` and require them to be ``null``."""
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # Equality only depends on the type of the other operand.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
247
248
class BooleanObject(PdfObject):
    """PDF boolean wrapping an arbitrary value that is used for its truthiness."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = self._reference_clone(
            BooleanObject(self.value), pdf_dest, force_duplicate
        )
        return cast("BooleanObject", duplicate)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, self.value)
        return hash(key)

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObject instances and with plain bools;
        # anything else is never equal.
        if isinstance(o, BooleanObject):
            other_value = o.value
        elif isinstance(o, bool):
            other_value = o
        else:
            return False
        return self.value == other_value

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        keyword = b"true" if self.value else b"false"
        stream.write(keyword)

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``true`` nor ``fals``.

        """
        token = stream.read(4)
        if token == b"true":
            return BooleanObject(True)
        if token != b"fals":
            raise PdfReadError("Could not read Boolean object")
        # Consume the fifth byte of "false"; its value is not checked.
        stream.read(1)
        return BooleanObject(False)
309
310
class IndirectObject(PdfObject):
    """
    Reference to an object of a document, written as ``<idnum> <generation> R``.

    Attribute access, item access, ``in`` tests, iteration and the
    ``float``/``int``/``str`` conversions are forwarded to the object the
    reference points to (obtained through ``pdf.get_object``).
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        # Consistent with __eq__: the owning document is compared by identity.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, bound to ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        The pointed object is cloned into ``pdf_dest`` unless a translation
        for this reference is already recorded in ``pdf_dest._id_translated``.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: Passed on to the clone of the pointed object;
                also disables the "already in pdf_dest" shortcut.
            ignore_fields: Passed on to the clone of the pointed object.

        Returns:
            A reference valid in ``pdf_dest``.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # A translation exists: reuse the object registered for it.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # Substitute a null object carrying this reference so that
                # the cloning can proceed.
                obj = NullObject()
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Return the pointed object as resolved by the owning document."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The owning document is shared with the copy, not duplicated.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and refuse a result that is itself a reference.

        Raises:
            PdfStreamError: If the pointed object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        #
        # The instance's own fields are never delegated: resolving the
        # pointed object reads them, so on an instance where they are not set
        # (e.g. created through __new__ without __init__) delegation would
        # call back into __getattr__ until RecursionError.
        if name in ("idnum", "generation", "pdf"):
            raise AttributeError(f"No attribute {name} found in IndirectObject")
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Same numbers and the very same owning document (identity).
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``<idnum> <generation> R``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Read ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Positioned on the first byte of the object number.
            pdf: Document the returned reference is bound to.

        Raises:
            PdfStreamError: If the stream ends before both numbers are read.
            PdfReadError: If the numbers are not followed by ``R``.

        """
        # Object number: every byte up to the first whitespace.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation number: whitespace before the first digit is skipped,
        # the first whitespace after it ends the token.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
471
472
# Precision budget used by FloatObject.myrepr(): the number of digits written
# after the decimal point is this value minus the integer part of
# log10(abs(value)), with a minimum of one digit. Kept at module level so
# that users can adjust it.
# NOTE(review): the original remark read "shall be min 5 digits max" —
# presumably a precision requirement from the PDF specification; confirm.
FLOAT_WRITE_PRECISION = 8
474
475
class FloatObject(float, PdfObject):
    """PDF real number, stored as a Python ``float``."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the object from anything ``float()`` accepts.

        Args:
            value: Value to convert; on any conversion error a warning is
                logged and ``0.0`` is used instead.
            context: Not used by this implementation.

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: the bound method object is not the
        # numeric value and must not be what gets hashed.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF files.

        Zero is written as ``0.0``. Otherwise the number of decimals is
        ``FLOAT_WRITE_PRECISION`` minus the integer part of
        ``log10(abs(self))`` (at least one), and trailing zeros as well as a
        trailing decimal point are removed.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``myrepr()`` to ``stream``; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
533
534
class NumberObject(int, PdfObject):
    """PDF integer number, stored as a Python ``int``."""

    # First byte that does not belong to a numeric token.
    # NOTE(review): inside the character class, ``+-.`` is a range
    # (0x2B to 0x2E) and therefore also admits ``,`` — confirm whether that
    # is intended before changing it.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """
        Build the object from anything ``int()`` accepts.

        A ``ValueError`` during conversion is logged as a warning and ``0``
        is used instead; other exceptions propagate.
        """
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; no detour through the textual representation.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token from ``stream``.

        Returns:
            A FloatObject when the token contains ``.``, else a NumberObject.

        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
585
586
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        key = (self.__class__, bytes(self))
        return hash(key)

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the content as a hexadecimal string ``<...>``."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        for chunk in (b"<", binascii.hexlify(self), b">"):
            stream.write(chunk)

    def __str__(self) -> str:
        # UTF-16 is tried first, then the charsets configured on NameObject;
        # the first one that decodes without error wins.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                decoded = self.decode(encoding)
            except UnicodeDecodeError:
                continue
            return decoded
        raise PdfReadError("Cannot decode ByteStringObject.")
644
645
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text can be encoded with PDFDocEncoding
    autodetect_pdfdocencoding: bool
    # True when the text came from / has to be written as UTF-16
    autodetect_utf16: bool
    # BOM of the source bytes, or the BOM to use when writing UTF-16
    utf16_bom: bytes
    # bytes the object was built from, when it was built from bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the text string from ``bytes`` or ``str``.

        ``bytes`` starting with a UTF-16 BOM are decoded as UTF-16 (when the
        data is invalid, a warning is logged and only the part before the
        error is kept). Everything else is taken as is and flagged as
        PDFDocEncoding when it can be encoded that way, as UTF-16BE otherwise.
        """
        org = None
        if isinstance(value, bytes):
            org = value
            value = value.decode("charmap")
        o = str.__new__(cls, value)
        o._original_bytes = org
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        # The BOM test only makes sense on the original bytes. A ``str``
        # argument is already decoded text, even if it happens to begin with
        # U+00FE U+00FF, and must not take the UTF-16 path (which needs the
        # bytes).
        # NOTE(review): such a text may be written as bytes that look like a
        # UTF-16 BOM — confirm whether it should be forced to UTF-16 on write.
        if org is not None and org[:2] in (
            codecs.BOM_UTF16_BE,
            codecs.BOM_UTF16_LE,
        ):
            try:
                o = str.__new__(cls, org.decode("utf-16"))
            except UnicodeDecodeError as exc:
                logger_warning(
                    f"{exc!s}\ninitial string:{exc.object!r}",
                    __name__,
                )
                # keep what could be decoded before the faulty position
                o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
            o._original_bytes = org
            o.autodetect_utf16 = True
            o.utf16_bom = org[:2]
        else:
            try:
                encode_pdfdocencoding(o)
                o.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                o.autodetect_utf16 = True
                o.utf16_bom = codecs.BOM_UTF16_BE
        return o

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        # carry over the encoding information of the source object
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the autodetection flags.

        Raises:
            Exception: If neither autodetection flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write: the original bytes when known, else
        PDFDocEncoding when possible, else UTF-16 with the recorded BOM.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # jump straight to the UTF-16 branch below
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a literal string ``(...)``.

        ASCII letters, digits and spaces are written as is; every other byte
        is written as a three-digit octal escape.
        """
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
785
786
class NameObject(str, PdfObject):  # noqa: SLOT000
    """PDF name object such as ``/Type``, stored as text including the ``/``."""

    # bytes that end a name token when reading
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # characters that are written as ``#XX``: the delimiters, ``#`` itself
    # and the code points 0 to 32
    renumber_table: ClassVar[Dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }
    # encodings tried, in order, to decode a name read from a file
    CHARSETS = ("utf-8", "gbk", "latin1")

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the escaped name; ``encryption_key`` is deprecated."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as written to a file.

        The first character is kept as is (a deprecation is reported when it
        is not ``/``). Characters above ``~`` are written as ``#XX`` for each
        byte of their UTF-8 encoding, characters found in ``renumber_table``
        as their table entry, all others unchanged.
        """
        first = self[0].encode("utf-8")
        if first != b"/":
            deprecate_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "6.0.0",
            )
        parts = [first]
        for c in self[1:]:
            if c > "~":
                parts.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
                continue
            escaped = self.renumber_table.get(c)
            parts.append(c.encode("utf-8") if escaped is None else escaped)
        return b"".join(parts)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated alias; reports the deprecation and returns ``b"/"``."""
        deprecate_with_replacement("surfix", "prefix", "6.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace every ``#XX`` escape of ``sin`` by the byte it denotes.

        A ``#`` that is not followed by two hexadecimal digits is left
        untouched together with the bytes after it, e.g. ``b"/A#G41"`` is
        returned unchanged while ``b"/A#41"`` gives ``b"/AA"``.
        """
        i = sin.find(b"#", 0)
        while i >= 0:
            try:
                sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
            except ValueError:
                # if the 2 characters after # can not be converted to hex
                # we change nothing and carry on
                pass
            # Always continue from the next real '#': advancing by a single
            # byte would decode the following bytes as if they were escaped.
            i = sin.find(b"#", i + 1)
        return sin

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name from ``stream``.

        Args:
            stream: Positioned on the leading ``/``.
            pdf: Object whose ``strict`` attribute selects between a warning
                and an error when the name cannot be decoded.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if no charset of
                ``CHARSETS`` decodes the name and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    # best effort: any failure just moves on to the next charset
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
893
894
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` through the reverse PDFDocEncoding table.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    try:
        encoded = bytes(_pdfdoc_encoding_rev[char] for char in unicode_string)
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return encoded
906
907
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` is missing or resolves to a PDF null.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        result is None or a NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once and reuse the result instead of calling get_object()
    # for each test.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)