1# Copyright (c) 2006, Mathieu Fenniak
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27import binascii
28import codecs
29import hashlib
30import re
31import sys
32from binascii import unhexlify
33from collections.abc import Sequence
34from math import log10
35from struct import iter_unpack
36from typing import Any, Callable, ClassVar, Optional, Union, cast
37
38if sys.version_info[:2] >= (3, 10):
39 from typing import TypeGuard
40else:
41 from typing_extensions import TypeGuard # PEP 647
42
43from .._codecs import _pdfdoc_encoding_rev
44from .._protocols import PdfObjectProtocol, PdfWriterProtocol
45from .._utils import (
46 StreamType,
47 classproperty,
48 deprecation_no_replacement,
49 deprecation_with_replacement,
50 logger_warning,
51 read_non_whitespace,
52 read_until_regex,
53)
54from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
55
# Module authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
58
59
class PdfObject(PdfObjectProtocol):
    """Base class shared by the PDF object types defined in this module."""

    # Hash constructor that hash_value() applies to hash_value_data().
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        This base implementation only raises; concrete classes provide the
        actual computation.

        Returns:
            Hash considering type and value.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(
            f"{class_name} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        """Return the bytes fed to ``hash_func``: the formatted object, encoded."""
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        """Return ``b"<ClassName>:<hex digest of hash_value_data()>"``."""
        digest = self.hash_func(self.hash_value_data()).hexdigest()
        return f"{self.__class__.__name__}:{digest}".encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        The base implementation simply delegates to :meth:`clone`.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        Implementations in this module build a copy and pass it through
        ``_reference_clone``. The base implementation only raises.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        class_name = self.__class__.__name__
        raise NotImplementedError(f"{class_name} does not implement .clone so far")

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Register ``clone`` in ``pdf_dest._objects`` when ``self`` carries an
        ``indirect_reference`` attribute; otherwise hand the clone back as is.

        Args:
            clone: The freshly built copy of ``self``.
            pdf_dest: Writer receiving the copy.
            force_duplicate: When ``True``, skip both "already there" shortcuts
                and always register ``clone``.

        Returns:
            ``clone`` itself, or the object previously registered in
            ``pdf_dest`` for the same source reference.

        """
        # Shortcut 1: the clone already points into pdf_dest. Any failure while
        # probing (missing attribute, None reference, ...) means "not there".
        try:
            already_in_dest = bool(
                not force_duplicate and clone.indirect_reference.pdf == pdf_dest
            )
        except Exception:
            already_in_dest = False
        if already_in_dest:
            return clone

        # Without an indirect_reference attribute there is nothing to register.
        try:
            source_ref = self.indirect_reference
        except AttributeError:
            return clone

        # Keep the source object number only in incremental mode, when the
        # source belongs to the writer's reader and the number fits the table.
        if (
            pdf_dest.incremental
            and source_ref is not None
            and source_ref.pdf == pdf_dest._reader
            and source_ref.idnum <= len(pdf_dest._objects)
        ):
            new_idnum = source_ref.idnum
        else:
            new_idnum = len(pdf_dest._objects) + 1

        if source_ref is not None:
            source_key = id(source_ref.pdf)
            if source_key not in pdf_dest._id_translated:
                pdf_dest._id_translated[source_key] = {}
                # Hold a reference to the source so its id() stays valid as a key.
                pdf_dest._id_translated[source_key]["PreventGC"] = source_ref.pdf  # type: ignore
            # Shortcut 2: this source object was translated before.
            if (
                not force_duplicate
                and source_ref.idnum in pdf_dest._id_translated[source_key]
            ):
                known = pdf_dest.get_object(
                    pdf_dest._id_translated[source_key][source_ref.idnum]
                )
                assert known is not None
                return known
            pdf_dest._id_translated[source_key][source_ref.idnum] = new_idnum

        # Overwrite the slot when it exists, append (and renumber) otherwise.
        try:
            pdf_dest._objects[new_idnum - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            new_idnum = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(new_idnum, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references; a direct object is its own target."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not provided by the base class."""
        raise NotImplementedError
200
201
class NullObject(PdfObject):
    """The PDF ``null`` object; all instances compare equal to each other."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = NullObject()
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NullObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value (there is no value, so the class only).

        """
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the keyword ``null`` to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        """
        Consume four bytes from ``stream`` and expect them to be ``null``.

        Raises:
            PdfReadError: If the four bytes read are anything else.

        """
        if stream.read(4) == b"null":
            return NullObject()
        raise PdfReadError("Could not read Null object")

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # Equality is purely type based.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
248
249
class BooleanObject(PdfObject):
    """A PDF boolean wrapping the truthiness of ``value``."""

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("BooleanObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Comparable with other BooleanObjects (by wrapped value) and with
        # plain bools; everything else is unequal.
        if isinstance(o, BooleanObject):
            o = o.value
        elif not isinstance(o, bool):
            return False
        return self.value == o

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        if self.value:
            return "True"
        return "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write ``true`` or ``false`` to ``stream`` depending on ``value``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Four bytes are read first; for ``fals`` one further byte is consumed
        (its value is not checked).

        Raises:
            PdfReadError: If the first four bytes are neither ``true`` nor ``fals``.

        """
        head = stream.read(4)
        if head == b"true":
            return BooleanObject(True)
        if head != b"fals":
            raise PdfReadError("Could not read Boolean object")
        stream.read(1)  # skip the fifth byte of "false"
        return BooleanObject(False)
310
311
class IndirectObject(PdfObject):
    """
    Reference (``idnum generation R``) to an object owned by ``pdf``.

    Attribute access, item access, iteration and numeric/string conversion
    are forwarded to the referenced object.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        # Matches __eq__: same numbers and the very same owner instance.
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """Return a reference with the same numbers, re-targeted at ``pdf_dest``."""
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """
        Clone object into pdf_dest.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: When ``False`` and this reference already belongs
                to ``pdf_dest``, ``self`` is returned unchanged.
            ignore_fields: Passed on to the referenced object's ``clone``.

        Returns:
            A reference, owned by ``pdf_dest``, to the cloned target.

        """
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            # The target was cloned before: look up the existing copy.
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a null object that remembers this reference
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        """An indirect object is its own reference."""
        return self

    def get_object(self) -> Optional["PdfObject"]:
        """Ask the owning ``pdf`` to resolve this reference."""
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # Copy the reference only; the owner is shared, not duplicated.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        """
        Resolve the reference and reject a target that is itself indirect.

        Raises:
            PdfStreamError: If the resolved object is an IndirectObject.

        """
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Only called when normal lookup failed. If one of the instance's own
        # fields is missing (instance created without __init__, e.g. by
        # copy/pickle), delegating would call get_object(), which reads
        # self.pdf, which would land here again without end.
        if name in ("idnum", "generation", "pdf"):
            raise AttributeError(name)
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError as exc:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            ) from exc

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        # Equal only to another reference with the same numbers and the
        # identical owner object.
        return (
            isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write ``<idnum> <generation> R`` to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        """
        Parse ``<idnum> <generation> R`` from ``stream``.

        Args:
            stream: Source stream, positioned at the object number.
            pdf: Owner stored on the returned reference.

        Raises:
            PdfStreamError: If the stream ends inside either number.
            PdfReadError: If the token after the two numbers is not ``R``.

        """
        # Object number: everything up to the first whitespace byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        # Generation: skip any leading whitespace, then read up to the next one.
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
472
473
# Precision budget read by FloatObject.myrepr() to decide how many decimal
# places to write. Kept at module level so that users can adjust it.
# NOTE(review): the original note asked for "min 5 digits" — confirm the
# intended lower bound before lowering this value.
FLOAT_WRITE_PRECISION = 8
475
476
class FloatObject(float, PdfObject):
    """A PDF real number; invalid input falls back to ``0.0`` with a warning."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        """
        Build the float from ``value``.

        Args:
            value: Anything accepted by ``float()``.
            context: Unused by this implementation.

        Returns:
            The new object, or one holding ``0.0`` if the conversion failed
            (the failure is passed to ``logger_warning``).

        """
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # as_numeric must be *called*: hashing the bound method itself would
        # tie the result to this instance instead of to its numeric value.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        """
        Return the fixed-point text written to PDF files.

        Zero is rendered as ``"0.0"``. Otherwise the number of decimals is
        ``FLOAT_WRITE_PRECISION - int(log10(abs(self)))`` (at least 1), and
        trailing zeros plus a dangling decimal point are stripped.
        """
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        """Return the value as a plain ``float``."""
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write :meth:`myrepr` (UTF-8 encoded) to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
534
535
class NumberObject(int, PdfObject):
    """A PDF integer; input rejected by ``int()`` with ValueError becomes ``0``."""

    # Matches the first byte that ends a numeric token. Inside the character
    # class, "+-." is a range from "+" to ".", so "," and "-" are covered too.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        try:
            converted = int(value)
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            converted = 0
        return int.__new__(cls, converted)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        duplicate = NumberObject(self)
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("NumberObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the decimal representation of the integer to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        text = repr(self)
        stream.write(text.encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a numeric token via ``read_until_regex`` and ``NumberPattern``.

        Returns:
            A FloatObject when the token contains ``.``, a NumberObject otherwise.

        """
        token = read_until_regex(stream, NumberObject.NumberPattern)
        return FloatObject(token) if b"." in token else NumberObject(token)
586
587
class ByteStringObject(bytes, PdfObject):
    """
    A string object whose text encoding could not be determined.

    PDF has a single "String" type, so data that is clearly not text (for
    example the encryption entry /O) is stored the same way as text; such
    values are kept here as raw bytes.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        registered = self._reference_clone(duplicate, pdf_dest, force_duplicate)
        return cast("ByteStringObject", registered)

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, bytes(self)))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the bytes as a hexadecimal string, ``<...>``, to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        hex_payload = binascii.hexlify(self)
        stream.write(b"<")
        stream.write(hex_payload)
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first, then each entry of NameObject.CHARSETS in order;
        # the first encoding that decodes cleanly wins.
        for encoding in ("utf-16", *NameObject.CHARSETS):
            try:
                return self.decode(encoding)
            except UnicodeDecodeError:
                continue
        raise PdfReadError("Cannot decode ByteStringObject.")
645
646
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16 BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the text is representable in PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the text is (to be) encoded as UTF-16.
    autodetect_utf16: bool
    # BOM found in the source bytes, or the one to emit; b"" when not UTF-16.
    utf16_bom: bytes
    # Bytes the object was built from, when it was built from bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        """
        Build the string from ``value`` and record how it should be encoded.

        Args:
            value: ``bytes`` (decoded here) or anything accepted by ``str``.

        Returns:
            The new object. Bytes starting with a UTF-16 BOM are decoded as
            UTF-16; if that fails, the part before the error is kept and the
            failure is passed to ``logger_warning``.

        """
        org = None
        if isinstance(value, bytes):
            org = value
            value = value.decode("charmap")
        o = str.__new__(cls, value)
        o._original_bytes = org
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        # BOM handling applies to raw bytes only. A ``str`` that happens to
        # start with the characters U+00FE U+00FF (or reversed) is ordinary
        # text and has no bytes to re-decode.
        if org is not None and org[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
            try:
                o = str.__new__(cls, org.decode("utf-16"))
            except UnicodeDecodeError as exc:
                logger_warning(
                    f"{exc!s}\ninitial string:{exc.object!r}",
                    __name__,
                )
                o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
            o._original_bytes = org
            o.autodetect_utf16 = True
            o.utf16_bom = org[:2]
        else:
            try:
                encode_pdfdocencoding(o)
                o.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                # Not representable in PDFDocEncoding: write as UTF-16BE.
                o.autodetect_utf16 = True
                o.utf16_bom = codecs.BOM_UTF16_BE
        return o

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest, carrying over the encoding metadata."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.

        Returns the stored source bytes when available, otherwise the result
        of :meth:`get_original_bytes`.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        """
        Re-encode the text according to the recorded autodetection flags.

        Returns:
            UTF-16 bytes (with the recorded BOM, if it is a known one) when
            ``autodetect_utf16`` is set, else PDFDocEncoding bytes when
            ``autodetect_pdfdocencoding`` is set.

        Raises:
            Exception: If neither flag is set.

        """
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        """
        Return the bytes to write for this string.

        The stored source bytes win when present. Otherwise PDFDocEncoding is
        tried (unless ``autodetect_utf16`` is set), with UTF-16 as fallback.
        """
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Jump straight to the UTF-16 branch below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the string as a PDF literal string, ``(...)``, to ``stream``.

        Every byte that is neither ASCII alphanumeric nor a space is written
        as a three-digit octal escape.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
786
787
class NameObject(str, PdfObject):  # noqa: SLOT000
    """A PDF name such as ``/Type``, stored as text including the leading slash."""

    # Bytes that end a name token when reading.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Characters written as "#XX": the listed delimiters plus code points 0-32.
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the escaped name (see :meth:`renumber`) to ``stream``.

        Args:
            stream: Destination stream.
            encryption_key: Deprecated; a non-``None`` value is reported through
                ``deprecation_no_replacement``.

        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """
        Return the name as bytes with ``#XX`` escapes applied.

        The first character is emitted as is (a value other than ``/`` is
        reported through ``deprecation_no_replacement``). After it, characters
        above ``~`` are escaped byte by byte from their UTF-8 encoding,
        characters listed in ``renumber_table`` use their table entry, and
        everything else is written unchanged.
        """
        first = self[0].encode("utf-8")
        if first != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        # Collect the pieces and join once instead of growing a bytes object.
        parts = [first]
        for char in self[1:]:
            if char > "~":
                parts.extend(f"#{byte:02X}".encode() for byte in char.encode("utf-8"))
                continue
            escaped = self.renumber_table.get(char)
            parts.append(char.encode("utf-8") if escaped is None else escaped)
        return b"".join(parts)

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). Every other character of the name,
        spaces included, is replaced by an underscore.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self)[1:]  # Remove leading forward slash
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        """Deprecated spelling of ``prefix``."""
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """
        Replace each ``#XX`` escape (two hexadecimal digits) by the byte it encodes.

        A ``#`` that is not followed by two hexadecimal digits is left
        untouched, together with the bytes after it.

        Args:
            sin: Raw name bytes.

        Returns:
            The bytes with all valid escapes decoded.

        """
        pos = sin.find(b"#")
        while pos >= 0:
            digits = sin[pos + 1 : pos + 3]
            # Fewer than two bytes left after "#": incomplete escape, keep it.
            if len(digits) == 2:
                try:
                    sin = sin[:pos] + unhexlify(digits) + sin[pos + 3 :]
                except ValueError:
                    # if the 2 characters after # can not be converted to hex
                    # we change nothing and carry on
                    pass
            # Resume after this position; a byte just decoded to "#" is data.
            pos = sin.find(b"#", pos + 1)
        return sin

    # Encodings tried, in order, when decoding a name read from a file.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        """
        Read a name token from ``stream``.

        Args:
            stream: Source stream, positioned at the leading ``/``.
            pdf: Reader whose ``strict`` flag selects the behaviour when the
                name cannot be decoded.

        Raises:
            PdfReadError: If the first byte is not ``/``, or if decoding fails
                and ``pdf.strict`` is true.

        """
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
909
910
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` with the ``_pdfdoc_encoding_rev`` table.

    Args:
        unicode_string: Text to encode, one table lookup per character.

    Returns:
        The encoded bytes.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.

    """
    encoded = bytearray()
    try:
        for char in unicode_string:
            encoded.append(_pdfdoc_encoding_rev[char])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
    return bytes(encoded)
922
923
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Tell whether ``x`` stands for "no value".

    Args:
        x: Any value.

    Returns:
        True if x is None, or if x is a PdfObject whose ``get_object()``
        yields None or a NullObject (this covers a NullObject itself and an
        indirect reference resolving to either).

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    # Resolve once: for an indirect reference this is a lookup in the owner.
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)