1import codecs
2from typing import Dict, List, Tuple, Union
3
4from .._codecs import _pdfdoc_encoding
5from .._utils import StreamType, logger_warning, read_non_whitespace
6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
7from ._base import ByteStringObject, TextStringObject
8
9
def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into three floats.

    Args:
        value: Six hexadecimal digits (``RRGGBB``); any leading ``#``
            characters are stripped before parsing.

    Returns:
        A ``(red, green, blue)`` tuple, each component being the two-digit
        hex value divided by 255.0.

    Raises:
        ValueError: If one of the two-character slices is not valid hex
            (this includes strings shorter than six digits).

    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue
12
13
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string (terminated by ``>``) from the stream.

    The first byte of the stream is read and discarded (presumably the
    opening ``<`` -- confirm against callers). Non-whitespace characters
    are then collected in pairs, each pair being parsed as one byte.

    Args:
        stream: Stream positioned at the start of the hex string.
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.

    """
    stream.read(1)
    values = []
    pending = b""
    # Keep pulling non-whitespace characters until the closing ">" shows up.
    for char in iter(lambda: read_non_whitespace(stream), b">"):
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += char
        if len(pending) == 2:
            values.append(int(pending, base=16))
            pending = b""
    # A dangling single digit is treated as if it were followed by "0".
    if len(pending) == 1:
        pending += b"0"
    if pending:
        values.append(int(pending, base=16))
    return create_string_object(bytes(values), forced_encoding)
36
37
# Characters that may follow a backslash inside a literal string, mapped to
# the byte value that read_string_from_stream appends to the decoded string.
__ESCAPE_DICT__ = {
    # Letter escapes standing for a control character.
    b"n": 0x0A,  # line feed
    b"r": 0x0D,  # carriage return
    b"t": 0x09,  # horizontal tab
    b"b": 0x08,  # backspace
    b"f": 0x0C,  # form feed
    # Characters that, when escaped, simply stand for themselves.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Byte value of the backslash character itself.
__BACKSLASH_CODE__ = ord("\\")
60
61
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a literal (parenthesised) string from the stream.

    The first byte of the stream is read and discarded (presumably the
    opening ``(`` -- confirm against callers). Bytes are then collected
    until the matching closing parenthesis; balanced inner parentheses are
    kept as part of the string. Backslash sequences are resolved as follows:

    * a character found in ``__ESCAPE_DICT__`` yields the mapped byte;
    * one to three octal digits yield the corresponding byte; if the value
      exceeds 255 a literal backslash is emitted and the digits are re-read
      as ordinary characters;
    * a backslash followed by an end-of-line marker (CR, LF or CR+LF) is a
      line continuation and adds nothing to the string;
    * anything else logs a warning and keeps both the backslash and the
      character.

    Args:
        stream: Seekable stream positioned at the start of the string.
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.

    """
    stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # Nothing follows the backslash: the string can never be
                # closed, so report the truncation right away.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            escaped = __ESCAPE_DICT__.get(tok)
            if escaped is not None:
                txt.append(escaped)
                continue
            if b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                sav = stream.tell() - 1  # position of the first octal digit
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok has to be analyzed by the main loop; only
                        # step back if a byte was actually read.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                i = int(tok, base=8)
                if i > 255:
                    # Not a valid byte: keep the backslash literally and
                    # rewind so the digits are read as plain characters.
                    txt.append(__BACKSLASH_CODE__)
                    stream.seek(sav)
                else:
                    txt.append(i)
                continue
            if tok in (b"\n", b"\r"):
                # A backslash followed by a line break is a line
                # continuation. CR+LF is the only two-byte end-of-line
                # marker, so a second byte is consumed only when an LF
                # directly follows a CR; any other following byte
                # (including another CR or LF) belongs to the string.
                if tok == b"\r":
                    ntok = stream.read(1)
                    if ntok and ntok != b"\n":
                        stream.seek(-1, 1)
                # Don't add anything to the actual string, since this
                # line break was escaped:
                continue
            msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
            logger_warning(msg, __name__)
            txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
121
122
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap a ``str`` or ``bytes`` value in the matching string object.

    A ``str`` is wrapped directly in a TextStringObject. For ``bytes`` the
    result depends on ``forced_encoding``:

    * list or dict: every byte is looked up in it; bytes whose lookup fails
      are decoded with the ``charmap`` codec instead;
    * ``"bytes"``: the data is returned as a ByteStringObject;
    * any other string: it is used as the codec name for decoding;
    * ``None``: UTF-16 is tried first (BOM, leading NUL byte, or NUL as
      second byte), then PDFDocEncoding; if decoding raises
      UnicodeDecodeError the data is returned as a ByteStringObject.

    Every TextStringObject built from bytes keeps the input in
    ``_original_bytes``.

    Args:
        string: The data being used
        forced_encoding: Typically None, or an encoding string

    Returns:
        A TextStringObject, or a ByteStringObject when the bytes are kept
        undecoded.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    # Explicit per-byte translation table.
    if isinstance(forced_encoding, (list, dict)):
        decoded = ""
        for code in string:
            try:
                decoded += forced_encoding[code]
            except Exception:
                decoded += bytes((code,)).decode("charmap")
        text = TextStringObject(decoded)
        text._original_bytes = string
        return text

    # Explicit codec name ("bytes" meaning: do not decode at all).
    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        text = TextStringObject(string.decode(forced_encoding))
        text._original_bytes = string
        return text

    # No hint given: guess, and fall back to raw bytes when decoding fails.
    try:
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            text = TextStringObject(string.decode(codec))
            text._original_bytes = string
            text.autodetect_utf16 = True
            text.utf16_bom = bom
            return text

        # This is probably a big performance hit here, but we need
        # to convert string objects into the text/unicode-aware
        # version if possible... and the only way to check if that's
        # possible is to try.
        # Some strings are strings, some are just byte arrays.
        text = TextStringObject(decode_pdfdocencoding(string))
        text._original_bytes = string
        text.autodetect_pdfdocencoding = True
        return text
    except UnicodeDecodeError:
        return ByteStringObject(string)
193
194
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes by looking each one up in the ``_pdfdoc_encoding`` table.

    Args:
        byte_array: The raw bytes to decode.

    Returns:
        The decoded text, one table entry per input byte.

    Raises:
        UnicodeDecodeError: If a byte maps to U+0000 in the table, which is
            treated as "no such character". The exception carries the
            input and the position of the offending byte.

    """
    chars = []
    for position, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the real input and the failing position so the error
            # message names the byte that could not be translated.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                position,
                position + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)