Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py: 48%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import codecs
2from typing import Union
4from .._codecs import _pdfdoc_encoding
5from .._utils import StreamType, logger_warning, read_non_whitespace
6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
7from ._base import ByteStringObject, TextStringObject
def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into fractional RGB components.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``
            (for example ``"#ff8000"``).

    Returns:
        The red, green and blue channels, each divided by 255.0.

    Raises:
        ValueError: If any two-character channel slice is not valid
            hexadecimal (including when the string is too short).

    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string terminated by ``>`` from the stream.

    Args:
        stream: The stream to read from. Its first byte is consumed and
            discarded (presumably the opening ``<`` -- confirm with callers).
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.

    """
    stream.read(1)
    decoded = []
    pending = b""
    # Keep pulling tokens from read_non_whitespace until the closing ">".
    for digit in iter(lambda: read_non_whitespace(stream), b">"):
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += digit
        if len(pending) == 2:
            # A complete pair of hex digits forms one byte.
            decoded.append(int(pending, base=16))
            pending = b""
    # A dangling single digit is treated as if followed by "0".
    if len(pending) == 1:
        pending += b"0"
    if pending:
        decoded.append(int(pending, base=16))
    return create_string_object(bytes(decoded), forced_encoding)
# Single-byte escape codes looked up by read_string_from_stream after a
# backslash, mapped to the byte value appended to the resulting string.
__ESCAPE_DICT__ = {
    # Escapes that stand for a control character.
    b"n": 0x0A,
    b"r": 0x0D,
    b"t": 0x09,
    b"b": 0x08,
    b"f": 0x0C,
    # Escapes that simply stand for the escaped character itself.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Byte value of the backslash character.
__BACKSLASH_CODE__ = ord(b"\\")
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised literal string from the stream.

    Nested, unescaped parentheses are kept as part of the string; reading
    stops at the ``)`` that balances the opening one. Backslash escapes are
    resolved through ``__ESCAPE_DICT__``, octal codes, or line continuation.

    Args:
        stream: The stream to read from. Its first byte is consumed and
            discarded (presumably the opening ``(`` -- confirm with callers).
            The stream must support ``tell`` and ``seek``.
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the closing parenthesis.

    """
    stream.read(1)
    depth = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            depth += 1
        elif tok == b")":
            depth -= 1
            if depth == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # A trailing backslash means the string was never closed.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok in __ESCAPE_DICT__:
                txt.append(__ESCAPE_DICT__[tok])
                continue
            if b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                sav = stream.tell() - 1
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok has to be analyzed by the main loop; at end
                        # of stream nothing was read, so do not step back.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                i = int(tok, base=8)
                if i > 255:
                    # Not a valid byte: keep the backslash literally and
                    # re-read the digits as ordinary characters.
                    txt.append(__BACKSLASH_CODE__)
                    stream.seek(sav)
                else:
                    txt.append(i)
                continue
            if tok in (b"\n", b"\r"):
                # A backslash followed by a line break continues the string
                # on the next line. The only two-byte end-of-line marker is
                # CR LF, so only an LF directly after a CR is consumed; any
                # other following byte (including a second line break)
                # belongs to the string.
                if tok == b"\r":
                    following = stream.read(1)
                    if following and following != b"\n":
                        stream.seek(-1, 1)
                # Don't add anything to the actual string, since this
                # line break was escaped:
                continue
            logger_warning(
                "Unexpected escaped string: %(token)s",
                source=__name__,
                token=tok.decode("utf-8", "ignore"),
            )
            # Unknown escape: keep both the backslash and the character.
            txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap string data in a TextStringObject or a ByteStringObject.

    Args:
        string: The data being used
        forced_encoding: None to auto-detect the encoding, the literal
            ``"bytes"`` to keep the raw bytes, a codec name to decode with,
            or a list/dict mapping each byte value to its replacement text.

    Returns:
        A TextStringObject when the data is text or could be decoded,
        otherwise a ByteStringObject.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        # Translate byte by byte; anything the table cannot supply falls
        # back to a charmap decode of that single byte.
        text = ""
        for code in string:
            try:
                text += forced_encoding[code]
            except Exception:
                text += bytes((code,)).decode("charmap")
        mapped = TextStringObject(text)
        mapped._original_bytes = string
        return mapped

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        forced = TextStringObject(string.decode(forced_encoding))
        forced._original_bytes = string
        return forced

    try:
        # Work out whether the bytes look like UTF-16, and which flavour.
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            retval = TextStringObject(string.decode(codec))
            retval._original_bytes = string
            retval.autodetect_utf16 = True
            retval.utf16_bom = bom
            return retval

        # Not UTF-16: the only way to find out whether the bytes are text
        # is to attempt the PDFDocEncoding conversion. Some strings are
        # strings, some are just byte arrays.
        retval = TextStringObject(decode_pdfdocencoding(string))
        retval._original_bytes = string
        retval.autodetect_pdfdocencoding = True
        return retval
    except UnicodeDecodeError:
        return ByteStringObject(string)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes through the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The bytes to decode.

    Returns:
        The decoded text, one character per input byte.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            which marks it as having no translation. The error carries the
            input and the position of the offending byte.

    """
    chars = []
    for index, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the real input and offset; bytearray(<int>) would
            # instead build that many zero bytes.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)