Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import Error as BinasciiError
3from binascii import unhexlify
4from math import ceil
5from typing import Any, Union, cast
7from ._codecs import adobe_glyphs, charset_encoding
8from ._utils import logger_error, logger_warning
9from .generic import (
10 DecodedStreamObject,
11 DictionaryObject,
12 NullObject,
13 StreamObject,
14 is_null_or_none,
15)
# Names that may appear as a font's /Encoding value, mapped to the Python
# codec name used for them.  Every family comes in a horizontal ("-H") and a
# vertical ("-V") variant and both variants share the same codec, so the table
# is generated from (family, codec) pairs; key order is H then V per family.
# CMap names containing "-UCS2-" are not listed here: they are matched by
# substring in _parse_encoding.
# NOTE(review): the PDF specification describes the UniGB-UTF16-* CMaps as
# UTF-16BE encoded (like UniCNS-UTF16-*); confirm that "gb18030" is intended.
_predefined_cmap: dict[str, str] = {
    f"/{family}-{direction}": codec
    for family, codec in (
        ("Identity", "utf-16-be"),
        ("GB-EUC", "gbk"),
        ("GBpc-EUC", "gb2312"),
        ("GBK-EUC", "gbk"),
        ("GBK2K", "gb18030"),
        ("ETen-B5", "cp950"),
        ("ETenms-B5", "cp950"),
        ("UniCNS-UTF16", "utf-16-be"),
        ("UniGB-UTF16", "gb18030"),
    )
    for direction in ("H", "V")
}
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """Return the byte encoding and the character map of a font dictionary.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        A tuple ``(encoding, map_dict)``: ``encoding`` is either a codec name
        or a ``code -> str`` table as produced by ``_parse_encoding``;
        ``map_dict`` is the translation table produced by
        ``_parse_to_unicode``.
    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # Rule from PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the
    # encoding must be discarded for the codes it covers.  Here this is done
    # by turning those single-byte entries into an identity translation.
    # A string encoding is expected to already be an identity translation.
    if isinstance(encoding, dict):
        encoding.update(
            {code: chr(code) for code in cmap_codes if code <= 255}
        )

    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """Derive the byte-to-character encoding from a font dictionary.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        Either a codec name (``str``) or a ``code -> str`` table.  When the
        ``/Encoding`` name is unknown, the name itself is returned after an
        error has been logged.
    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # "#xx" sequences of the name have already been decoded at this point.
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the base encoding that could not be resolved (the
            # previous message interpolated the still-empty result instead).
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is a flat sequence: an integer sets the current code,
        # each following name is assigned to the current code, which is then
        # incremented.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    # Codes outside the table are ignored; the lower bound
                    # prevents a negative code from overwriting entries
                    # counted from the end of the table.
                    if 0 <= x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Unknown glyph name: keep the raw name as the character.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """Build the translation table described by a font's ``/ToUnicode`` entry.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        A tuple ``(map_dict, int_entry)``.  ``map_dict`` holds every
        translation, plus the number of bytes per source code under the key
        ``-1``.  ``int_entry`` lists the cmap source codes as integers so the
        caller can correct the encoding.  Without ``/ToUnicode`` the result
        comes from ``_type1_alternative`` for ``/Type1`` fonts and is empty
        otherwise.
    """
    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") != "/Type1":
            return {}, []
        return _type1_alternative(ft, {}, [])

    translations: dict[Any, Any] = {}
    codes: list[int] = []
    in_bfrange: bool = False
    in_bfchar: bool = False
    # (current_char, remaining size) of a bfrange list that continues on the
    # next line; cf #1285 for an example of such a file.
    pending_range: Union[None, tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            translations,
            codes,
        )

    return translations, codes
def prepare_cm(ft: DictionaryObject) -> bytes:
    """Return the ``/ToUnicode`` CMap of a font as normalised bytes.

    The result is laid out so that it can be processed line by line: the
    ``begin``/``end`` keywords of bfchar and bfrange sections stand on their
    own lines, ``<...>`` hex strings are replaced by their bare content
    followed by a space, and brackets are surrounded by spaces.

    Args:
        ft: A font dictionary that contains a ``/ToUnicode`` entry.

    Returns:
        The normalised CMap data.
    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:
        # Not a stream (e.g. None or an "/Identity..." name): fall back to a
        # small built-in bfrange section.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # Line breaks may be missing (e.g. PDF printed to PDF from Word), so force
    # every section keyword onto its own line.  The text between << and >> is
    # not used; it is rewritten with braces so it can still be located.
    cm = cm.strip()
    for marker, separated in (
        (b"beginbfchar", b"\nbeginbfchar\n"),
        (b"endbfchar", b"\nendbfchar\n"),
        (b"beginbfrange", b"\nbeginbfrange\n"),
        (b"endbfrange", b"\nendbfrange\n"),
        (b"<<", b"\n{\n"),
        (b">>", b"\n}\n"),
    ):
        cm = cm.replace(marker, separated)

    pieces = []
    for piece in cm.split(b"<"):
        closing = piece.find(b">")
        if closing < 0:
            pieces.append(piece)
            continue
        if closing == 0:
            # Empty hex string: store a placeholder that parse_bfchar treats
            # as an empty target.
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = piece[:closing].replace(b" ", b"")
        pieces.append(hex_digits + b" " + piece[closing + 1 :])

    cm = b" ".join(pieces)
    cm = cm.replace(b"[", b" [ ")
    cm = cm.replace(b"]", b" ]\n ")
    cm = cm.replace(b"\r", b"\n")
    return cm
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """Process one line of a prepared CMap and return the updated state.

    Args:
        line: The line to process.
        process_rg: Whether a bfrange section is currently open.
        process_char: Whether a bfchar section is currently open.
        multiline_rg: State of a bfrange list continued from a previous line,
            or None.
        map_dict: Translation table, updated in place.
        int_entry: List of source codes, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` state.
    """
    # Empty lines and comment lines leave the state untouched.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section keywords only toggle the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    # A bfrange section takes precedence over a bfchar section.
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)

    return process_rg, process_char, multiline_rg
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """Parse one line of a bfrange section into ``map_dict``.

    A line is either ``first last target`` (consecutive targets starting at
    ``target``), ``first last [ t1 t2 ...`` (explicit targets, possibly
    continued on following lines until ``]``), or such a continuation.

    Args:
        line: Space-separated tokens; hex strings without angle brackets.
        map_dict: Translation table, updated in place.  ``map_dict[-1]`` holds
            the number of bytes per source code; it is set from a new range
            and read for a continuation line.
        int_entry: List of source codes, extended in place.
        multiline_rg: ``(next source code, last source code)`` when the line
            continues an unfinished target list, otherwise None.

    Returns:
        None when the range is complete, else ``(next source code, last
        source code)`` to pass back in with the following line.

    Raises:
        binascii.Error: If a token is not a valid hex string.
    """
    tokens = [token for token in line.split(b" ") if token]

    if multiline_rg is not None:
        # Continuation: source bounds come from the previous line.
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        code = multiline_rg[0]
        last = multiline_rg[1]
        explicit_targets = tokens
        consecutive = False
    else:
        code = int(tokens[0], 16)
        last = int(tokens[1], 16)
        widest = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(widest / 2)
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        consecutive = tokens[2] != b"["
        explicit_targets = tokens[3:]

    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    def source_key(value: int) -> str:
        # Source code rendered with the section's byte width, then decoded.
        return unhexlify(key_format % value).decode(key_codec, "surrogatepass")

    if consecutive:  # case without list
        target = int(tokens[2], 16)
        target_format = b"%%0%dX" % max(4, len(tokens[2]))
        while code <= last:
            # The target is decoded before the key, as in a plain
            # ``map_dict[key] = value`` statement.
            mapped = unhexlify(target_format % target).decode(
                "utf-16-be", "surrogatepass"
            )
            map_dict[source_key(code)] = mapped
            int_entry.append(code)
            code += 1
            target += 1
        return None

    for token in explicit_targets:
        if token == b"]":
            return None
        mapped = unhexlify(token).decode("utf-16-be", "surrogatepass")
        map_dict[source_key(code)] = mapped
        int_entry.append(code)
        code += 1

    # No closing bracket yet: the list continues on the next line.
    return (code, last)
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """Parse one line of a bfchar section into ``map_dict``.

    The line is read as consecutive ``source target`` pairs of hex tokens; a
    trailing unpaired token is ignored.

    Args:
        line: Space-separated tokens; hex strings without angle brackets.
        map_dict: Translation table, updated in place.  ``map_dict[-1]`` is
            set to the number of bytes of the first source code.
        int_entry: List of source codes, extended in place.

    Raises:
        binascii.Error: If a source token is not a valid hex string.
    """
    tokens = [token for token in line.split(b" ") if token]
    map_dict[-1] = len(tokens[0]) // 2
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    for source, target in zip(tokens[0::2], tokens[1::2]):
        mapped = ""
        # b"." is the placeholder for an empty hex string: it maps to "".
        if target != b".":
            try:
                mapped = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                # An invalid target is reported and mapped to "".
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        map_dict[unhexlify(source).decode(key_codec, "surrogatepass")] = mapped
        int_entry.append(int(source, 16))
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """Fill the translation table from the font file embedded in a font.

    The clear-text part of the ``/FontFile`` stream is scanned for
    ``dup <code> <glyph name> put`` lines following ``/Encoding``.  Glyph
    names are resolved through ``adobe_glyphs`` or, for ``/uniXXXX`` names,
    from their hexadecimal code point.  Lines that cannot be interpreted are
    skipped.

    Args:
        ft: The font dictionary to inspect.
        map_dict: Translation table, updated in place.
        int_entry: List of source codes, extended in place.

    Returns:
        The ``(map_dict, int_entry)`` pair that was passed in.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # No encoding part in the clear text: nothing to extract.
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    for li in txt.replace(b"\r", b"\n").split(b"\n"):
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # Truncated entry: the code or the glyph name is missing.
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):
            # Not a number, or not a valid character code.
            continue
        glyph_name = words[2]
        try:
            v = adobe_glyphs[glyph_name.decode()]
        except (KeyError, UnicodeDecodeError):
            if not glyph_name.startswith(b"/uni"):
                continue
            try:
                v = chr(int(glyph_name[4:], 16))
            except (ValueError, OverflowError):
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry