Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import Error as BinasciiError
3from binascii import unhexlify
4from math import ceil
5from typing import Any, Union, cast
7from ._codecs import adobe_glyphs, charset_encoding
8from ._utils import logger_error, logger_warning
9from .errors import LimitReachedError
10from .generic import (
11 DecodedStreamObject,
12 DictionaryObject,
13 NullObject,
14 StreamObject,
15 is_null_or_none,
16)
18_predefined_cmap: dict[str, str] = {
19 "/Identity-H": "utf-16-be",
20 "/Identity-V": "utf-16-be",
21 "/GB-EUC-H": "gbk",
22 "/GB-EUC-V": "gbk",
23 "/GBpc-EUC-H": "gb2312",
24 "/GBpc-EUC-V": "gb2312",
25 "/GBK-EUC-H": "gbk",
26 "/GBK-EUC-V": "gbk",
27 "/GBK2K-H": "gb18030",
28 "/GBK2K-V": "gb18030",
29 "/ETen-B5-H": "cp950",
30 "/ETen-B5-V": "cp950",
31 "/ETenms-B5-H": "cp950",
32 "/ETenms-B5-V": "cp950",
33 "/UniCNS-UTF16-H": "utf-16-be",
34 "/UniCNS-UTF16-V": "utf-16-be",
35 "/UniGB-UTF16-H": "gb18030",
36 "/UniGB-UTF16-V": "gb18030",
37 # Japanese CMaps (PDF Reference 1.7, Appendix H)
38 "/90ms-RKSJ-H": "cp932", # Shift-JIS (JIS X 0208-1990), horizontal
39 "/90ms-RKSJ-V": "cp932", # Shift-JIS (JIS X 0208-1990), vertical
40 "/UniJIS-UTF16-H": "utf-16-be", # Unicode UTF-16BE -> JIS, horizontal
41 "/UniJIS-UTF16-V": "utf-16-be", # Unicode UTF-16BE -> JIS, vertical
42 # UCS2 in code
43}
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Return the encoding and the /ToUnicode character map of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(encoding, map_dict)``. ``encoding`` is either a codec name
        or a table mapping a character code to its text; ``map_dict`` is the
        translation table built from the font's character map.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: when a cmap is present the encoding is
    # to be discarded. For a table encoding this is done by turning every
    # single-byte code known to the cmap into an identity entry. A string
    # encoding is expected to already be an identity translation.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in int_entry if code <= 255})

    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the encoding described by the /Encoding entry of a font.

    Args:
        ft: The font dictionary.

    Returns:
        Either a string (a codec name, or the unrecognised encoding name
        itself) or a table mapping a character code to its text.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font
        # when there is one, plain byte <-> code point mapping otherwise.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error("Advanced encoding %(encoding)s not implemented yet", source=__name__, encoding=enc)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the name that could not be resolved, not the still
            # empty result table.
            logger_error(
                "Advanced encoding %(encoding)s not implemented yet",
                source=__name__, encoding=base_encoding
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is walked as a flat sequence: an integer sets the
        # current code, every other item is a glyph name assigned to the
        # current code, which is then incremented.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                # Codes outside the table are ignored; this includes
                # negative ones, which would otherwise index from the end
                # of the table or raise IndexError.
                if 0 <= x < len(encoding):
                    try:
                        encoding[x] = adobe_glyphs[o]  # type: ignore[index]
                    except (KeyError, TypeError):
                        # Unknown glyph name: keep the name itself.
                        encoding[x] = o  # type: ignore[index]
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the translation table described by the /ToUnicode entry of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(map_dict, int_entry)``: the translation table, and the
        list of the character codes (as integers) it contains, which is used
        to correct the encoding.

    """
    # Receives all the translations; while parsing, key -1 temporarily holds
    # the number of bytes of a character code.
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    in_range: bool = False
    in_char: bool = False
    # (current_char, remaining size) of a bfrange array continued on the
    # following lines; cf #1285 for example of file
    pending_range: Union[None, tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_range,
            in_char,
            pending_range,
            map_dict,
            int_entry,
        )

    # The -1 key only stored the code length during parsing: do not expose it.
    map_dict.pop(-1, None)

    return map_dict, int_entry
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode character map of a font, normalised for parsing.

    Section keywords are put on lines of their own, the ``<`` and ``>``
    delimiters of hexadecimal strings are dropped (an empty string becoming
    the placeholder ``.``), square brackets are surrounded by spaces and
    carriage returns are turned into line feeds.

    Args:
        ft: The font dictionary; it must contain a "/ToUnicode" entry.

    Returns:
        The prepared character map.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # Line breaks can be missing (PDF printed to PDF from Word): isolate each
    # section keyword on its own line.
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # Text between << and >> is not used, but the braces allow finding it back.
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces: list[bytes] = []
    for piece in cm.split(b"<"):
        closing = piece.find(b">")
        if closing >= 0:
            if closing == 0:
                # Empty string: stash a placeholder, interpreted when parsing
                # bfchar entries.
                # see https://github.com/py-pdf/pypdf/issues/1111
                hex_digits = b"."
            else:
                hex_digits = piece[:closing].replace(b" ", b"")
            piece = hex_digits + b" " + piece[closing + 1 :]
        pieces.append(piece)

    prepared = b" ".join(pieces)
    for old, new in ((b"[", b" [ "), (b"]", b" ]\n "), (b"\r", b"\n")):
        prepared = prepared.replace(old, new)
    return prepared
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Process one line of a prepared character map.

    Args:
        line: The line to process.
        process_rg: Whether the line is inside a bfrange section.
        process_char: Whether the line is inside a bfchar section.
        multiline_rg: State of a bfrange array continued from a previous
            line, None if there is none.
        map_dict: Translation table, updated in place.
        int_entry: List of the mapped character codes, updated in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.

    """
    # Empty lines and comments do not change anything.
    if not line or line[:1] == b"%":
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")

    # Section delimiters only switch the parser state.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning("Skipping broken line %(line)r: %(error)s", source=__name__, line=line, error=error)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
228# Usual values should be up to 65_536.
229MAPPING_DICTIONARY_SIZE_LIMIT = 100_000
232def _check_mapping_size(size: int) -> None:
233 if size > MAPPING_DICTIONARY_SIZE_LIMIT:
234 raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Process one line of a bfrange section.

    Args:
        line: The line to process.
        map_dict: Translation table, updated in place. Its -1 key holds the
            number of bytes of a source character code.
        int_entry: List of the mapped character codes, updated in place.
        multiline_rg: ``(next code, last code)`` when the line continues an
            array started on a previous line, None otherwise.

    Returns:
        None when the range is complete, else ``(next code, last code)`` to
        be handed over with the next line.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    tokens = [token for token in line.split(b" ") if token]
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)

    def source_text(code: int) -> str:
        # Text form of a source character code, used as key of map_dict.
        return unhexlify(fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )

    def map_array_items(items: list[bytes], code: int, count: int) -> tuple[int, bool]:
        # Assign the items to consecutive codes until "]" is met; return the
        # next free code and whether the closing bracket was found.
        for item in items:
            if item == b"]":
                return code, True
            count += 1
            _check_mapping_size(count)
            map_dict[source_text(code)] = unhexlify(item).decode("utf-16-be", "surrogatepass")
            int_entry.append(code)
            code += 1
        return code, False

    if multiline_rg is not None:
        # Continuation line: the first and last codes are not in this line.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a, b = multiline_rg
        a, closure_found = map_array_items(tokens, a, entry_count)
    else:
        a = int(tokens[0], 16)
        b = int(tokens[1], 16)
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] == b"[":
            a, closure_found = map_array_items(tokens[3:], a, entry_count)
        else:  # case without list
            c = int(tokens[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(tokens[2]))
            closure_found = True
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            for offset in range(range_size):
                map_dict[source_text(a + offset)] = unhexlify(fmt2 % (c + offset)).decode(
                    "utf-16-be", "surrogatepass"
                )
                int_entry.append(a + offset)
    return None if closure_found else (a, b)
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Process one line of a bfchar section.

    The line holds pairs of hexadecimal strings (delimiters already removed):
    a source character code followed by its translation, ``.`` being the
    placeholder for an empty translation. A pair whose source code cannot be
    decoded is skipped with a warning; a translation which cannot be decoded
    is replaced by an empty string with a warning.

    Args:
        line: The line to process.
        map_dict: Translation table, updated in place. Its -1 key is set to
            the number of bytes of the first source code of the line.
        int_entry: List of the mapped character codes, updated in place.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    lst = [x for x in line.split(b" ") if x]
    new_count = len(lst) // 2
    _check_mapping_size(len(int_entry) + new_count)  # This can be checked beforehand.
    if not lst:
        return
    map_dict[-1] = len(lst[0]) // 2
    source_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
    # Walk the (source, translation) pairs; a trailing unpaired item is ignored.
    for source, target in zip(lst[::2], lst[1::2]):
        map_to = ""
        # placeholder means empty string
        if target != b".":
            try:
                map_to = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )
            except (BinasciiError, UnicodeDecodeError) as exception:
                logger_warning(
                    "Got invalid hex string: %(exception)s (%(lst_value)r)",
                    source=__name__,
                    exception=exception,
                    lst_value=target,
                )
        try:
            key = unhexlify(source).decode(source_codec, "surrogatepass")
        except (BinasciiError, UnicodeDecodeError) as exception:
            # A broken source code (for instance the "." placeholder of an
            # empty string) must not abort the whole parsing: skip the pair.
            logger_warning(
                "Got invalid hex string: %(exception)s (%(lst_value)r)",
                source=__name__,
                exception=exception,
                lst_value=source,
            )
            continue
        map_dict[key] = map_to
        int_entry.append(int(source, 16))
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a character map from the encoding embedded in a Type1 font file.

    The clear text part of the /FontFile stream is scanned for lines of the
    form ``dup <code> <glyph name> put``. Lines that cannot be interpreted
    are ignored.

    Args:
        ft: The font dictionary.
        map_dict: Translation table, updated in place.
        int_entry: List of the mapped character codes, updated in place.

    Returns:
        The ``(map_dict, int_entry)`` received, completed with the entries
        found.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    parts = txt.split(b"/Encoding")
    if len(parts) < 2:
        # No encoding in the font program: nothing to extract.
        return map_dict, int_entry
    txt = parts[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # Not enough items to hold a code and a glyph name.
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):  # pragma: no cover
            # Not a number, or not a valid code point.
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except (KeyError, UnicodeDecodeError):
            if not words[2].startswith(b"/uni"):
                continue
            # Glyph named after its code point: /uniXXXX
            try:
                v = chr(int(words[2][4:], 16))
            except (ValueError, OverflowError):  # pragma: no cover
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry