Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import Error as BinasciiError
3from binascii import unhexlify
4from math import ceil
5from typing import Any, Union, cast
7from ._codecs import adobe_glyphs, charset_encoding
8from ._utils import logger_error, logger_warning
9from .errors import LimitReachedError
10from .generic import (
11 DecodedStreamObject,
12 DictionaryObject,
13 NullObject,
14 StreamObject,
15 is_null_or_none,
16)
# Predefined CMap names mapped to the Python codec able to decode text
# extracted through them. Only CMaps whose byte layout matches an existing
# codec are listed; others fall through to the "advanced encoding" warning
# in _parse_encoding.
_predefined_cmap: dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    "/UniGB-UTF16-H": "gb18030",
    "/UniGB-UTF16-V": "gb18030",
    # UCS2 in code
}
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the character encoding and the /ToUnicode map of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(encoding, map_dict)`` where ``encoding`` is either a
        codec name or a per-code translation table, and ``map_dict`` is the
        /ToUnicode translation table.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, first bullet: codes covered by a non-empty cmap
    # override the base encoding — rewrite those entries as identity here.
    # A plain-string encoding is already expected to act as identity.
    if isinstance(encoding, dict):
        for code in (c for c in int_entry if c <= 255):
            encoding[code] = chr(code)
    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Derive the base character encoding of a font dictionary.

    Resolution order: no /Encoding → /BaseFont charset (or "charmap");
    name → known charset, predefined CMap, UCS2 heuristic, or the name
    itself with a warning; dictionary → /BaseEncoding (defaulting to
    /StandardEncoding) patched by /Differences.

    Args:
        ft: The font dictionary.

    Returns:
        Either a codec name (str) or a code-point → character table.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: fall back to a charset known for this base
        # font, otherwise a raw 1:1 byte decoding ("charmap").
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                # copy: the table may be patched below / by the caller
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                # handled by the except clause: log and keep the raw name
                raise Exception("not found")
        except Exception:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            logger_error(
                f"Advanced encoding {encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates a start code (int) with glyph names (str);
        # each name fills the current slot and advances the code.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    # NOTE(review): when x >= len(encoding) a known glyph is
                    # silently dropped; only the KeyError path stores the raw
                    # name — presumably intentional, verify against callers.
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        # Normalize the list form into a code-point table.
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the /ToUnicode translation table of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(map_dict, int_entry)``: the translation table — with the
        number of bytes per source code stored under key ``-1`` — and the
        list of mapped source codes as integers.

    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        # A Type1 font may still expose its encoding in the font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    in_range_section = False
    in_char_section = False
    # Continuation state of a bracketed bfrange spanning several lines;
    # cf. #1285 for an example of such a file.
    pending_range: Union[None, tuple[int, int]] = None
    for raw in prepare_cm(ft).split(b"\n"):
        in_range_section, in_char_section, pending_range = process_cm_line(
            raw.strip(b" \t"),
            in_range_section,
            in_char_section,
            pending_range,
            map_dict,
            int_entry,
        )
    return map_dict, int_entry
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Fetch and normalize the raw /ToUnicode cmap for line-based parsing.

    Re-inserts line breaks around the bfchar/bfrange operators, isolates
    hex strings (``<...>``) as space-delimited tokens, and marks empty hex
    strings with a ``.`` placeholder (see issue #1111).

    Args:
        ft: The font dictionary; must contain /ToUnicode.

    Returns:
        The normalized cmap content as bytes.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # Process the full 0000-FFFF range as identity.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # Some producers (PDFs printed from Word) omit line breaks entirely;
    # re-insert them around the operators before parsing.
    cm = cm.strip()
    for token, replacement in (
        (b"beginbfchar", b"\nbeginbfchar\n"),
        (b"endbfchar", b"\nendbfchar\n"),
        (b"beginbfrange", b"\nbeginbfrange\n"),
        (b"endbfrange", b"\nendbfrange\n"),
        (b"<<", b"\n{\n"),  # the text between << and >> is unused, but keep
        (b">>", b"\n}\n"),  # it recoverable between the braces
    ):
        cm = cm.replace(token, replacement)
    segments = cm.split(b"<")
    for idx, seg in enumerate(segments):
        close = seg.find(b">")
        if close < 0:
            continue
        # An empty hex string gets a "." placeholder (consumed later by
        # parse_bfchar); see https://github.com/py-pdf/pypdf/issues/1111
        hex_part = seg[:close].replace(b" ", b"") if close > 0 else b"."
        segments[idx] = hex_part + b" " + seg[close + 1:]
    joined = b" ".join(segments)
    return (
        joined
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Dispatch one normalized cmap line to the proper section parser.

    Args:
        line: The stripped cmap line.
        process_rg: True while inside a bfrange section.
        process_char: True while inside a bfchar section.
        multiline_rg: Continuation state of a multi-line bfrange, or None.
        map_dict: Translation table updated in place.
        int_entry: List of mapped codes updated in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)`` state.

    """
    # Skip blanks and comments (lines starting with '%').
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    # Section markers toggle the parsing state without being parsed.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
221# Usual values should be up to 65_536.
222MAPPING_DICTIONARY_SIZE_LIMIT = 100_000
225def _check_mapping_size(size: int) -> None:
226 if size > MAPPING_DICTIONARY_SIZE_LIMIT:
227 raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one ``bfrange`` line and fill the translation table in place.

    Supports both forms: ``<lo> <hi> <dst>`` (consecutive destinations) and
    ``<lo> <hi> [ <d0> <d1> ... ]`` where the bracketed list may continue on
    following lines (cf. #1285).

    Args:
        line: Normalized cmap line; hex strings are space-delimited tokens.
        map_dict: Translation table updated in place; ``map_dict[-1]`` holds
            the source-code byte width.
        int_entry: List of mapped source codes, updated in place.
        multiline_rg: ``(next_code, last_code)`` when continuing a bracketed
            list started on a previous line, else None.

    Returns:
        None when the range closed on this line, otherwise the
        ``(next_code, last_code)`` continuation state.

    Raises:
        LimitReachedError: If the table would exceed the size limit.

    """
    lst = [x for x in line.split(b" ") if x]
    closure_found = False
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)
    if multiline_rg is not None:
        # Continuing a bracketed list: only destinations appear on this line.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a = multiline_rg[0]  # a, b not in the current line
        b = multiline_rg[1]
        for sq in lst:
            if sq == b"]":
                closure_found = True
                break
            entry_count += 1
            _check_mapping_size(entry_count)
            # Key: source code re-encoded at the declared byte width;
            # 1-byte codes decode as charmap, wider ones as UTF-16-BE.
            map_dict[
                unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        # Byte width of the source codes, derived from the hex length.
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if lst[2] == b"[":
            # Bracketed list form: one destination per source code.
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                entry_count += 1
                _check_mapping_size(entry_count)
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list
            # Single-destination form: destinations increment with codes.
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            while a <= b:
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    # An unclosed bracket carries (next_code, last_code) to the next line.
    return None if closure_found else (a, b)
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one ``bfchar`` line (pairs of source/destination hex strings).

    Args:
        line: Normalized cmap line with space-delimited hex tokens.
        map_dict: Translation table updated in place; ``map_dict[-1]`` holds
            the source-code byte width.
        int_entry: List of mapped source codes, updated in place.

    Raises:
        LimitReachedError: If the table would exceed the size limit.

    """
    tokens = [t for t in line.split(b" ") if t]
    _check_mapping_size(len(int_entry) + len(tokens) // 2)  # This can be checked beforehand.
    map_dict[-1] = len(tokens[0]) // 2
    # Consume (source, destination) pairs; a trailing odd token is ignored.
    for pos in range(0, len(tokens) - 1, 2):
        src = tokens[pos]
        dst = tokens[pos + 1]
        map_to = ""
        # "." is the placeholder for an empty hex string (see prepare_cm).
        if dst != b".":
            try:
                map_to = unhexlify(dst).decode(
                    "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
                )  # join is here as some cases where the code was split
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({dst!r})", __name__)
        source_key = unhexlify(src).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[source_key] = map_to
        int_entry.append(int(src, 16))
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a character map from an embedded Type1 font program when the font
    has no /ToUnicode entry.

    Scans the clear-text part of the /FontFile stream for
    ``dup <code> <glyphname> put`` statements and resolves each glyph name
    through the Adobe glyph list, falling back to ``/uniXXXX`` names.

    Args:
        ft: The font dictionary.
        map_dict: Translation table, updated in place.
        int_entry: List of mapped codes, updated in place.

    Returns:
        The (possibly updated) ``map_dict`` and ``int_entry``.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    # Fix: a font program whose clear part lacks an /Encoding section used
    # to raise IndexError on the unconditional [1]; bail out gracefully.
    parts = txt.split(b"/Encoding")
    if len(parts) < 2:
        return map_dict, int_entry
    txt = parts[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if li.startswith(b"dup"):
            words = [_w for _w in li.split(b" ") if _w != b""]
            # Fix: a truncated "dup" line used to raise IndexError below.
            if len(words) < 3:
                continue
            if len(words) > 3 and words[3] != b"put":
                continue
            try:
                i = int(words[1])
            except ValueError:  # pragma: no cover
                continue
            try:
                v = adobe_glyphs[words[2].decode()]
            except KeyError:
                # Glyph names like /uni0041 encode the code point directly.
                if words[2].startswith(b"/uni"):
                    try:
                        v = chr(int(words[2][4:], 16))
                    except ValueError:  # pragma: no cover
                        continue
                else:
                    continue
            map_dict[chr(i)] = v
            int_entry.append(i)
    return map_dict, int_entry