Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import Error as BinasciiError
3from binascii import unhexlify
4from math import ceil
5from typing import Any, Union, cast
7from ._codecs import adobe_glyphs, charset_encoding
8from ._codecs.core_fontmetrics import CORE_FONT_METRICS
9from ._utils import logger_error, logger_warning
10from .generic import (
11 ArrayObject,
12 DecodedStreamObject,
13 DictionaryObject,
14 NullObject,
15 StreamObject,
16 is_null_or_none,
17)
# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name and describe it.

    Args:
        font_name: key of the font inside the ``/Font`` resource dictionary.
        space_width: fallback space width, forwarded unchanged to
            ``build_char_map_from_dict``.
        obj: object (e.g. XObject or Page) holding a ``/Resources``
            dictionary with a ``/Font`` entry.

    Returns:
        The four values returned by ``build_char_map_from_dict`` (font
        sub-type, half space width, encoding, character map) followed by
        the font dictionary that was looked up.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    char_map = build_char_map_from_dict(space_width, font_dict)
    return (*char_map, font_dict)
def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Describe a font given its font dictionary.

    Args:
        space_width: default space width if no data is found; it is doubled
            before being handed to ``build_font_width_map`` as the default
            character width.
        ft: Font Dictionary

    Returns:
        Font sub-type, half of the computed space width, encoding,
        character map.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, map_dict = get_encoding(ft)

    # Find the key that stands for a space in this font, then measure it.
    space_key = get_actual_str_key(" ", encoding, map_dict)
    widths = build_font_width_map(ft, space_width * 2.0)
    full_space_width = compute_space_width(widths, space_key)

    # https://github.com/python/mypy/issues/4374
    return subtype, full_space_width / 2.0, encoding, map_dict
# used when missing data, e.g. font def missing
# Same tuple type as the return value of build_char_map_from_dict.
unknown_char_map: tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]] = (
    "Unknown",
    9999,
    # every code 0..255 maps to the same placeholder character
    {code: "�" for code in range(256)},
    {},
)
85_predefined_cmap: dict[str, str] = {
86 "/Identity-H": "utf-16-be",
87 "/Identity-V": "utf-16-be",
88 "/GB-EUC-H": "gbk",
89 "/GB-EUC-V": "gbk",
90 "/GBpc-EUC-H": "gb2312",
91 "/GBpc-EUC-V": "gb2312",
92 "/GBK-EUC-H": "gbk",
93 "/GBK-EUC-V": "gbk",
94 "/GBK2K-H": "gb18030",
95 "/GBK2K-V": "gb18030",
96 "/ETen-B5-H": "cp950",
97 "/ETen-B5-V": "cp950",
98 "/ETenms-B5-H": "cp950",
99 "/ETenms-B5-V": "cp950",
100 "/UniCNS-UTF16-H": "utf-16-be",
101 "/UniCNS-UTF16-V": "utf-16-be",
102 "/UniGB-UTF16-H": "gb18030",
103 "/UniGB-UTF16-V": "gb18030",
104 # UCS2 in code
105}
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Return the encoding and the character map of a font.

    Args:
        ft: Font Dictionary

    Returns:
        The encoding from ``_parse_encoding`` and the map from
        ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # If encoding is a string, it is expected to be an identity translation.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in cmap_codes if code <= 255})

    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Build the encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: Font Dictionary

    Returns:
        Either a string (a codec name taken from ``_predefined_cmap``,
        ``"utf-16-be"``, ``"charmap"``, or the unrecognised encoding name
        itself) or a dictionary mapping character codes to strings.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: fall back on the table registered for the
        # base font, if any.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the base encoding that could not be resolved (not the
            # still-empty result) and fall back on the standard table.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is a flat sequence: an integer sets the current code,
        # every following non-integer entry is assigned to the current code,
        # which is then incremented.
        code: int = 0
        entry: Union[int, str]
        for entry in cast(ArrayObject, enc["/Differences"]):
            if isinstance(entry, int):
                code = entry
                continue
            # Ignore codes outside the table; a negative code must not
            # wrap around and overwrite entries at the end of the list.
            if 0 <= code < len(encoding):
                try:
                    encoding[code] = adobe_glyphs[entry]  # type: ignore
                except (KeyError, TypeError):
                    # Unknown glyph name: keep the raw entry.
                    encoding[code] = entry  # type: ignore
            code += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the character map of a font from its ``/ToUnicode`` entry.

    Args:
        ft: Font Dictionary

    Returns:
        The translation map (its ``-1`` key holds the number of bytes to
        convert) and the list of cmap keys as integers, used to correct
        the encoding.

    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") != "/Type1":
            return {}, []
        return _type1_alternative(ft, map_dict, int_entry)

    in_bfrange: bool = False
    in_bfchar: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    pending_range: Union[None, tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry
def get_actual_str_key(
    value_char: str, encoding: Union[str, dict[int, str]], map_dict: dict[Any, Any]
) -> str:
    """
    Find the key that is translated to ``value_char``.

    Args:
        value_char: the translated character to search for.
        encoding: when it is a dictionary, it is searched and the matching
            code is returned as ``chr(code)``.
        map_dict: searched instead when ``encoding`` is not a dictionary;
            the matching key is returned as is.

    Returns:
        The key of the last matching entry, or ``value_char`` itself when
        nothing matches.

    """
    found = value_char
    if isinstance(encoding, dict):
        for code, char in encoding.items():
            if char == value_char:
                found = chr(code)
    else:
        for key, char in map_dict.items():
            if char == value_char:
                found = key
    return found
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` data rewritten for line-by-line parsing.

    Args:
        ft: Font Dictionary holding a ``/ToUnicode`` entry.

    Returns:
        The cmap bytes where the ``bfchar``/``bfrange`` keywords stand on
        their own lines, ``<...>`` hex strings have lost their brackets and
        inner spaces, and empty ``<>`` strings are replaced by ``.``.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # not a stream: use a hard-coded bfrange block instead
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = cm.strip()
    for old, new in (
        (b"beginbfchar", b"\nbeginbfchar\n"),
        (b"endbfchar", b"\nendbfchar\n"),
        (b"beginbfrange", b"\nbeginbfrange\n"),
        (b"endbfrange", b"\nendbfrange\n"),
        (b"<<", b"\n{\n"),  # text between << and >> not used but
        (b">>", b"\n}\n"),  # some solution to find it back
    ):
        cm = cm.replace(old, new)

    # Every chunk after a "<" starts with the content of a hex string.
    chunks = cm.split(b"<")
    for idx, chunk in enumerate(chunks):
        closing = chunk.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # string is empty: stash a placeholder here
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_content = b"."
        else:
            hex_content = chunk[:closing].replace(b" ", b"")
        chunks[idx] = hex_content + b" " + chunk[closing + 1 :]

    cm = b" ".join(chunks)
    for old, new in ((b"[", b" [ "), (b"]", b" ]\n "), (b"\r", b"\n")):
        cm = cm.replace(old, new)
    return cm
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared cmap and return the updated parser state.

    Args:
        line: the line to handle.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: carry-over state returned by the previous
            ``parse_bfrange`` call, or None.
        map_dict: translation map, filled in place.
        int_entry: list of cmap keys as integers, filled in place.

    Returns:
        The new values of ``process_rg``, ``process_char`` and ``multiline_rg``.

    """
    # Empty lines and comment lines (starting with %) leave the state as is.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section.

    Args:
        line: space separated tokens: ``first last start`` or
            ``first last [ v1 v2 ... ]``; when ``multiline_rg`` is set, the
            continuation of such a list.
        map_dict: translation map, filled in place; ``map_dict[-1]`` holds
            the number of bytes of a source code.
        int_entry: list of the mapped source codes, filled in place.
        multiline_rg: ``(next code, last code)`` of a list left open by the
            previous line, or None.

    Returns:
        None when the range is complete, else ``(next code, last code)`` to
        pass back with the following line.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    if multiline_rg is not None:
        # a, b not in the current line: the whole line continues the list
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a = multiline_rg[0]
        b = multiline_rg[1]
        is_list = True
        items = tokens
    else:
        a = int(tokens[0], 16)
        b = int(tokens[1], 16)
        widest = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(widest / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        is_list = tokens[2] == b"["
        items = tokens[3:]
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    if is_list:
        for item in items:
            if item == b"]":
                return None
            target = unhexlify(item).decode("utf-16-be", "surrogatepass")
            map_dict[unhexlify(fmt % a).decode(key_codec, "surrogatepass")] = target
            int_entry.append(a)
            a += 1
        # list not closed on this line
        return (a, b)

    # case without list: consecutive targets starting at tokens[2]
    c = int(tokens[2], 16)
    fmt2 = b"%%0%dX" % max(4, len(tokens[2]))
    while a <= b:
        target = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
        map_dict[unhexlify(fmt % a).decode(key_codec, "surrogatepass")] = target
        int_entry.append(a)
        a += 1
        c += 1
    return None
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a ``bfchar`` section.

    Args:
        line: space separated hex tokens, read as ``source target`` pairs;
            a trailing token without partner is ignored.
        map_dict: translation map, filled in place; ``map_dict[-1]`` is set
            to the number of bytes of the first source code.
        int_entry: list of the mapped source codes, filled in place.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    for source, target in zip(tokens[0::2], tokens[1::2]):
        map_to = ""
        # placeholder "." means empty string
        if target != b".":
            try:
                target_codec = "charmap" if len(target) < 4 else "utf-16-be"
                map_to = unhexlify(target).decode(target_codec, "surrogatepass")
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        source_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
        map_dict[unhexlify(source).decode(source_codec, "surrogatepass")] = map_to
        int_entry.append(int(source, 16))
def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> dict[Any, float]:
    """
    Build the map of character widths of a font.

    Args:
        ft: Font Dictionary
        default_font_width: width stored under ``"default"`` for a font
            with ``/DescendantFonts`` that has no ``/DW`` entry and whose
            base font is not in ``CORE_FONT_METRICS``.

    Returns:
        A dictionary mapping characters to their width; the ``"default"``
        key holds the width to use for characters that are not listed
        (0 when nothing could be determined).

    """
    font_width_map: dict[Any, float] = {}
    st: int = 0
    en: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            font_name = str(ft["/BaseFont"]).removeprefix("/")
            if font_name in CORE_FONT_METRICS:
                # This applies to test_tounicode_is_identity, which has a CID CourierNew font that
                # apparently does not specify the width of a space.
                font_width_map["default"] = CORE_FONT_METRICS[font_name].character_widths[" "] * 2
            else:
                font_width_map["default"] = default_font_width
        w = ft1["/W"].get_object() if "/W" in ft1 else []
        while len(w) > 0:
            if len(w) < 2:
                # Truncated array: a start code without anything after it.
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                if len(w) < 3:
                    # Truncated array: the shared width is missing.
                    logger_warning(
                        "unknown widths : \n" + (ft1["/W"]).__repr__(),
                        __name__,
                    )
                    break
                en = second
                width = w[2].get_object()
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    w = w[3:]
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
                w = w[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for ww in second:
                    width = ww.get_object()
                    # Same validation as the range form; a skipped entry
                    # still consumes its character code.
                    if isinstance(width, (int, float)):
                        font_width_map[chr(c_code)] = width
                    else:
                        logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    c_code += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    else:
        font_name = str(ft["/BaseFont"]).removeprefix("/")
        if font_name in CORE_FONT_METRICS:
            # Work on a copy so the "default" key is not written into the
            # imported metrics object (assuming character_widths is the
            # stored dict and not already a copy).
            font_width_map = dict(
                cast(dict[str, float], CORE_FONT_METRICS[font_name].character_widths)
            )
            font_width_map["default"] = font_width_map[" "] * 2
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = 0
    return font_width_map
def compute_space_width(
    font_width_map: dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for a space.

    Args:
        font_width_map: map of character widths with a ``"default"`` key.
        space_char: the key standing for a space in ``font_width_map``.

    Returns:
        The width stored for ``space_char``; when it is missing or zero,
        half of the ``"default"`` width.

    """
    if space_char in font_width_map:
        width = font_width_map[space_char]
        if width != 0:
            return width
    # if using default we consider space will be only half size
    return font_width_map["default"] / 2.0
def compute_font_width(
    font_width_map: dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: map of character widths with a ``"default"`` key.
        char: the character to look up.

    Returns:
        The width stored for ``char``, else the ``"default"`` width.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a character map from the font file embedded in a font.

    The clear-text part of the ``/FontFile`` stream (everything before
    ``eexec``) is searched for ``dup <code> <glyph name> put`` lines
    following ``/Encoding``. Lines that cannot be interpreted are skipped.

    Args:
        ft: Font Dictionary
        map_dict: translation map, filled in place.
        int_entry: list of the mapped codes, filled in place.

    Returns:
        ``map_dict`` and ``int_entry``; they are returned unchanged when
        there is no ``/FontDescriptor``, no ``/FontFile`` or no
        ``/Encoding`` section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding part in the clear text: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # code or glyph name missing
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except (KeyError, UnicodeDecodeError):
            # Unknown name: only the /uniXXXX form can still be resolved.
            if not words[2].startswith(b"/uni"):
                continue
            try:
                v = chr(int(words[2][4:], 16))
            except (ValueError, OverflowError):  # pragma: no cover
                continue
        try:
            key = chr(i)
        except (ValueError, OverflowError):
            # code outside the range accepted by chr()
            continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry